From 5e1229a75fa3b214cedfd2815c99133253340111 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 6 Dec 2023 15:02:00 +0800 Subject: [PATCH 1/4] Fix year 0 is out of range for test_from_json_struct_timestamp Signed-off-by: Haoyang Li --- integration_tests/src/main/python/cast_test.py | 10 ++++------ integration_tests/src/main/python/data_gen.py | 9 ++++++++- integration_tests/src/main/python/json_test.py | 8 ++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index b81d6a2e050..59fece0984d 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -61,13 +61,11 @@ def test_cast_nested(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type))) -date_after_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]' - def test_cast_string_date_valid_format(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, StringGen(date_after_1_2_1)).select(f.col('a').cast(DateType())), + lambda spark : unary_op_df(spark, StringGen(date_start_1_2_1)).select(f.col('a').cast(DateType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim @@ -148,9 +146,9 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) -@pytest.mark.parametrize('data_gen', [StringGen(date_after_1_2_1), - StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), - StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?') +@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_2_1), + StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), + StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?') ], ids=idfn) @allow_non_gpu(*non_utc_allow) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 2232f80d197..ea8a31515df 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -779,6 +779,7 @@ def gen_df_help(data_gen, length, seed_value): rand = random.Random(seed_value) data_gen.start(rand) data = [data_gen.gen() for index in range(0, length)] + print(data) return data def gen_df(spark, data_gen, length=2048, seed=None, num_slices=None): @@ -1200,4 +1201,10 @@ def get_25_partitions_df(spark): # This will be deprecated and replaced case specified non GPU allow list non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec', 'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec', 'ShuffleExchangeExec', - 'ExecutedCommandExec'] if is_not_utc() else [] \ No newline at end of file + 'ExecutedCommandExec'] if is_not_utc() else [] + +# date related regexps for generating date strings within python's range limits + +date_start_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]' + +yyyy_start_0003 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])' \ No newline at end of file diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index d5b308caa52..b02042ab4e9 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -637,7 +637,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format # "yyyy-MM" "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", # "yyyy" - "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[0-9]{4}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", + "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0003 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", # "dd/MM/yyyy" "\"[0-9]{2}/[0-9]{2}/[1-8]{1}[0-9]{3}\"", # special constant values @@ -664,7 +664,6 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')), "CORRECTED" ]) -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9747') @pytest.mark.parametrize('ansi_enabled', [ True, False ]) def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled): json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \ @@ -678,6 +677,11 @@ def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser 'spark.sql.legacy.timeParserPolicy': time_parser_policy, 'spark.sql.ansi.enabled': ansi_enabled }) +def test_yyyy_start_0003(): + # 0003 is not a leap year + assert_gpu_and_cpu_are_equal_collect( + lambda spark : gen_df(spark, StringGen(yyyy_start_0003, nullable=False))) + @allow_non_gpu('ProjectExec') @pytest.mark.parametrize('timestamp_gen', ["\"[1-8]{1}[0-9]{3}-[0-3]{1,2}-[0-3]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\\.[0-9]{1,6})?Z?\""]) @pytest.mark.parametrize('timestamp_format', [ From eb1910883eb4ce0be2488025da8fdf53b35e0eb1 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 6 Dec 2023 15:04:28 +0800 Subject: [PATCH 2/4] clean up Signed-off-by: Haoyang Li --- integration_tests/src/main/python/data_gen.py | 1 - integration_tests/src/main/python/json_test.py | 5 ----- 2 files changed, 6 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index ea8a31515df..03412a15c17 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -779,7 +779,6 @@ def gen_df_help(data_gen, length, seed_value): rand = random.Random(seed_value) data_gen.start(rand) data = [data_gen.gen() for index in range(0, length)] - print(data) return data def gen_df(spark, data_gen, length=2048, seed=None, num_slices=None): diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index b02042ab4e9..2a34ebec07f 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -677,11 +677,6 @@ def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser 'spark.sql.legacy.timeParserPolicy': time_parser_policy, 'spark.sql.ansi.enabled': ansi_enabled }) -def test_yyyy_start_0003(): - # 0003 is not a leap year - assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, StringGen(yyyy_start_0003, nullable=False))) - @allow_non_gpu('ProjectExec') @pytest.mark.parametrize('timestamp_gen', ["\"[1-8]{1}[0-9]{3}-[0-3]{1,2}-[0-3]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\\.[0-9]{1,6})?Z?\""]) @pytest.mark.parametrize('timestamp_format', [ From 7e75ca224919a306145ca3cbaf996ece49fb1083 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 6 Dec 2023 17:16:47 +0800 Subject: [PATCH 3/4] address comments Signed-off-by: Haoyang Li --- integration_tests/src/main/python/data_gen.py | 4 +++- integration_tests/src/main/python/json_test.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 03412a15c17..0431feada2e 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -1204,6 +1204,8 @@ def get_25_partitions_df(spark): # date related regexps for generating date strings within python's range limits +# regexp to generate date from 0001-02-01, format is yyyy-MM-dd date_start_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]' -yyyy_start_0003 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])' \ No newline at end of file +# regexp to generate year from 0002, forat is yyyy +yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])' \ No newline at end of file diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 2a34ebec07f..def66df6cab 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -637,7 +637,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format # "yyyy-MM" "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", # "yyyy" - "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0003 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", + "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0002 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"", # "dd/MM/yyyy" "\"[0-9]{2}/[0-9]{2}/[1-8]{1}[0-9]{3}\"", # special constant values From bca3fbc2683d2d74591c650da6fe6190e22fd219 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 7 Dec 2023 11:11:17 +0800 Subject: [PATCH 4/4] fix nit and typo Signed-off-by: Haoyang Li --- integration_tests/src/main/python/data_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 0431feada2e..d356ac18e26 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -1207,5 +1207,5 @@ def get_25_partitions_df(spark): # regexp to generate date from 0001-02-01, format is yyyy-MM-dd date_start_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]' -# regexp to generate year from 0002, forat is yyyy -yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])' \ No newline at end of file +# regexp to generate year from 0002, format is yyyy +yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'