From 617291fb269a0f97cdac14654b3605c2483d1733 Mon Sep 17 00:00:00 2001
From: Navin Kumar
Date: Tue, 12 Dec 2023 13:51:46 -0800
Subject: [PATCH 1/2] Remove GpuToTimestampImproved and spark.rapids.sql.improvedTimeOps since no one is using and results are inconsistent with Spark

Signed-off-by: Navin Kumar
---
 .../advanced_configs.md                       |  1 -
 .../src/main/python/date_time_test.py         | 19 -----
 .../nvidia/spark/rapids/GpuOverrides.scala    | 14 +---
 .../com/nvidia/spark/rapids/RapidsConf.scala  |  9 --
 .../sql/rapids/datetimeExpressions.scala      | 83 -------------------
 .../spark/rapids/ParseDateTimeSuite.scala     | 10 ---
 6 files changed, 2 insertions(+), 134 deletions(-)

diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md
index 089a9cb9b27..ce3cd8696fa 100644
--- a/docs/additional-functionality/advanced_configs.md
+++ b/docs/additional-functionality/advanced_configs.md
@@ -114,7 +114,6 @@ Name | Description | Default Value | Applicable at
 spark.rapids.sql.hasExtendedYearValues|Spark 3.2.0+ extended parsing of years in dates and timestamps to support the full range of possible values. Prior to this it was limited to a positive 4 digit year. The Accelerator does not support the extended range yet. This config indicates if your data includes this extended range or not, or if you don't care about getting the correct values on values with the extended range.|true|Runtime
 spark.rapids.sql.hashOptimizeSort.enabled|Whether sorts should be inserted after some hashed operations to improve output ordering. This can improve output file sizes when saving to columnar formats.|false|Runtime
 spark.rapids.sql.improvedFloatOps.enabled|For some floating point operations spark uses one way to compute the value and the underlying cudf implementation can use an improved algorithm. In some cases this can result in cudf producing an answer when spark overflows.|true|Runtime
-spark.rapids.sql.improvedTimeOps.enabled|When set to true, some operators will avoid overflowing by converting epoch days directly to seconds without first converting to microseconds|false|Runtime
 spark.rapids.sql.incompatibleDateFormats.enabled|When parsing strings as dates and timestamps in functions like unix_timestamp, some formats are fully supported on the GPU and some are unsupported and will fall back to the CPU. Some formats behave differently on the GPU than the CPU. Spark on the CPU interprets date formats with unsupported trailing characters as nulls, while Spark on the GPU will parse the date with invalid trailing characters. More detail can be found at [parsing strings as dates or timestamps](../compatibility.md#parsing-strings-as-dates-or-timestamps).|false|Runtime
 spark.rapids.sql.incompatibleOps.enabled|For operations that work, but are not 100% compatible with the Spark equivalent set if they should be enabled by default or disabled by default.|true|Runtime
 spark.rapids.sql.join.cross.enabled|When set to true cross joins are enabled on the GPU|true|Runtime
diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py
index d44c261a9a5..1d4ce5e65d8 100644
--- a/integration_tests/src/main/python/date_time_test.py
+++ b/integration_tests/src/main/python/date_time_test.py
@@ -379,16 +379,6 @@ def fun(spark):
     assert_gpu_and_cpu_are_equal_collect(fun,
         conf=copy_and_update(parser_policy_dic, ansi_enabled_conf))
 
-@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF'])
-@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
-@allow_non_gpu(*non_utc_allow)
-def test_unix_timestamp_improved(data_gen, ansi_enabled):
-    conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true",
-            "spark.sql.legacy.timeParserPolicy": "CORRECTED"}
-    assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))),
-        copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf))
-
 @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF'])
 @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
@@ -397,15 +387,6 @@ def test_unix_timestamp(data_gen, ansi_enabled):
         lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))),
         {'spark.sql.ansi.enabled': ansi_enabled})
 
-@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF'])
-@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
-@allow_non_gpu(*non_utc_allow)
-def test_to_unix_timestamp_improved(data_gen, ansi_enabled):
-    conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"}
-    assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"),
-        copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf))
-
 str_date_and_format_gen = [pytest.param(StringGen('[0-9]{4}/[01][0-9]'),'yyyy/MM', marks=pytest.mark.xfail(reason="cudf does no checks")),
         (StringGen('[0-9]{4}/[01][12]/[0-2][1-8]'),'yyyy/MM/dd'),
         (StringGen('[01][12]/[0-2][1-8]'), 'MM/dd'),
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index 74430ae8e90..ec649273514 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -1696,12 +1696,7 @@ object GpuOverrides extends Logging {
           TypeSig.STRING)),
       (a, conf, p, r) => new UnixTimeExprMeta[ToUnixTimestamp](a, conf, p, r) {
         override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = {
-          if (conf.isImprovedTimestampOpsEnabled) {
-            // passing the already converted strf string for a little optimization
-            GpuToUnixTimestampImproved(lhs, rhs, sparkFormat, strfFormat)
-          } else {
-            GpuToUnixTimestamp(lhs, rhs, sparkFormat, strfFormat)
-          }
+          GpuToUnixTimestamp(lhs, rhs, sparkFormat, strfFormat)
         }
       }),
     expr[UnixTimestamp](
@@ -1715,12 +1710,7 @@
           TypeSig.STRING)),
       (a, conf, p, r) => new UnixTimeExprMeta[UnixTimestamp](a, conf, p, r) {
         override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = {
-          if (conf.isImprovedTimestampOpsEnabled) {
-            // passing the already converted strf string for a little optimization
-            GpuUnixTimestampImproved(lhs, rhs, sparkFormat, strfFormat)
-          } else {
-            GpuUnixTimestamp(lhs, rhs, sparkFormat, strfFormat)
-          }
+          GpuUnixTimestamp(lhs, rhs, sparkFormat, strfFormat)
         }
       }),
     expr[Hour](
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index fa793ba84fa..631a935ede5 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -667,13 +667,6 @@ object RapidsConf {
 
   // ENABLE/DISABLE PROCESSING
 
-  val IMPROVED_TIMESTAMP_OPS =
-    conf("spark.rapids.sql.improvedTimeOps.enabled")
-      .doc("When set to true, some operators will avoid overflowing by converting epoch days " +
-        "directly to seconds without first converting to microseconds")
-      .booleanConf
-      .createWithDefault(false)
-
   val SQL_ENABLED = conf("spark.rapids.sql.enabled")
     .doc("Enable (true) or disable (false) sql operations on the GPU")
     .commonlyUsed()
@@ -2378,8 +2371,6 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val shouldExplainAll: Boolean = explain.equalsIgnoreCase("ALL")
 
-  lazy val isImprovedTimestampOpsEnabled: Boolean = get(IMPROVED_TIMESTAMP_OPS)
-
   lazy val chunkedReaderEnabled: Boolean = get(CHUNKED_READER)
 
   lazy val maxReadBatchSizeRows: Int = get(MAX_READER_BATCH_SIZE_ROWS)
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
index 913594e810c..7e4de33bb70 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
@@ -866,59 +866,6 @@ abstract class GpuToTimestamp
   }
 }
 
-/**
- * An improved version of GpuToTimestamp conversion which converts time to UNIX timestamp without
- * first converting to microseconds
- */
-abstract class GpuToTimestampImproved extends GpuToTimestamp {
-  import GpuToTimestamp._
-
-  override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
-    val tmp = if (lhs.dataType == StringType) {
-      // rhs is ignored we already parsed the format
-      if (getTimeParserPolicy == LegacyTimeParserPolicy) {
-        parseStringAsTimestampWithLegacyParserPolicy(
-          lhs,
-          sparkFormat,
-          strfFormat,
-          DType.TIMESTAMP_SECONDS,
-          (col, strfFormat) => col.asTimestampSeconds(strfFormat))
-      } else {
-        parseStringAsTimestamp(
-          lhs,
-          sparkFormat,
-          strfFormat,
-          DType.TIMESTAMP_SECONDS,
-          failOnError)
-      }
-    } else if (lhs.dataType() == DateType){
-      lhs.getBase.asTimestampSeconds()
-    } else { // Timestamp
-      // https://github.com/rapidsai/cudf/issues/5166
-      // The time is off by 1 second if the result is < 0
-      val longSecs = withResource(lhs.getBase.asTimestampSeconds()) { secs =>
-        secs.asLongs()
-      }
-      withResource(longSecs) { secs =>
-        val plusOne = withResource(Scalar.fromLong(1)) { one =>
-          secs.add(one)
-        }
-        withResource(plusOne) { plusOne =>
-          withResource(Scalar.fromLong(0)) { zero =>
-            withResource(secs.lessThan(zero)) { neg =>
-              neg.ifElse(plusOne, secs)
-            }
-          }
-        }
-      }
-    }
-    withResource(tmp) { r =>
-      // The type we are returning is a long not an actual timestamp
-      r.asLongs()
-    }
-  }
-}
-
 case class GpuUnixTimestamp(strTs: Expression,
    format: Expression,
    sparkFormat: String,
@@ -949,36 +896,6 @@ case class GpuToUnixTimestamp(strTs: Expression,
 
 }
 
-case class GpuUnixTimestampImproved(strTs: Expression,
-    format: Expression,
-    sparkFormat: String,
-    strf: String,
-    timeZoneId: Option[String] = None) extends GpuToTimestampImproved {
-  override def strfFormat = strf
-  override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = {
-    copy(timeZoneId = Option(timeZoneId))
-  }
-
-  override def left: Expression = strTs
-  override def right: Expression = format
-
-}
-
-case class GpuToUnixTimestampImproved(strTs: Expression,
-    format: Expression,
-    sparkFormat: String,
-    strf: String,
-    timeZoneId: Option[String] = None) extends GpuToTimestampImproved {
-  override def strfFormat = strf
-  override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = {
-    copy(timeZoneId = Option(timeZoneId))
-  }
-
-  override def left: Expression = strTs
-  override def right: Expression = format
-
-}
-
 case class GpuGetTimestamp(
     strTs: Expression,
     format: Expression,
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
index 74ef6e5cc73..e790be9cd8f 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
@@ -158,16 +158,6 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE
     }
   }
 
-  testSparkResultsAreEqual("to_unix_timestamp parse yyyy/MM (improvedTimeOps)",
-    timestampsAsStrings,
-    new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")
-      .set(RapidsConf.IMPROVED_TIMESTAMP_OPS.key, "true")) {
-    df => {
-      df.createOrReplaceTempView("df")
-      df.sqlContext.sql("SELECT c0, to_unix_timestamp(c0, 'yyyy/MM') FROM df")
-    }
-  }
-
   testSparkResultsAreEqual("unix_timestamp parse timestamp",
     timestampsAsStrings,
     CORRECTED_TIME_PARSER_POLICY) {

From 1fc968cbacd4229c0a28b714000d5d1675d8775f Mon Sep 17 00:00:00 2001
From: Navin Kumar
Date: Tue, 12 Dec 2023 13:53:40 -0800
Subject: [PATCH 2/2] Remove this config since it's not used

Signed-off-by: Navin Kumar
---
 integration_tests/src/main/python/spark_session.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py
index 606f9a31dc4..e65db7df3b3 100644
--- a/integration_tests/src/main/python/spark_session.py
+++ b/integration_tests/src/main/python/spark_session.py
@@ -47,7 +47,6 @@ def _from_scala_map(scala_map):
     'spark.rapids.sql.hasExtendedYearValues': 'true',
     'spark.rapids.sql.hashOptimizeSort.enabled': 'false',
     'spark.rapids.sql.improvedFloatOps.enabled': 'false',
-    'spark.rapids.sql.improvedTimeOps.enabled': 'false',
     'spark.rapids.sql.incompatibleDateFormats.enabled': 'false',
     'spark.rapids.sql.incompatibleOps.enabled': 'false',
     'spark.rapids.sql.mode': 'executeongpu',
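
Note (illustration, not part of the patch): in its timestamp branch, the removed GpuToTimestampImproved.doColumnar cast the input to TIMESTAMP_SECONDS and then added one second to every negative result, working around the rounding issue referenced in cudf issue 5166. The sketch below is a rough per-row, plain-Scala paraphrase of that column-wise adjustment; the object and method names are hypothetical, the truncating JVM division stands in for the cudf cast only as an assumption, and, per the commit message, this approach did not consistently match Spark's own conversion.

// Illustrative sketch only -- not the plugin's API or actual GPU kernels.
object ImprovedTimeOpsSketch {
  private val MicrosPerSecond = 1000000L

  // Hypothetical scalar stand-in for asTimestampSeconds() followed by the
  // "add 1 second when the result is negative" fix-up from the removed code.
  def microsToUnixSeconds(micros: Long): Long = {
    val secs = micros / MicrosPerSecond // JVM division truncates toward zero
    if (secs < 0) secs + 1 else secs
  }

  def main(args: Array[String]): Unit = {
    println(microsToUnixSeconds(1500000L))  // 1
    println(microsToUnixSeconds(-1500000L)) // 0 (the +1 adjustment fired)
  }
}

Because the fix-up shifts every negative value, including exact whole seconds, results for pre-epoch timestamps could drift from Spark's conversion, which is consistent with the commit message's stated rationale for removing the "improved" path, though the patch itself does not enumerate the failing cases.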