[SPARK-48191][SQL] Support UTF-32 for string encode and decode
### What changes were proposed in this pull request?
Enable support of UTF-32 in the `encode()` and `decode()` functions.

### Why are the changes needed?
UTF-32 is already handled by the JDK, so it only needs to be added to the allow-list of supported charsets.

### Does this PR introduce _any_ user-facing change?
Yes, `decode(..., 'UTF-32')` and `encode(..., 'UTF-32')` will start working

### How was this patch tested?
Manually checked in the Spark shell.
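
For reference, the manual check looks roughly like this in spark-shell (a sketch mirroring the golden-file tests added below):

```scala
// Round-trip a non-ASCII string through UTF-32.
// Expected result (per the golden files in this PR): 大千世界
spark.sql("SELECT decode(encode('大千世界', 'UTF-32'), 'UTF-32')").show()
```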

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#46469 from vladimirg-db/vladimirg-db/support-utf-32-for-string-decode.

Authored-by: Vladimir Golubev <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
vladimirg-db authored and JacobZheng0927 committed May 11, 2024
1 parent ba4a23b commit 0e44f00
Showing 8 changed files with 39 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/sql-migration-guide.md
@@ -32,7 +32,7 @@ license: |
 - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions.
 - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead.
 - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead of a wrapping value.
-- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
+- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
 - Since Spark 4.0, the legacy datetime rebasing SQL configs with the prefix `spark.sql.legacy` are removed. To restore the previous behavior, use the following configs:
   - `spark.sql.parquet.int96RebaseModeInWrite` instead of `spark.sql.legacy.parquet.int96RebaseModeInWrite`
   - `spark.sql.parquet.datetimeRebaseModeInWrite` instead of `spark.sql.legacy.parquet.datetimeRebaseModeInWrite`
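As context for the migration-guide entry above, restoring the pre-4.0 behavior for a JDK-only charset would look roughly like this (a sketch; `windows-1252` is just an illustrative JDK charset outside the new allow-list):

```scala
// Hypothetical example: windows-1252 is provided by the JDK but is not in
// Spark 4.0's charset allow-list, so encode() accepts it only while the
// legacy conf is enabled.
spark.conf.set("spark.sql.legacy.javaCharsets", "true")
spark.sql("SELECT encode('abc', 'windows-1252')").show()
```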
@@ -2646,7 +2646,7 @@ object Decode {
arguments = """
Arguments:
* bin - a binary expression to decode
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive.
""",
examples = """
Examples:
@@ -2690,7 +2690,7 @@ case class Decode(params: Seq[Expression], replacement: Expression)
arguments = """
Arguments:
* bin - a binary expression to decode
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive.
""",
since = "1.5.0",
group = "string_funcs")
@@ -2707,7 +2707,7 @@ case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Bo
 override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation)

 private val supportedCharsets = Set(
-"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16")
+"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

 protected override def nullSafeEval(input1: Any, input2: Any): Any = {
 val fromCharset = input2.asInstanceOf[UTF8String].toString
@@ -2762,7 +2762,7 @@ object StringDecode {
arguments = """
Arguments:
* str - a string expression
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to encode `str` into a BINARY. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to encode `str` into a BINARY. It is case insensitive.
""",
examples = """
Examples:
@@ -2785,7 +2785,7 @@ case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean)
 Seq(StringTypeAnyCollation, StringTypeAnyCollation)

 private val supportedCharsets = Set(
-"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16")
+"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

 protected override def nullSafeEval(input1: Any, input2: Any): Any = {
 val toCharset = input2.asInstanceOf[UTF8String].toString
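The enablement itself is only the extra `'UTF-32'` entry in each `supportedCharsets` set above; per the docstrings, the charset argument is matched case-insensitively, so the surrounding validation amounts to a case-insensitive membership test. A minimal standalone sketch with hypothetical names (not the actual Spark internals):

```scala
import java.util.Locale

// Case-insensitive allow-list check, analogous to the supportedCharsets
// sets in the diff above (hypothetical standalone names).
val supportedCharsets =
  Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

def isSupported(charset: String): Boolean =
  supportedCharsets.contains(charset.toUpperCase(Locale.ROOT))

assert(isSupported("utf-32"))        // "It is case insensitive."
assert(!isSupported("windows-1252")) // rejected unless spark.sql.legacy.javaCharsets is set
```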
@@ -489,6 +489,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 // non ascii characters are not allowed in the code, so we disable the scalastyle here.
 checkEvaluation(
 StringDecode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界")
+checkEvaluation(
+StringDecode(Encode(Literal("大千世界"), Literal("UTF-32")), Literal("UTF-32")), "大千世界")
 checkEvaluation(
 StringDecode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界"))
 checkEvaluation(
@@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8),
 +- OneRowRelation


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query analysis
+Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x]
++- OneRowRelation
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query analysis
@@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8),
 +- OneRowRelation


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query analysis
+Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x]
++- OneRowRelation
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query analysis
@@ -131,6 +131,7 @@ select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
 select decode();
 select decode(encode('abc', 'utf-8'));
 select decode(encode('abc', 'utf-8'), 'utf-8');
+select decode(encode('大千世界', 'utf-32'), 'utf-32');
 select decode(1, 1, 'Southlake');
 select decode(2, 1, 'Southlake');
 select decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
@@ -961,6 +961,14 @@ struct<decode(encode(abc, utf-8), utf-8):string>
 abc


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query schema
+struct<decode(encode(大千世界, utf-32), utf-32):string>
+-- !query output
+大千世界
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query schema
@@ -893,6 +893,14 @@ struct<decode(encode(abc, utf-8), utf-8):string>
 abc


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query schema
+struct<decode(encode(大千世界, utf-32), utf-32):string>
+-- !query output
+大千世界
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query schema
