[SPARK-48191][SQL] Support UTF-32 for string encode and decode
### What changes were proposed in this pull request?
Enable support of UTF-32 in the `encode()` and `decode()` functions.

### Why are the changes needed?
UTF-32 is already handled by the JDK, so it only needs to be added to the allow-list of supported charsets.

### Does this PR introduce _any_ user-facing change?
Yes, `decode(..., 'UTF-32')` and `encode(..., 'UTF-32')` will start working

### How was this patch tested?
Manually checked in the Spark shell.
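
For reference, the manual check looks roughly like this in spark-shell (a sketch mirroring the golden-file tests added below):

```scala
// Round-trip a non-ASCII string through UTF-32.
// Expected result (per the golden files in this PR): 大千世界
spark.sql("SELECT decode(encode('大千世界', 'UTF-32'), 'UTF-32')").show()
```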

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#46469 from vladimirg-db/vladimirg-db/support-utf-32-for-string-decode.

Authored-by: Vladimir Golubev <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
vladimirg-db authored and JacobZheng0927 committed May 11, 2024
1 parent ba4a23b commit 0e44f00
Showing 8 changed files with 39 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/sql-migration-guide.md
@@ -32,7 +32,7 @@ license: |
 - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions.
 - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead.
 - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead of a wrapping value.
-- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
+- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
 - Since Spark 4.0, the legacy datetime rebasing SQL configs with the prefix `spark.sql.legacy` are removed. To restore the previous behavior, use the following configs:
   - `spark.sql.parquet.int96RebaseModeInWrite` instead of `spark.sql.legacy.parquet.int96RebaseModeInWrite`
   - `spark.sql.parquet.datetimeRebaseModeInWrite` instead of `spark.sql.legacy.parquet.datetimeRebaseModeInWrite`
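As context for the migration-guide entry above, restoring the pre-4.0 behavior for a JDK-only charset would look roughly like this (a sketch; `windows-1252` is just an illustrative JDK charset outside the new allow-list):

```scala
// Hypothetical example: windows-1252 is provided by the JDK but is not in
// Spark 4.0's charset allow-list, so encode() accepts it only while the
// legacy conf is enabled.
spark.conf.set("spark.sql.legacy.javaCharsets", "true")
spark.sql("SELECT encode('abc', 'windows-1252')").show()
```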
@@ -2646,7 +2646,7 @@ object Decode {
arguments = """
Arguments:
* bin - a binary expression to decode
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive.
""",
examples = """
Examples:
@@ -2690,7 +2690,7 @@ case class Decode(params: Seq[Expression], replacement: Expression)
arguments = """
Arguments:
* bin - a binary expression to decode
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive.
""",
since = "1.5.0",
group = "string_funcs")
@@ -2707,7 +2707,7 @@ case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Bo
 override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation)

 private val supportedCharsets = Set(
-"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16")
+"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

 protected override def nullSafeEval(input1: Any, input2: Any): Any = {
 val fromCharset = input2.asInstanceOf[UTF8String].toString
@@ -2762,7 +2762,7 @@ object StringDecode {
arguments = """
Arguments:
* str - a string expression
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to encode `str` into a BINARY. It is case insensitive.
* charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to encode `str` into a BINARY. It is case insensitive.
""",
examples = """
Examples:
@@ -2785,7 +2785,7 @@ case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean)
 Seq(StringTypeAnyCollation, StringTypeAnyCollation)

 private val supportedCharsets = Set(
-"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16")
+"US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

 protected override def nullSafeEval(input1: Any, input2: Any): Any = {
 val toCharset = input2.asInstanceOf[UTF8String].toString
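The enablement itself is only the extra `'UTF-32'` entry in each `supportedCharsets` set above; per the docstrings, the charset argument is matched case-insensitively, so the surrounding validation amounts to a case-insensitive membership test. A minimal standalone sketch with hypothetical names (not the actual Spark internals):

```scala
import java.util.Locale

// Case-insensitive allow-list check, analogous to the supportedCharsets
// sets in the diff above (hypothetical standalone names).
val supportedCharsets =
  Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")

def isSupported(charset: String): Boolean =
  supportedCharsets.contains(charset.toUpperCase(Locale.ROOT))

assert(isSupported("utf-32"))        // "It is case insensitive."
assert(!isSupported("windows-1252")) // rejected unless spark.sql.legacy.javaCharsets is set
```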
@@ -489,6 +489,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 // non ascii characters are not allowed in the code, so we disable the scalastyle here.
 checkEvaluation(
 StringDecode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界")
+checkEvaluation(
+StringDecode(Encode(Literal("大千世界"), Literal("UTF-32")), Literal("UTF-32")), "大千世界")
 checkEvaluation(
 StringDecode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界"))
 checkEvaluation(
@@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8),
 +- OneRowRelation


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query analysis
+Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x]
++- OneRowRelation
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query analysis
@@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8),
 +- OneRowRelation


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query analysis
+Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x]
++- OneRowRelation
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query analysis
@@ -131,6 +131,7 @@ select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
 select decode();
 select decode(encode('abc', 'utf-8'));
 select decode(encode('abc', 'utf-8'), 'utf-8');
+select decode(encode('大千世界', 'utf-32'), 'utf-32');
 select decode(1, 1, 'Southlake');
 select decode(2, 1, 'Southlake');
 select decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
@@ -961,6 +961,14 @@ struct<decode(encode(abc, utf-8), utf-8):string>
 abc


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query schema
+struct<decode(encode(大千世界, utf-32), utf-32):string>
+-- !query output
+大千世界
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query schema
@@ -893,6 +893,14 @@ struct<decode(encode(abc, utf-8), utf-8):string>
 abc


+-- !query
+select decode(encode('大千世界', 'utf-32'), 'utf-32')
+-- !query schema
+struct<decode(encode(大千世界, utf-32), utf-32):string>
+-- !query output
+大千世界
+
+
 -- !query
 select decode(1, 1, 'Southlake')
 -- !query schema
