Skip to content

Commit

Permalink
[SPARK-35047][SQL] Allow Json datasources to write non-ascii characters as codepoints

Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

This PR proposes to enable the JSON datasources to write non-ascii characters as codepoints.
To enable/disable this feature, I introduce a new option `writeNonAsciiCharacterAsCodePoint` for JSON datasources.

### Why are the changes needed?

The JSON specification allows codepoints as literals, but Spark SQL's JSON datasources don't provide a way to write them.
It would be useful to write non-ascii characters as codepoints, which is a platform-neutral representation.

### Does this PR introduce _any_ user-facing change?

Yes. Users can write non-ascii characters as codepoints with JSON datasources.

### How was this patch tested?

New test.

Closes apache#32147 from sarutak/json-unicode-write.

Authored-by: Kousuke Saruta <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
sarutak authored and dongjoon-hyun committed Apr 29, 2021
1 parent 8a5af37 commit e8bf8fe
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ private[sql] class JSONOptions(
*/
val inferTimestamp: Boolean = parameters.get("inferTimestamp").exists(_.toBoolean)

/**
 * Whether non-ASCII characters should be written as codepoint escapes
 * (e.g. \u0000 style) instead of raw characters. Disabled by default.
 */
val writeNonAsciiCharacterAsCodePoint: Boolean =
  parameters.get("writeNonAsciiCharacterAsCodePoint").exists(_.toBoolean)

/** Build a Jackson [[JsonFactory]] using JSON options. */
def buildJsonFactory(): JsonFactory = {
new JsonFactoryBuilder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,13 @@ private[sql] class JacksonGenerator(

// The Jackson generator used to emit JSON. The root value separator is
// cleared because record separation is handled by this class itself.
private val gen = {
  val jsonGen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null)
  // Pretty-print with an empty root separator when the `pretty` option is set.
  if (options.pretty) jsonGen.setPrettyPrinter(new DefaultPrettyPrinter(""))
  // Escape every character above 0x7F as a codepoint when the option is set.
  if (options.writeNonAsciiCharacterAsCodePoint) jsonGen.setHighestNonEscapedChar(0x7F)
  jsonGen
}

private val lineSeparator: String = options.lineSeparatorInWrite
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2845,6 +2845,61 @@ abstract class JsonSuite
}
}

test("SPARK-35047: Write Non-ASCII character as codepoint") {
// scalastyle:off nonascii
withTempPaths(2) { paths =>
paths.foreach(_.delete())
// Input contains an ASCII letter, a control character, and a non-ASCII character.
val seq = Seq("a", "\n", "\u3042")
val df = seq.toDF

// Case 1: single-line (non-pretty) output. The non-ASCII character must be
// written as a codepoint escape in the raw text, and round-trip back to the
// original character when read with the JSON reader.
val basePath1 = paths(0).getCanonicalPath
df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
.option("pretty", "false").json(basePath1)
val actualText1 = spark.read.option("wholetext", "true").text(basePath1)
.sort("value").map(_.getString(0)).collect().mkString
val expectedText1 =
s"""{"value":"\\n"}
|{"value":"\\u3042"}
|{"value":"a"}
|""".stripMargin
assert(actualText1 === expectedText1)

// Reading the escaped output back must yield the original characters.
val actualJson1 = spark.read.json(basePath1)
.sort("value").map(_.getString(0)).collect().mkString
val expectedJson1 = "\na\u3042"
assert(actualJson1 === expectedJson1)

// Case 2: pretty-printed JSON.
// If multiLine option is set to true, the format should be
// one JSON record per file. So LEAF_NODE_DEFAULT_PARALLELISM is set here.
withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> s"${seq.length}") {
val basePath2 = paths(1).getCanonicalPath
df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
.option("pretty", "true").json(basePath2)
val actualText2 = spark.read.option("wholetext", "true").text(basePath2)
.sort("value").map(_.getString(0)).collect().mkString
val expectedText2 =
s"""{
| "value" : "\\n"
|}
|{
| "value" : "\\u3042"
|}
|{
| "value" : "a"
|}
|""".stripMargin
assert(actualText2 === expectedText2)

// Round-trip check for the pretty-printed (multi-line) form.
val actualJson2 = spark.read.option("multiLine", "true").json(basePath2)
.sort("value").map(_.getString(0)).collect().mkString
val expectedJson2 = "\na\u3042"
assert(actualJson2 === expectedJson2)
}
}
// scalastyle:on nonascii
}

test("SPARK-35104: Fix wrong indentation for multiple JSON even if `pretty` option is true") {
withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> "1") {
withTempPath { path =>
Expand Down

0 comments on commit e8bf8fe

Please sign in to comment.