Skip to content

Commit

Permalink
[SPARK-31594][SQL] Do not display the seed of rand/randn with no argu…
Browse files Browse the repository at this point in the history
…ment in output schema

### What changes were proposed in this pull request?

This PR intends to update `sql` in `Rand`/`Randn` with no argument to make a column name deterministic.

Before this PR (a column name changes run-by-run):
```
scala> sql("select rand()").show()
+-------------------------+
|rand(7986133828002692830)|
+-------------------------+
|       0.9524061403696937|
+-------------------------+
```
After this PR (a column name fixed):
```
scala> sql("select rand()").show()
+------------------+
|            rand()|
+------------------+
|0.7137935639522275|
+------------------+

// If a seed given, it is still shown in a column name
// (the same with the current behaviour)
scala> sql("select rand(1)").show()
+------------------+
|           rand(1)|
+------------------+
|0.6363787615254752|
+------------------+

// We can still check a seed in explain output:
scala> sql("select rand()").explain()
== Physical Plan ==
*(1) Project [rand(-2282124938778456838) AS rand()#0]
+- *(1) Scan OneRowRelation[]
```

Note: This fix comes from #28194; the ongoing PR tests the output schema of expressions, so their schemas must be deterministic for the tests.

### Why are the changes needed?

To make output schema deterministic.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added unit tests.

Closes #28392 from maropu/SPARK-31594.

Authored-by: Takeshi Yamamuro <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
maropu authored and dongjoon-hyun committed Apr 29, 2020
1 parent 3680303 commit 97f2c03
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,12 @@ trait ExpressionWithRandomSeed {
""",
since = "1.5.0")
// scalastyle:on line.size.limit
case class Rand(child: Expression) extends RDG with ExpressionWithRandomSeed {
case class Rand(child: Expression, hideSeed: Boolean = false)
extends RDG with ExpressionWithRandomSeed {

def this() = this(Literal(Utils.random.nextLong(), LongType))
def this() = this(Literal(Utils.random.nextLong(), LongType), true)

def this(child: Expression) = this(child, false)

override def withNewSeed(seed: Long): Rand = Rand(Literal(seed, LongType))

Expand All @@ -101,7 +104,12 @@ case class Rand(child: Expression) extends RDG with ExpressionWithRandomSeed {
isNull = FalseLiteral)
}

override def freshCopy(): Rand = Rand(child)
override def freshCopy(): Rand = Rand(child, hideSeed)

override def flatArguments: Iterator[Any] = Iterator(child)
override def sql: String = {
s"rand(${if (hideSeed) "" else child.sql})"
}
}

object Rand {
Expand All @@ -126,9 +134,12 @@ object Rand {
""",
since = "1.5.0")
// scalastyle:on line.size.limit
case class Randn(child: Expression) extends RDG with ExpressionWithRandomSeed {
case class Randn(child: Expression, hideSeed: Boolean = false)
extends RDG with ExpressionWithRandomSeed {

def this() = this(Literal(Utils.random.nextLong(), LongType))
def this() = this(Literal(Utils.random.nextLong(), LongType), true)

def this(child: Expression) = this(child, false)

override def withNewSeed(seed: Long): Randn = Randn(Literal(seed, LongType))

Expand All @@ -144,7 +155,12 @@ case class Randn(child: Expression) extends RDG with ExpressionWithRandomSeed {
isNull = FalseLiteral)
}

override def freshCopy(): Randn = Randn(child)
override def freshCopy(): Randn = Randn(child, hideSeed)

override def flatArguments: Iterator[Any] = Iterator(child)
override def sql: String = {
s"randn(${if (hideSeed) "" else child.sql})"
}
}

object Randn {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,11 @@ class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(Rand(5419823303878592871L), 0.7145363364564755)
checkEvaluation(Randn(5419823303878592871L), 0.7816815274533012)
}

test("SPARK-31594: Do not display the seed of rand/randn with no argument in output schema") {
assert(Rand(Literal(1L), true).sql === "rand()")
assert(Randn(Literal(1L), true).sql === "randn()")
assert(Rand(Literal(1L), false).sql === "rand(1L)")
assert(Randn(Literal(1L), false).sql === "randn(1L)")
}
}
23 changes: 23 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3425,6 +3425,29 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
assert(SQLConf.get.getConf(SQLConf.CODEGEN_FALLBACK) === true)
}
}

test("SPARK-31594: Do not display the seed of rand/randn with no argument in output schema") {
def checkIfSeedExistsInExplain(df: DataFrame): Unit = {
val output = new java.io.ByteArrayOutputStream()
Console.withOut(output) {
df.explain()
}
val projectExplainOutput = output.toString.split("\n").find(_.contains("Project")).get
assert(projectExplainOutput.matches(""".*randn?\(-?[0-9]+\).*"""))
}
val df1 = sql("SELECT rand()")
assert(df1.schema.head.name === "rand()")
checkIfSeedExistsInExplain(df1)
val df2 = sql("SELECT rand(1L)")
assert(df2.schema.head.name === "rand(1)")
checkIfSeedExistsInExplain(df2)
val df3 = sql("SELECT randn()")
assert(df3.schema.head.name === "randn()")
checkIfSeedExistsInExplain(df1)
val df4 = sql("SELECT randn(1L)")
assert(df4.schema.head.name === "randn(1)")
checkIfSeedExistsInExplain(df2)
}
}

case class Foo(bar: Option[String])

0 comments on commit 97f2c03

Please sign in to comment.