-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union #15567
Changes from all commits
f3b9b10
1a66858
06a39e1
7840c95
ccd2fe7
1ca355e
9478fd6
bc4ea2c
7ffe0ed
38dcb7a
da9d261
2ec3206
e2ebd88
0de225d
6659795
80f26c6
ecb4f08
553c6a5
ababaa9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,17 +272,28 @@ trait Nondeterministic extends Expression { | |
final override def deterministic: Boolean = false | ||
final override def foldable: Boolean = false | ||
|
||
@transient | ||
private[this] var initialized = false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we change this to transient? then it will always get reset to false on a new partition. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will do |
||
|
||
final def setInitialValues(): Unit = { | ||
initInternal() | ||
/** | ||
* Initializes internal states given the current partition index and mark this as initialized. | ||
* Subclasses should override [[initializeInternal()]]. | ||
*/ | ||
final def initialize(partitionIndex: Int): Unit = { | ||
initializeInternal(partitionIndex) | ||
initialized = true | ||
} | ||
|
||
protected def initInternal(): Unit | ||
protected def initializeInternal(partitionIndex: Int): Unit | ||
|
||
/** | ||
* @inheritdoc | ||
* Throws an exception if [[initialize()]] is not called yet. | ||
* Subclasses should override [[evalInternal()]]. | ||
*/ | ||
final override def eval(input: InternalRow = null): Any = { | ||
require(initialized, "nondeterministic expression should be initialized before evaluate") | ||
require(initialized, | ||
s"Nondeterministic expression ${this.getClass.getName} should be initialized before eval.") | ||
evalInternal(input) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,16 +17,15 @@ | |
|
||
package org.apache.spark.sql.catalyst.expressions | ||
|
||
import org.apache.spark.TaskContext | ||
import org.apache.spark.sql.catalyst.InternalRow | ||
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} | ||
import org.apache.spark.sql.types.{DataType, IntegerType} | ||
|
||
/** | ||
* Expression that returns the current partition id of the Spark task. | ||
* Expression that returns the current partition id. | ||
*/ | ||
@ExpressionDescription( | ||
usage = "_FUNC_() - Returns the current partition id of the Spark task", | ||
usage = "_FUNC_() - Returns the current partition id", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmmm this is behavior changing, and there is some value to the old partition id ... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd consider introducing a new expression for the proper id and leave the old one as is. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about this. But I don't think the current behavior is the expected behavior from users. This is the same issue as in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea but it is consistent with TaskContext.partitionId (which is also the name of the function) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The name is |
||
extended = "> SELECT _FUNC_();\n 0") | ||
case class SparkPartitionID() extends LeafExpression with Nondeterministic { | ||
|
||
|
@@ -38,16 +37,16 @@ case class SparkPartitionID() extends LeafExpression with Nondeterministic { | |
|
||
override val prettyName = "SPARK_PARTITION_ID" | ||
|
||
override protected def initInternal(): Unit = { | ||
partitionId = TaskContext.getPartitionId() | ||
override protected def initializeInternal(partitionIndex: Int): Unit = { | ||
partitionId = partitionIndex | ||
} | ||
|
||
override protected def evalInternal(input: InternalRow): Int = partitionId | ||
|
||
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
val idTerm = ctx.freshName("partitionId") | ||
ctx.addMutableState(ctx.JAVA_INT, idTerm, | ||
s"$idTerm = org.apache.spark.TaskContext.getPartitionId();") | ||
ctx.addMutableState(ctx.JAVA_INT, idTerm, "") | ||
ctx.addPartitionInitializationStatement(s"$idTerm = partitionIndex;") | ||
ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTerm;", isNull = "false") | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -184,6 +184,20 @@ class CodegenContext { | |
splitExpressions(initCodes, "init", Nil) | ||
} | ||
|
||
/** | ||
* Code statements to initialize states that depend on the partition index. | ||
* An integer `partitionIndex` will be made available within the scope. | ||
*/ | ||
val partitionInitializationStatements: mutable.ArrayBuffer[String] = mutable.ArrayBuffer.empty | ||
|
||
def addPartitionInitializationStatement(statement: String): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any reason you are creating this rather than just using addMutableState? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a little worried about introducing more issues by moving |
||
partitionInitializationStatements += statement | ||
} | ||
|
||
def initPartition(): String = { | ||
partitionInitializationStatements.mkString("\n") | ||
} | ||
|
||
/** | ||
* Holding all the functions those will be added into generated class. | ||
*/ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we get rid of this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are 20+ probably valid use of
mapPartitionsInternal
. The main problem is that changing it tomapPartitionsWithIndexInternal
doesn't really force people to initialize the partition.