[SPARK-18758][SS] StreamingQueryListener events from a StreamingQuery should be sent only to the listeners in the same session as the query #16186
```diff
@@ -70,11 +70,11 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext)
 
   def schema: StructType = encoder.schema
 
-  def toDS()(implicit sqlContext: SQLContext): Dataset[A] = {
+  def toDS(): Dataset[A] = {
     Dataset(sqlContext.sparkSession, logicalPlan)
   }
 
-  def toDF()(implicit sqlContext: SQLContext): DataFrame = {
+  def toDF(): DataFrame = {
     Dataset.ofRows(sqlContext.sparkSession, logicalPlan)
   }
```

Comment (on the removed implicit parameters): removed this because it is not needed; the sqlContext is in the constructor.
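To illustrate the effect of this change, here is a minimal, hypothetical call-site sketch (not part of the PR; the helper name is made up): since `MemoryStream` captures its `SQLContext` at construction time, `toDS()` no longer needs an implicit session in scope.

```scala
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

// Hypothetical helper, for illustration only: the implicit SQLContext is
// needed to construct the stream (and to supply the Encoder[Int]), but
// toDS() itself now uses the session captured in the constructor.
def makeStream()(implicit sqlContext: SQLContext): Dataset[Int] = {
  import sqlContext.implicits._   // provides Encoder[Int]
  val input = MemoryStream[Int]   // session captured at construction
  input.toDS()                    // no implicit SQLContext required here
}
```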
```diff
@@ -231,8 +231,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts {
       outputMode: OutputMode = OutputMode.Append)(actions: StreamAction*): Unit = {
 
     val stream = _stream.toDF()
-    var pos = 0
+    val sparkSession = stream.sparkSession // use the session in DF, not the default session
     var currentPlan: LogicalPlan = stream.logicalPlan
     var currentStream: StreamExecution = null
    var lastStream: StreamExecution = null
    val awaiting = new mutable.HashMap[Int, Offset]() // source index -> offset to wait for
```

Comment (on the removed `var pos`): not used.
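The inline comment on the added line is the crux of this PR for tests: a Dataset remembers the session that created it, which may be a cloned session rather than the shared default one, and listener events for a query are delivered only to listeners of the session that started it. A small illustrative sketch, assuming a `spark` session in scope as in SharedSQLContext:

```scala
// Illustrative only: two sessions share one SparkContext, but a DataFrame
// is tied to the session that created it, so the test harness must start
// queries on that session, not on the default one.
val session2 = spark.newSession()      // isolated confs/listeners, shared context
val df = session2.range(0, 10).toDF()  // the Dataset remembers its session
assert(df.sparkSession eq session2)    // not the default `spark` session
```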
```diff
@@ -319,7 +319,6 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts {
         """.stripMargin)
     }
 
     val testThread = Thread.currentThread()
-    val metadataRoot = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath
     var manualClockExpectedTime = -1L
     try {
```

Comment (on the removed `metadataRoot`): not used.
```diff
@@ -337,14 +336,16 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts {
 
     additionalConfs.foreach(pair => {
       val value =
-        if (spark.conf.contains(pair._1)) Some(spark.conf.get(pair._1)) else None
+        if (sparkSession.conf.contains(pair._1)) {
+          Some(sparkSession.conf.get(pair._1))
+        } else None
       resetConfValues(pair._1) = value
-      spark.conf.set(pair._1, pair._2)
+      sparkSession.conf.set(pair._1, pair._2)
     })
 
     lastStream = currentStream
     currentStream =
-      spark
+      sparkSession
        .streams
        .startQuery(
          None,
```
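The remember-then-restore dance around `resetConfValues` is a common pattern; a generic, hypothetical helper capturing it (not part of the PR, names are illustrative) might look like this:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical utility, for illustration: remember the previous value of a
// conf key as an Option, set the new value, run the body, and then restore
// the old value (or unset the key if it was absent before).
def withConf[T](session: SparkSession, key: String, value: String)(body: => T): T = {
  val previous =
    if (session.conf.contains(key)) Some(session.conf.get(key)) else None
  session.conf.set(key, value)
  try {
    body
  } finally {
    previous match {
      case Some(v) => session.conf.set(key, v)
      case None    => session.conf.unset(key)
    }
  }
}
```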
```diff
@@ -518,8 +519,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts {
 
       // Rollback prev configuration values
       resetConfValues.foreach {
-        case (key, Some(value)) => spark.conf.set(key, value)
-        case (key, None) => spark.conf.unset(key)
+        case (key, Some(value)) => sparkSession.conf.set(key, value)
+        case (key, None) => sparkSession.conf.unset(key)
       }
     }
   }
```
Comment: Just a quick question: why do we use runIds instead of the ids of the streams? We already don't want concurrent runs for a stream, since the offset log directories would get messed up. Wait, ok, got it: onStart callbacks are called synchronously, whereas onTermination callbacks are asynchronous, so we can get a second run's start report before the first run's termination is delivered. Do you think that's worth adding to the docs? You don't have to if you don't need a second pass.
Comment: +1, it's worth documenting. This is different from Spark's other listener buses because of the synchronous event.
Comment: Even if this behavior were not different (that is, all async), this component should not be responsible for preventing concurrent runs; it should stay simple and not deal with such issues. I have added more docs on why runIds are used instead of ids.
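To make the runId-vs-id point concrete, here is a simplified sketch (not Spark's actual StreamingQueryListenerBus; all names are illustrative) of a per-session bus that tracks active runIds. Because start events are posted synchronously while termination events arrive asynchronously, a restarted query (same id, fresh runId) can register before the previous run's termination event is delivered; keying on runId keeps the two runs' events apart.

```scala
import java.util.UUID
import scala.collection.mutable

// Illustrative sketch only, not the actual Spark implementation.
class SessionListenerBusSketch {
  private val activeRunIds = mutable.HashSet[UUID]()

  // Start events are posted synchronously from the query thread, so the
  // runId is registered before the query makes any progress.
  def onQueryStarted(runId: UUID): Unit = synchronized {
    activeRunIds += runId
  }

  // Termination events arrive asynchronously on the bus thread, possibly
  // after the next run (same query id, new runId) has already started.
  // Removing by runId cannot accidentally deregister the new run.
  def onQueryTerminated(runId: UUID): Unit = synchronized {
    activeRunIds -= runId
  }

  // Deliver an event only if it belongs to a run started in this session.
  def shouldDeliver(runId: UUID): Boolean = synchronized {
    activeRunIds.contains(runId)
  }
}
```

Had the set been keyed on the query id instead, the asynchronous termination of run 1 could remove the entry just registered by run 2 of the same query, silently dropping that run's events.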