
Refactored to address PR comments
tdas committed Dec 1, 2015
1 parent 0c5fe55 commit a1d3f75
Showing 3 changed files with 30 additions and 11 deletions.
@@ -138,18 +138,16 @@ class InternalTrackStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag]
     // If the RDD is not partitioned the right way, let us repartition it using the
     // partition index as the key. This is to ensure that state RDD is always partitioned
     // before creating another state RDD using it
-    val kvRDD = rdd.mapPartitions { iter =>
-      iter.map { x => (TaskContext.get().partitionId(), x)}
-    }
-    kvRDD.partitionBy(partitioner).mapPartitions(iter => iter.map { _._2 },
-      preservesPartitioning = true)
+    TrackStateRDD.createFromRDD[K, V, S, E](
+      rdd.flatMap { _.stateMap.getAll() }, partitioner, validTime)
   } else {
     rdd
   }
 case None =>
   TrackStateRDD.createFromPairRDD[K, V, S, E](
     spec.getInitialStateRDD().getOrElse(new EmptyRDD[(K, S)](ssc.sparkContext)),
-    partitioner, validTime
+    partitioner,
+    validTime
   )
 }
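
For context on the replaced branch above: the old code keyed every record by its current partition index before calling partitionBy, which shuffles whole partitions around rather than placing individual keys where the partitioner expects them. The new code instead flattens each partition's state map into (key, state, updateTime) triples and rebuilds the state RDD keyed by the actual keys. A minimal sketch of that data flow, assuming stateMap.getAll() yields an Iterator[(K, S, Long)] as the createFromRDD signature below suggests (val names are illustrative):

    // Each partition of the previous state RDD holds one TrackStateRDDRecord.
    // Flatten every record's state map into (key, state, updateTimeMs) triples.
    val triples: RDD[(K, S, Long)] = rdd.flatMap { record => record.stateMap.getAll() }
    // Rebuild a state RDD partitioned by the real keys, so each key lands in
    // the partition that `partitioner` assigns to it.
    val repartitionedStateRDD =
      TrackStateRDD.createFromRDD[K, V, S, E](triples, partitioner, validTime)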

@@ -179,22 +179,43 @@ private[streaming] class TrackStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag]

 private[streaming] object TrackStateRDD {

-  def createFromPairRDD[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
+  def createFromPairRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
       pairRDD: RDD[(K, S)],
       partitioner: Partitioner,
-      updateTime: Time): TrackStateRDD[K, V, S, T] = {
+      updateTime: Time): TrackStateRDD[K, V, S, E] = {

     val rddOfTrackStateRecords = pairRDD.partitionBy(partitioner).mapPartitions ({ iterator =>
       val stateMap = StateMap.create[K, S](SparkEnv.get.conf)
       iterator.foreach { case (key, state) => stateMap.put(key, state, updateTime.milliseconds) }
-      Iterator(TrackStateRDDRecord(stateMap, Seq.empty[T]))
+      Iterator(TrackStateRDDRecord(stateMap, Seq.empty[E]))
     }, preservesPartitioning = true)

     val emptyDataRDD = pairRDD.sparkContext.emptyRDD[(K, V)].partitionBy(partitioner)

     val noOpFunc = (time: Time, key: K, value: Option[V], state: State[S]) => None

-    new TrackStateRDD[K, V, S, T](rddOfTrackStateRecords, emptyDataRDD, noOpFunc, updateTime, None)
+    new TrackStateRDD[K, V, S, E](rddOfTrackStateRecords, emptyDataRDD, noOpFunc, updateTime, None)
   }
+
+  def createFromRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
+      rdd: RDD[(K, S, Long)],
+      partitioner: Partitioner,
+      updateTime: Time): TrackStateRDD[K, V, S, E] = {
+
+    val pairRDD = rdd.map { x => (x._1, (x._2, x._3)) }
+    val rddOfTrackStateRecords = pairRDD.partitionBy(partitioner).mapPartitions({ iterator =>
+      val stateMap = StateMap.create[K, S](SparkEnv.get.conf)
+      iterator.foreach { case (key, (state, updateTime)) =>
+        stateMap.put(key, state, updateTime)
+      }
+      Iterator(TrackStateRDDRecord(stateMap, Seq.empty[E]))
+    }, preservesPartitioning = true)
+
+    val emptyDataRDD = pairRDD.sparkContext.emptyRDD[(K, V)].partitionBy(partitioner)
+
+    val noOpFunc = (time: Time, key: K, value: Option[V], state: State[S]) => None
+
+    new TrackStateRDD[K, V, S, E](rddOfTrackStateRecords, emptyDataRDD, noOpFunc, updateTime, None)
+  }
 }
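
A hedged usage sketch of the two factories (the SparkContext sc, the partitioner, and the batch time are hypothetical values, not part of this commit): createFromPairRDD stamps every seeded key with the single updateTime argument, whereas createFromRDD keeps the per-key timestamp carried in each triple, e.g. one recovered from an existing state RDD via stateMap.getAll().

    import org.apache.spark.HashPartitioner

    // Hypothetical seeding with K = String, V = String, S = Int, E = Int.
    val batchTime = Time(1000L)
    val partitioner = new HashPartitioner(2)

    // From (key, state) pairs: both keys get batchTime.milliseconds (1000 ms)
    // as their last-update timestamp.
    val fromPairs = TrackStateRDD.createFromPairRDD[String, String, Int, Int](
      sc.parallelize(Seq(("a", 1), ("b", 2))), partitioner, batchTime)

    // From (key, state, updateTimeMs) triples: "a" keeps 500 ms and "b" keeps
    // 900 ms, which matters when state timeouts compare against these values.
    val fromTriples = TrackStateRDD.createFromRDD[String, String, Int, Int](
      sc.parallelize(Seq(("a", 1, 500L), ("b", 2, 900L))), partitioner, batchTime)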

@@ -478,7 +478,7 @@ class TrackStateByKeySuite extends SparkFunSuite
 }


-  test("trackStateByKey - drivery failure recovery") {
+  test("trackStateByKey - driver failure recovery") {
     val inputData =
       Seq(
         Seq(),
