From 6d0633e3ec9518278fcc7eba58549d4ad3d5813f Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Thu, 14 May 2015 19:49:44 -0700
Subject: [PATCH 1/7] [SPARK-7548] [SQL] Add explode function for DataFrames

Add an `explode` function for dataframes and modify the analyzer so that single table generating functions can be present in a select clause along with other expressions.   There are currently the following restrictions:
 - only top level TGFs are allowed (i.e. no `select(explode('list) + 1)`)
 - only one may be present in a single select to avoid potentially confusing implicit Cartesian products.

TODO:
 - [ ] Python

Author: Michael Armbrust <michael@databricks.com>

Closes #6107 from marmbrus/explodeFunction and squashes the following commits:

7ee2c87 [Michael Armbrust] whitespace
6f80ba3 [Michael Armbrust] Update dataframe.py
c176c89 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into explodeFunction
81b5da3 [Michael Armbrust] style
d3faa05 [Michael Armbrust] fix self join case
f9e1e3e [Michael Armbrust] fix python, add since
4f0d0a9 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into explodeFunction
e710fe4 [Michael Armbrust] add java and python
52ca0dc [Michael Armbrust] [SPARK-7548][SQL] Add explode function for dataframes.
---
 python/pyspark/sql/dataframe.py               |  12 +-
 python/pyspark/sql/functions.py               |  20 +++
 python/pyspark/sql/tests.py                   |  15 +++
 .../sql/catalyst/analysis/Analyzer.scala      | 117 +++++++++++-------
 .../plans/logical/basicOperators.scala        |   3 +
 .../sql/catalyst/analysis/AnalysisSuite.scala |  10 +-
 .../scala/org/apache/spark/sql/Column.scala   |  27 +++-
 .../org/apache/spark/sql/DataFrame.scala      |   5 +-
 .../org/apache/spark/sql/functions.scala      |   5 +
 .../spark/sql/ColumnExpressionSuite.scala     |  60 +++++++++
 10 files changed, 223 insertions(+), 51 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 82cb1c2fdbf94..2ed95ac8e2505 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1511,13 +1511,19 @@ def inSet(self, *cols):
     isNull = _unary_op("isNull", "True if the current expression is null.")
     isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
 
-    def alias(self, alias):
-        """Return a alias for this column
+    def alias(self, *alias):
+        """Returns this column aliased with a new name or names (in the case of expressions that
+        return more than one column, such as explode).
 
         >>> df.select(df.age.alias("age2")).collect()
         [Row(age2=2), Row(age2=5)]
         """
-        return Column(getattr(self._jc, "as")(alias))
+
+        if len(alias) == 1:
+            return Column(getattr(self._jc, "as")(alias[0]))
+        else:
+            sc = SparkContext._active_spark_context
+            return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias))))
 
     @ignore_unicode_prefix
     def cast(self, dataType):
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d91265ee0bec8..6cd6974b0e5bb 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -169,6 +169,26 @@ def approxCountDistinct(col, rsd=None):
     return Column(jc)
 
 
+def explode(col):
+    """Returns a new row for each element in the given array or map.
+
+    >>> from pyspark.sql import Row
+    >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
+    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
+    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]
+
+    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
+    +---+-----+
+    |key|value|
+    +---+-----+
+    |  a|    b|
+    +---+-----+
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.explode(_to_java_column(col))
+    return Column(jc)
+
+
 def coalesce(*cols):
     """Returns the first column that is not null.
 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 1922d03af61da..d37c5dbed7f6b 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -117,6 +117,21 @@ def tearDownClass(cls):
         ReusedPySparkTestCase.tearDownClass()
         shutil.rmtree(cls.tempdir.name, ignore_errors=True)
 
+    def test_explode(self):
+        from pyspark.sql.functions import explode
+        d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
+        rdd = self.sc.parallelize(d)
+        data = self.sqlCtx.createDataFrame(rdd)
+
+        result = data.select(explode(data.intlist).alias("a")).select("a").collect()
+        self.assertEqual(result[0][0], 1)
+        self.assertEqual(result[1][0], 2)
+        self.assertEqual(result[2][0], 3)
+
+        result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
+        self.assertEqual(result[0][0], "a")
+        self.assertEqual(result[0][1], "b")
+
     def test_udf_with_callable(self):
         d = [Row(number=i, squared=i**2) for i in range(10)]
         rdd = self.sc.parallelize(d)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 4baeeb5b58c2d..0b6e1d44b9c4d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -73,7 +73,6 @@ class Analyzer(
       ResolveGroupingAnalytics ::
       ResolveSortReferences ::
       ResolveGenerate ::
-      ImplicitGenerate ::
       ResolveFunctions ::
       ExtractWindowExpressions ::
       GlobalAggregates ::
@@ -323,6 +322,11 @@ class Analyzer(
               if findAliases(aggregateExpressions).intersect(conflictingAttributes).nonEmpty =>
             (oldVersion, oldVersion.copy(aggregateExpressions = newAliases(aggregateExpressions)))
 
+          case oldVersion: Generate
+              if oldVersion.generatedSet.intersect(conflictingAttributes).nonEmpty =>
+            val newOutput = oldVersion.generatorOutput.map(_.newInstance())
+            (oldVersion, oldVersion.copy(generatorOutput = newOutput))
+
           case oldVersion @ Window(_, windowExpressions, _, child)
               if AttributeSet(windowExpressions.map(_.toAttribute)).intersect(conflictingAttributes)
                 .nonEmpty =>
@@ -521,66 +525,89 @@ class Analyzer(
   }
 
   /**
-   * When a SELECT clause has only a single expression and that expression is a
-   * [[catalyst.expressions.Generator Generator]] we convert the
-   * [[catalyst.plans.logical.Project Project]] to a [[catalyst.plans.logical.Generate Generate]].
+   * Rewrites table generating expressions that either need one or more of the following in order
+   * to be resolved:
+   *  - concrete attribute references for their output.
+   *  - to be relocated from a SELECT clause (i.e. from  a [[Project]]) into a [[Generate]]).
+   *
+   * Names for the output [[Attributes]] are extracted from [[Alias]] or [[MultiAlias]] expressions
+   * that wrap the [[Generator]]. If more than one [[Generator]] is found in a Project, an
+   * [[AnalysisException]] is throw.
    */
-  object ImplicitGenerate extends Rule[LogicalPlan] {
+  object ResolveGenerate extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      case Project(Seq(Alias(g: Generator, name)), child) =>
-        Generate(g, join = false, outer = false,
-          qualifier = None, UnresolvedAttribute(name) :: Nil, child)
-      case Project(Seq(MultiAlias(g: Generator, names)), child) =>
-        Generate(g, join = false, outer = false,
-          qualifier = None, names.map(UnresolvedAttribute(_)), child)
+      case p: Generate if !p.child.resolved || !p.generator.resolved => p
+      case g: Generate if g.resolved == false =>
+          g.copy(
+            generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name)))
+
+      case p @ Project(projectList, child) =>
+        // Holds the resolved generator, if one exists in the project list.
+        var resolvedGenerator: Generate = null
+
+        val newProjectList = projectList.flatMap {
+          case AliasedGenerator(generator, names) if generator.childrenResolved =>
+            if (resolvedGenerator != null) {
+              failAnalysis(
+                s"Only one generator allowed per select but ${resolvedGenerator.nodeName} and " +
+                s"and ${generator.nodeName} found.")
+            }
+
+            resolvedGenerator =
+              Generate(
+                generator,
+                join = projectList.size > 1, // Only join if there are other expressions in SELECT.
+                outer = false,
+                qualifier = None,
+                generatorOutput = makeGeneratorOutput(generator, names),
+                child)
+
+            resolvedGenerator.generatorOutput
+          case other => other :: Nil
+        }
+
+        if (resolvedGenerator != null) {
+          Project(newProjectList, resolvedGenerator)
+        } else {
+          p
+        }
     }
-  }
 
-  /**
-   * Resolve the Generate, if the output names specified, we will take them, otherwise
-   * we will try to provide the default names, which follow the same rule with Hive.
-   */
-  object ResolveGenerate extends Rule[LogicalPlan] {
-    // Construct the output attributes for the generator,
-    // The output attribute names can be either specified or
-    // auto generated.
+    /** Extracts a [[Generator]] expression and any names assigned by aliases to their output. */
+    private object AliasedGenerator {
+      def unapply(e: Expression): Option[(Generator, Seq[String])] = e match {
+        case Alias(g: Generator, name) => Some((g, name :: Nil))
+        case MultiAlias(g: Generator, names) => Some(g, names)
+        case _ => None
+      }
+    }
+
+    /**
+     * Construct the output attributes for a [[Generator]], given a list of names.  If the list of
+     * names is empty names are assigned by ordinal (i.e., _c0, _c1, ...) to match Hive's defaults.
+     */
     private def makeGeneratorOutput(
         generator: Generator,
-        generatorOutput: Seq[Attribute]): Seq[Attribute] = {
+        names: Seq[String]): Seq[Attribute] = {
       val elementTypes = generator.elementTypes
 
-      if (generatorOutput.length == elementTypes.length) {
-        generatorOutput.zip(elementTypes).map {
-          case (a, (t, nullable)) if !a.resolved =>
-            AttributeReference(a.name, t, nullable)()
-          case (a, _) => a
+      if (names.length == elementTypes.length) {
+        names.zip(elementTypes).map {
+          case (name, (t, nullable)) =>
+            AttributeReference(name, t, nullable)()
         }
-      } else if (generatorOutput.length == 0) {
+      } else if (names.isEmpty) {
         elementTypes.zipWithIndex.map {
           // keep the default column names as Hive does _c0, _c1, _cN
           case ((t, nullable), i) => AttributeReference(s"_c$i", t, nullable)()
         }
       } else {
-        throw new AnalysisException(
-          s"""
-             |The number of aliases supplied in the AS clause does not match
-             |the number of columns output by the UDTF expected
-             |${elementTypes.size} aliases but got ${generatorOutput.size}
-           """.stripMargin)
+        failAnalysis(
+          "The number of aliases supplied in the AS clause does not match the number of columns " +
+          s"output by the UDTF expected ${elementTypes.size} aliases but got " +
+          s"${names.mkString(",")} ")
       }
     }
-
-    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      case p: Generate if !p.child.resolved || !p.generator.resolved => p
-      case p: Generate if p.resolved == false =>
-        // if the generator output names are not specified, we will use the default ones.
-        Generate(
-          p.generator,
-          join = p.join,
-          outer = p.outer,
-          p.qualifier,
-          makeGeneratorOutput(p.generator, p.generatorOutput), p.child)
-    }
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 0f349f9d11415..01f4b6e9bb77d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -59,6 +59,9 @@ case class Generate(
     child: LogicalPlan)
   extends UnaryNode {
 
+  /** The set of all attributes produced by this node. */
+  def generatedSet: AttributeSet = AttributeSet(generatorOutput)
+
   override lazy val resolved: Boolean = {
     generator.resolved &&
       childrenResolved &&
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 6f2f35564d12e..e1d6ac462fbcc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -72,6 +72,9 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
       StructField("cField", StringType) :: Nil
     ))())
 
+  val listRelation = LocalRelation(
+    AttributeReference("list", ArrayType(IntegerType))())
+
   before {
     caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
     caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
@@ -159,10 +162,15 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
         }
       }
 
-      errorMessages.foreach(m => assert(error.getMessage contains m))
+      errorMessages.foreach(m => assert(error.getMessage.toLowerCase contains m.toLowerCase))
     }
   }
 
+  errorTest(
+    "too many generators",
+    listRelation.select(Explode('list).as('a), Explode('list).as('b)),
+    "only one generator" :: "explode" :: Nil)
+
   errorTest(
     "unresolved attributes",
     testRelation.select('abcd),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 8bf1320ccb71d..dc0aeea7c4aea 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -18,12 +18,13 @@
 package org.apache.spark.sql
 
 import scala.language.implicitConversions
+import scala.collection.JavaConversions._
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.Logging
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar, UnresolvedExtractValue}
+import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedAttribute, UnresolvedStar, UnresolvedExtractValue}
 import org.apache.spark.sql.types._
 
 
@@ -727,6 +728,30 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    */
   def as(alias: String): Column = Alias(expr, alias)()
 
+  /**
+   * (Scala-specific) Assigns the given aliases to the results of a table generating function.
+   * {{{
+   *   // Renames colA to colB in select output.
+   *   df.select(explode($"myMap").as("key" :: "value" :: Nil))
+   * }}}
+   *
+   * @group expr_ops
+   * @since 1.4.0
+   */
+  def as(aliases: Seq[String]): Column = MultiAlias(expr, aliases)
+
+  /**
+   * Assigns the given aliases to the results of a table generating function.
+   * {{{
+   *   // Renames colA to colB in select output.
+   *   df.select(explode($"myMap").as("key" :: "value" :: Nil))
+   * }}}
+   *
+   * @group expr_ops
+   * @since 1.4.0
+   */
+  def as(aliases: Array[String]): Column = MultiAlias(expr, aliases)
+
   /**
    * Gives the column an alias.
    * {{{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 4fd5105c27443..2e20c3d3f4ed2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -34,7 +34,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedAttribute, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.analysis.{MultiAlias, ResolvedStar, UnresolvedAttribute, UnresolvedRelation}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
@@ -593,6 +593,9 @@ class DataFrame private[sql](
   def select(cols: Column*): DataFrame = {
     val namedExpressions = cols.map {
       case Column(expr: NamedExpression) => expr
+      // Leave an unaliased explode with an empty list of names since the analzyer will generate the
+      // correct defaults after the nested expression's type has been resolved.
+      case Column(explode: Explode) => MultiAlias(explode, Nil)
       case Column(expr: Expression) => Alias(expr, expr.prettyString)()
     }
     // When user continuously call `select`, speed up analysis by collapsing `Project`
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4404ad8ad63a8..6640631cf0719 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -363,6 +363,11 @@ object functions {
   @scala.annotation.varargs
   def coalesce(e: Column*): Column = Coalesce(e.map(_.expr))
 
+  /**
+   * Creates a new row for each element in the given array or map column.
+   */
+   def explode(e: Column): Column = Explode(e.expr)
+
   /**
    * Converts a string exprsesion to lower case.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 269e185543059..9bdf201b3be7c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -27,6 +27,66 @@ import org.apache.spark.sql.types._
 class ColumnExpressionSuite extends QueryTest {
   import org.apache.spark.sql.TestData._
 
+  test("single explode") {
+    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    checkAnswer(
+      df.select(explode('intList)),
+      Row(1) :: Row(2) :: Row(3) :: Nil)
+  }
+
+  test("explode and other columns") {
+    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+
+    checkAnswer(
+      df.select($"a", explode('intList)),
+      Row(1, 1) ::
+      Row(1, 2) ::
+      Row(1, 3) :: Nil)
+
+    checkAnswer(
+      df.select($"*", explode('intList)),
+      Row(1, Seq(1,2,3), 1) ::
+      Row(1, Seq(1,2,3), 2) ::
+      Row(1, Seq(1,2,3), 3) :: Nil)
+  }
+
+  test("aliased explode") {
+    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+
+    checkAnswer(
+      df.select(explode('intList).as('int)).select('int),
+      Row(1) :: Row(2) :: Row(3) :: Nil)
+
+    checkAnswer(
+      df.select(explode('intList).as('int)).select(sum('int)),
+      Row(6) :: Nil)
+  }
+
+  test("explode on map") {
+    val df = Seq((1, Map("a" -> "b"))).toDF("a", "map")
+
+    checkAnswer(
+      df.select(explode('map)),
+      Row("a", "b"))
+  }
+
+  test("explode on map with aliases") {
+    val df = Seq((1, Map("a" -> "b"))).toDF("a", "map")
+
+    checkAnswer(
+      df.select(explode('map).as("key1" :: "value1" :: Nil)).select("key1", "value1"),
+      Row("a", "b"))
+  }
+
+  test("self join explode") {
+    val df = Seq((1, Seq(1,2,3))).toDF("a", "intList")
+    val exploded = df.select(explode('intList).as('i))
+
+    checkAnswer(
+      exploded.join(exploded, exploded("i") === exploded("i")).agg(count("*")),
+      Row(3) :: Nil)
+  }
+
   test("collect on column produced by a binary operator") {
     val df = Seq((1, 2, 3)).toDF("a", "b", "c")
     checkAnswer(df.select(df("a") + df("b")), Seq(Row(3)))

From f9705d461350c6fccf8022e933ea909f40c53576 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Thu, 14 May 2015 20:49:21 -0700
Subject: [PATCH 2/7] [SPARK-7098][SQL] Make the WHERE clause with timestamp
 show consistent result

JIRA: https://issues.apache.org/jira/browse/SPARK-7098

The WHERE clause with timstamp shows inconsistent results. This pr fixes it.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #5682 from viirya/consistent_timestamp and squashes the following commits:

171445a [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into consistent_timestamp
4e98520 [Liang-Chi Hsieh] Make the WHERE clause with timestamp show consistent result.
---
 .../spark/sql/catalyst/analysis/HiveTypeCoercion.scala      | 6 +++---
 .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 ++++
 sql/core/src/test/scala/org/apache/spark/sql/TestData.scala | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 168a4e30eab86..fe0d3f29977c3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -251,10 +251,10 @@ trait HiveTypeCoercion {
         p.makeCopy(Array(Cast(p.left, StringType), p.right))
       case p: BinaryComparison if p.left.dataType == StringType &&
                                   p.right.dataType == TimestampType =>
-        p.makeCopy(Array(p.left, Cast(p.right, StringType)))
+        p.makeCopy(Array(Cast(p.left, TimestampType), p.right))
       case p: BinaryComparison if p.left.dataType == TimestampType &&
                                   p.right.dataType == StringType =>
-        p.makeCopy(Array(Cast(p.left, StringType), p.right))
+        p.makeCopy(Array(p.left, Cast(p.right, TimestampType)))
       case p: BinaryComparison if p.left.dataType == TimestampType &&
                                   p.right.dataType == DateType =>
         p.makeCopy(Array(Cast(p.left, StringType), Cast(p.right, StringType)))
@@ -274,7 +274,7 @@ trait HiveTypeCoercion {
         i.makeCopy(Array(Cast(a, StringType), b))
       case i @ In(a, b) if a.dataType == TimestampType &&
                            b.forall(_.dataType == StringType) =>
-        i.makeCopy(Array(Cast(a, StringType), b))
+        i.makeCopy(Array(a, b.map(Cast(_, TimestampType))))
       case i @ In(a, b) if a.dataType == DateType &&
                            b.forall(_.dataType == TimestampType) =>
         i.makeCopy(Array(Cast(a, StringType), b.map(Cast(_, StringType))))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 8cdbe076cbd85..479ad9fe621d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -297,6 +297,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-3173 Timestamp support in the parser") {
+    checkAnswer(sql(
+      "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.0'"),
+      Row(java.sql.Timestamp.valueOf("1969-12-31 16:00:00")))
+
     checkAnswer(sql(
       "SELECT time FROM timestamps WHERE time=CAST('1969-12-31 16:00:00.001' AS TIMESTAMP)"),
       Row(java.sql.Timestamp.valueOf("1969-12-31 16:00:00.001")))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
index 446771ab2a5a5..8fbc2d23d47e6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -175,7 +175,7 @@ object TestData {
       "4, D4, true, 2147483644" :: Nil)
 
   case class TimestampField(time: Timestamp)
-  val timestamps = TestSQLContext.sparkContext.parallelize((1 to 3).map { i =>
+  val timestamps = TestSQLContext.sparkContext.parallelize((0 to 3).map { i =>
     TimestampField(new Timestamp(i))
   })
   timestamps.toDF().registerTempTable("timestamps")

From e8f0e016eaf80a363796dd0a094291dcb3b35793 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Fri, 15 May 2015 12:04:26 +0800
Subject: [PATCH 3/7] [SQL] When creating partitioned table scan, explicitly
 create UnionRDD.

Otherwise, it will cause stack overflow when there are many partitions.

Author: Yin Huai <yhuai@databricks.com>

Closes #6162 from yhuai/partitionUnionedRDD and squashes the following commits:

fa016d8 [Yin Huai] Explicitly create UnionRDD.
---
 .../apache/spark/sql/sources/DataSourceStrategy.scala | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index a5410cda0fe6b..ee099ab9593c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{UnionRDD, RDD}
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
@@ -169,9 +169,12 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       scan.execute()
     }
 
-    val unionedRows = perPartitionRows.reduceOption(_ ++ _).getOrElse {
-      relation.sqlContext.emptyResult
-    }
+    val unionedRows =
+      if (perPartitionRows.length == 0) {
+        relation.sqlContext.emptyResult
+      } else {
+        new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows)
+      }
 
     createPhysicalRDD(logicalRelation.relation, output, unionedRows)
   }

From 7da33ce5057ff965eec19ce662465b64a3564019 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 14 May 2015 23:17:41 -0700
Subject: [PATCH 4/7] [HOTFIX] Add workaround for SPARK-7660 to fix
 JavaAPISuite failures.

---
 .../spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java    | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 730d265c87f88..78e52643531e0 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -35,6 +35,7 @@
 import org.mockito.MockitoAnnotations;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
+import org.xerial.snappy.buffer.CachedBufferAllocator;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.lessThan;
@@ -96,6 +97,13 @@ public OutputStream apply(OutputStream stream) {
   @After
   public void tearDown() {
     Utils.deleteRecursively(tempDir);
+    // This call is a workaround for SPARK-7660, a snappy-java bug which is exposed by this test
+    // suite. Clearing the cached buffer allocator's pool of reusable buffers masks this bug,
+    // preventing a test failure in JavaAPISuite that would otherwise occur. The underlying bug
+    // needs to be fixed, but in the meantime this workaround avoids spurious Jenkins failures.
+    synchronized (CachedBufferAllocator.class) {
+      CachedBufferAllocator.queueTable.clear();
+    }
     final long leakedMemory = taskMemoryManager.cleanUpAllAllocatedMemory();
     if (leakedMemory != 0) {
       fail("Test leaked " + leakedMemory + " bytes of managed memory");

From daf4ae72fe01b6d9631bfbd061b3846bdf668dfa Mon Sep 17 00:00:00 2001
From: Kan Zhang <kzhang@apache.org>
Date: Thu, 14 May 2015 23:50:50 -0700
Subject: [PATCH 5/7] [CORE] Remove unreachable Heartbeat message from Worker

It doesn't look to me Heartbeat is sent to Worker from anyone.

Author: Kan Zhang <kzhang@apache.org>

Closes #6163 from kanzhang/deadwood and squashes the following commits:

56be118 [Kan Zhang] [core] Remove unreachable Heartbeat message from Worker
---
 .../src/main/scala/org/apache/spark/deploy/worker/Worker.scala | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index 8f3cc54051048..c8df024dda355 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -324,9 +324,6 @@ private[worker] class Worker(
         map(e => new ExecutorDescription(e.appId, e.execId, e.cores, e.state))
       sender ! WorkerSchedulerStateResponse(workerId, execs.toList, drivers.keys.toSeq)
 
-    case Heartbeat =>
-      logInfo(s"Received heartbeat from driver ${sender.path}")
-
     case RegisterWorkerFailed(message) =>
       if (!registered) {
         logError("Worker registration failed: " + message)

From cf842d42a70398671c4bc5ebfa70f6fdb8c57c7f Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 14 May 2015 23:51:41 -0700
Subject: [PATCH 6/7] [SPARK-7650] [STREAMING] [WEBUI] Move streaming css and
 js files to the streaming project

cc tdas

Author: zsxwing <zsxwing@gmail.com>

Closes #6160 from zsxwing/SPARK-7650 and squashes the following commits:

fe6ae15 [zsxwing] Fix the import order
a4ffd99 [zsxwing] Merge branch 'master' into SPARK-7650
dc402b6 [zsxwing] Move streaming css and js files to the streaming project
---
 core/src/main/scala/org/apache/spark/ui/WebUI.scala  |  2 +-
 .../spark/streaming}/ui/static/streaming-page.css    |  0
 .../spark/streaming}/ui/static/streaming-page.js     |  0
 .../apache/spark/streaming/ui/StreamingPage.scala    |  4 ++--
 .../org/apache/spark/streaming/ui/StreamingTab.scala | 12 +++++++++++-
 5 files changed, 14 insertions(+), 4 deletions(-)
 rename {core/src/main/resources/org/apache/spark => streaming/src/main/resources/org/apache/spark/streaming}/ui/static/streaming-page.css (100%)
 rename {core/src/main/resources/org/apache/spark => streaming/src/main/resources/org/apache/spark/streaming}/ui/static/streaming-page.js (100%)

diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
index 384f2ad26e281..1df9cd0fa18b4 100644
--- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
@@ -94,7 +94,7 @@ private[spark] abstract class WebUI(
   }
 
   /** Detach a handler from this UI. */
-  protected def detachHandler(handler: ServletContextHandler) {
+  def detachHandler(handler: ServletContextHandler) {
     handlers -= handler
     serverInfo.foreach { info =>
       info.rootHandler.removeHandler(handler)
diff --git a/core/src/main/resources/org/apache/spark/ui/static/streaming-page.css b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
similarity index 100%
rename from core/src/main/resources/org/apache/spark/ui/static/streaming-page.css
rename to streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css
diff --git a/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js
similarity index 100%
rename from core/src/main/resources/org/apache/spark/ui/static/streaming-page.js
rename to streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala
index 070564aa10633..4ee7a486e370b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala
@@ -166,8 +166,8 @@ private[ui] class StreamingPage(parent: StreamingTab)
   private def generateLoadResources(): Seq[Node] = {
     // scalastyle:off
     <script src={SparkUIUtils.prependBaseUri("/static/d3.min.js")}></script>
-      <link rel="stylesheet" href={SparkUIUtils.prependBaseUri("/static/streaming-page.css")} type="text/css"/>
-      <script src={SparkUIUtils.prependBaseUri("/static/streaming-page.js")}></script>
+      <link rel="stylesheet" href={SparkUIUtils.prependBaseUri("/static/streaming/streaming-page.css")} type="text/css"/>
+      <script src={SparkUIUtils.prependBaseUri("/static/streaming/streaming-page.js")}></script>
     // scalastyle:on
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
index f307b54bb9630..e0c0f57212f55 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.streaming.ui
 
+import org.eclipse.jetty.servlet.ServletContextHandler
+
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.ui.{SparkUI, SparkUITab}
+import org.apache.spark.ui.{JettyUtils, SparkUI, SparkUITab}
 
 import StreamingTab._
 
@@ -30,6 +32,8 @@ import StreamingTab._
 private[spark] class StreamingTab(val ssc: StreamingContext)
   extends SparkUITab(getSparkUI(ssc), "streaming") with Logging {
 
+  private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static"
+
   val parent = getSparkUI(ssc)
   val listener = ssc.progressListener
 
@@ -38,12 +42,18 @@ private[spark] class StreamingTab(val ssc: StreamingContext)
   attachPage(new StreamingPage(this))
   attachPage(new BatchPage(this))
 
+  var staticHandler: ServletContextHandler = null
+
   def attach() {
     getSparkUI(ssc).attachTab(this)
+    staticHandler = JettyUtils.createStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
+    getSparkUI(ssc).attachHandler(staticHandler)
   }
 
   def detach() {
     getSparkUI(ssc).detachTab(this)
+    getSparkUI(ssc).detachHandler(staticHandler)
+    staticHandler = null
   }
 }
 

From 94761485b207fa1f12a8410a68920300d851bf61 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 15 May 2015 00:18:39 -0700
Subject: [PATCH 7/7] [SPARK-6258] [MLLIB] GaussianMixture Python API parity
 check

Implement Python API for major disparities of GaussianMixture cluster algorithm between Scala & Python
```scala
GaussianMixture
    setInitialModel
GaussianMixtureModel
    k
```

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6087 from yanboliang/spark-6258 and squashes the following commits:

b3af21c [Yanbo Liang] fix typo
2b645c1 [Yanbo Liang] fix doc
638b4b7 [Yanbo Liang] address comments
b5bcade [Yanbo Liang] GaussianMixture Python API parity check
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 24 +++++--
 .../clustering/GaussianMixtureModel.scala     |  9 ++-
 python/pyspark/mllib/clustering.py            | 67 +++++++++++++++----
 3 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index f4c477596557f..2fa54df6fc2b2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -345,28 +345,40 @@ private[python] class PythonMLLibAPI extends Serializable {
    * Returns a list containing weights, mean and covariance of each mixture component.
    */
   def trainGaussianMixture(
-      data: JavaRDD[Vector], 
-      k: Int, 
-      convergenceTol: Double, 
+      data: JavaRDD[Vector],
+      k: Int,
+      convergenceTol: Double,
       maxIterations: Int,
-      seed: java.lang.Long): JList[Object] = {
+      seed: java.lang.Long,
+      initialModelWeights: java.util.ArrayList[Double],
+      initialModelMu: java.util.ArrayList[Vector],
+      initialModelSigma: java.util.ArrayList[Matrix]): JList[Object] = {
     val gmmAlg = new GaussianMixture()
       .setK(k)
       .setConvergenceTol(convergenceTol)
       .setMaxIterations(maxIterations)
 
+    if (initialModelWeights != null && initialModelMu != null && initialModelSigma != null) {
+      val gaussians = initialModelMu.asScala.toSeq.zip(initialModelSigma.asScala.toSeq).map {
+        case (x, y) => new MultivariateGaussian(x.asInstanceOf[Vector], y.asInstanceOf[Matrix])
+      }
+      val initialModel = new GaussianMixtureModel(
+        initialModelWeights.asScala.toArray, gaussians.toArray)
+      gmmAlg.setInitialModel(initialModel)
+    }
+
     if (seed != null) gmmAlg.setSeed(seed)
 
     try {
       val model = gmmAlg.run(data.rdd.persist(StorageLevel.MEMORY_AND_DISK))
       var wt = ArrayBuffer.empty[Double]
-      var mu = ArrayBuffer.empty[Vector]      
+      var mu = ArrayBuffer.empty[Vector]
       var sigma = ArrayBuffer.empty[Matrix]
       for (i <- 0 until model.k) {
           wt += model.weights(i)
           mu += model.gaussians(i).mu
           sigma += model.gaussians(i).sigma
-      }    
+      }
       List(Vectors.dense(wt.toArray), mu.toArray, sigma.toArray).map(_.asInstanceOf[Object]).asJava
     } finally {
       data.rdd.unpersist(blocking = false)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index ec65a3da689de..c22862c130e77 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -38,11 +38,10 @@ import org.apache.spark.sql.{SQLContext, Row}
  * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are 
  * the respective mean and covariance for each Gaussian distribution i=1..k. 
  * 
- * @param weight Weights for each Gaussian distribution in the mixture, where weight(i) is
- *               the weight for Gaussian i, and weight.sum == 1
- * @param mu Means for each Gaussian in the mixture, where mu(i) is the mean for Gaussian i
- * @param sigma Covariance maxtrix for each Gaussian in the mixture, where sigma(i) is the
- *              covariance matrix for Gaussian i
+ * @param weights Weights for each Gaussian distribution in the mixture, where weights(i) is
+ *                the weight for Gaussian i, and weights.sum == 1
+ * @param gaussians Array of MultivariateGaussian where gaussians(i) represents
+ *                  the Multivariate Gaussian (Normal) Distribution for Gaussian i
  */
 @Experimental
 class GaussianMixtureModel(
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 04e67158514f5..a53333dae6a82 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -142,6 +142,7 @@ class GaussianMixtureModel(object):
 
     """A clustering model derived from the Gaussian Mixture Model method.
 
+    >>> from pyspark.mllib.linalg import Vectors, DenseMatrix
     >>> clusterdata_1 =  sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
     ...                                         0.9,0.8,0.75,0.935,
     ...                                        -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2))
@@ -154,11 +155,12 @@ class GaussianMixtureModel(object):
     True
     >>> labels[4]==labels[5]
     True
-    >>> clusterdata_2 =  sc.parallelize(array([-5.1971, -2.5359, -3.8220,
-    ...                                        -5.2211, -5.0602,  4.7118,
-    ...                                         6.8989, 3.4592,  4.6322,
-    ...                                         5.7048,  4.6567, 5.5026,
-    ...                                         4.5605,  5.2043,  6.2734]).reshape(5, 3))
+    >>> data =  array([-5.1971, -2.5359, -3.8220,
+    ...                -5.2211, -5.0602,  4.7118,
+    ...                 6.8989, 3.4592,  4.6322,
+    ...                 5.7048,  4.6567, 5.5026,
+    ...                 4.5605,  5.2043,  6.2734])
+    >>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
     >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
     ...                               maxIterations=150, seed=10)
     >>> labels = model.predict(clusterdata_2).collect()
@@ -166,12 +168,38 @@ class GaussianMixtureModel(object):
     True
     >>> labels[3]==labels[4]
     True
+    >>> clusterdata_3 = sc.parallelize(data.reshape(15, 1))
+    >>> im = GaussianMixtureModel([0.5, 0.5],
+    ...      [MultivariateGaussian(Vectors.dense([-1.0]), DenseMatrix(1, 1, [1.0])),
+    ...      MultivariateGaussian(Vectors.dense([1.0]), DenseMatrix(1, 1, [1.0]))])
+    >>> model = GaussianMixture.train(clusterdata_3, 2, initialModel=im)
     """
 
     def __init__(self, weights, gaussians):
-        self.weights = weights
-        self.gaussians = gaussians
-        self.k = len(self.weights)
+        self._weights = weights
+        self._gaussians = gaussians
+        self._k = len(self._weights)
+
+    @property
+    def weights(self):
+        """
+        Weights for each Gaussian distribution in the mixture, where weights[i] is
+        the weight for Gaussian i, and weights.sum == 1.
+        """
+        return self._weights
+
+    @property
+    def gaussians(self):
+        """
+        Array of MultivariateGaussian where gaussians[i] represents
+        the Multivariate Gaussian (Normal) Distribution for Gaussian i.
+        """
+        return self._gaussians
+
+    @property
+    def k(self):
+        """Number of gaussians in mixture."""
+        return self._k
 
     def predict(self, x):
         """
@@ -193,9 +221,9 @@ def predictSoft(self, x):
         :return:     membership_matrix. RDD of array of double values.
         """
         if isinstance(x, RDD):
-            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
+            means, sigmas = zip(*[(g.mu, g.sigma) for g in self._gaussians])
             membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
-                                              _convert_to_vector(self.weights), means, sigmas)
+                                              _convert_to_vector(self._weights), means, sigmas)
             return membership_matrix.map(lambda x: pyarray.array('d', x))
 
 
@@ -208,13 +236,24 @@ class GaussianMixture(object):
     :param convergenceTol:  Threshold value to check the convergence criteria. Defaults to 1e-3
     :param maxIterations:   Number of iterations. Default to 100
     :param seed:            Random Seed
+    :param initialModel:    GaussianMixtureModel for initializing learning
     """
     @classmethod
-    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None):
+    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None):
         """Train a Gaussian Mixture clustering model."""
-        weight, mu, sigma = callMLlibFunc("trainGaussianMixture",
-                                          rdd.map(_convert_to_vector), k,
-                                          convergenceTol, maxIterations, seed)
+        initialModelWeights = None
+        initialModelMu = None
+        initialModelSigma = None
+        if initialModel is not None:
+            if initialModel.k != k:
+                raise Exception("Mismatched cluster count, initialModel.k = %s, however k = %s"
+                                % (initialModel.k, k))
+            initialModelWeights = initialModel.weights
+            initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]
+            initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]
+        weight, mu, sigma = callMLlibFunc("trainGaussianMixture", rdd.map(_convert_to_vector), k,
+                                          convergenceTol, maxIterations, seed, initialModelWeights,
+                                          initialModelMu, initialModelSigma)
         mvg_obj = [MultivariateGaussian(mu[i], sigma[i]) for i in range(k)]
         return GaussianMixtureModel(weight, mvg_obj)