diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 1c8869202f677..b56dddcb9f2ef 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3745,3 +3745,27 @@ setMethod("hint",
             jdf <- callJMethod(x@sdf, "hint", name, parameters)
             dataFrame(jdf)
           })
+
+#' alias
+#'
+#' @aliases alias,SparkDataFrame-method
+#' @family SparkDataFrame functions
+#' @rdname alias
+#' @name alias
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- alias(createDataFrame(mtcars), "mtcars")
+#' avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg")
+#'
+#' head(select(df, column("mtcars.mpg")))
+#' head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl")))
+#' }
+#' @note alias(SparkDataFrame) since 2.3.0
+setMethod("alias",
+          signature(object = "SparkDataFrame"),
+          function(object, data) {
+            stopifnot(is.character(data))
+            sdf <- callJMethod(object@sdf, "alias", data)
+            dataFrame(sdf)
+          })
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 147ee4b6887b9..574078012adad 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -130,19 +130,19 @@ createMethods <- function() {
 
 createMethods()
 
-#' alias
-#'
-#' Set a new name for a column
-#'
-#' @param object Column to rename
-#' @param data new name to use
-#'
 #' @rdname alias
 #' @name alias
 #' @aliases alias,Column-method
 #' @family colum_func
 #' @export
-#' @note alias since 1.4.0
+#' @examples \dontrun{
+#' df <- createDataFrame(iris)
+#'
+#' head(select(
+#'   df, alias(df$Sepal_Length, "slength"), alias(df$Petal_Length, "plength")
+#' ))
+#' }
+#' @note alias(Column) since 1.4.0
 setMethod("alias",
           signature(object = "Column"),
           function(object, data) {
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e835ef3e4f40d..3c84bf8a4803e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -387,6 +387,17 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("agg", function (x, ...) { standardGeneric("agg") })
 
+#' alias
+#'
+#' Returns a new SparkDataFrame or a Column with an alias set. Equivalent to SQL "AS" keyword.
+#'
+#' @name alias
+#' @rdname alias
+#' @param object x a SparkDataFrame or a Column
+#' @param data new name to use
+#' @return a SparkDataFrame or a Column
+NULL
+
 #' @rdname arrange
 #' @export
 setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 47cc34a6c5b75..f517ce6713133 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -96,6 +96,10 @@ mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}
 mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
 writeLines(mockLinesMapType, mapTypeJsonPath)
 
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
 test_that("calling sparkRSQL.init returns existing SQL context", {
   skip_on_cran()
 
@@ -1223,6 +1227,16 @@ test_that("select with column", {
   expect_equal(columns(df4), c("name", "age"))
   expect_equal(count(df4), 3)
 
+  # Test select with alias
+  df5 <- alias(df, "table")
+
+  expect_equal(columns(select(df5, column("table.name"))), "name")
+  expect_equal(columns(select(df5, "table.name")), "name")
+
+  # Test that stats::alias is not masked
+  expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
+
+
   expect_error(select(df, c("name", "age"), "name"),
                 "To select multiple columns, use a character vector or list for col")
 })
@@ -3387,7 +3401,7 @@ compare_list <- function(list1, list2) {
 
 # This should always be the **very last test** in this test file.
 test_that("No extra files are created in SPARK_HOME by starting session and making calls", {
-  skip_on_cran()
+  skip_on_cran() # skip because when run from R CMD check SPARK_HOME is not the current directory
 
   # Check that it is not creating any extra file.
   # Does not check the tempdir which would be cleaned up after.
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index d38ec4f1b6f37..49f4ab8f146a8 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun
 head(carsDF)
 ```
 
-Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`.
+Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`.
 ```{r}
 carsSubDF <- select(carsDF, "model", "mpg", "hp")
 carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200)
@@ -379,7 +379,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema)
 head(collect(out))
 ```
 
-Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 out <- dapplyCollect(
@@ -405,7 +405,7 @@ result <- gapply(
 head(arrange(result, "max_mpg", decreasing = TRUE))
 ```
 
-Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 result <- gapplyCollect(
@@ -458,20 +458,20 @@ options(ops)
 
 
 ### SQL Queries
-A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
+A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
 
 ```{r}
 people <- read.df(paste0(sparkR.conf("spark.home"),
                          "/examples/src/main/resources/people.json"), "json")
 ```
 
-Register this SparkDataFrame as a temporary view.
+Register this `SparkDataFrame` as a temporary view.
 
 ```{r}
 createOrReplaceTempView(people, "people")
 ```
 
-SQL statements can be run by using the sql method.
+SQL statements can be run using the sql method.
 ```{r}
 teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 head(teenagers)
@@ -780,7 +780,7 @@ head(predict(isoregModel, newDF))
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `longley` dataset to train a gradient-boosted tree and make predictions:
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
@@ -820,7 +820,7 @@ head(select(fitted, "Class", "prediction"))
 
 `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model.
 
-We use a simulated example to demostrate the usage.
+We use a simulated example to demonstrate the usage.
 ```{r}
 X1 <- data.frame(V1 = rnorm(4), V2 = rnorm(4))
 X2 <- data.frame(V1 = rnorm(6, 3), V2 = rnorm(6, 4))
@@ -851,9 +851,9 @@ head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20
 
 * Topics and documents both exist in a feature space, where feature vectors are vectors of word counts (bag of words).
 
-* Rather than estimating a clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
+* Rather than clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
 
-To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two type options for the column:
+To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two options for the column:
 
 * character string: This can be a string of the whole document. It will be parsed automatically. Additional stop words can be added in `customizedStopWords`.
 
@@ -901,7 +901,7 @@ perplexity
 
 `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).
 
-There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help file.
+There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file.
 
 ```{r, eval=FALSE}
 ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
@@ -981,7 +981,7 @@ testSummary
 
 
 ### Model Persistence
-The following example shows how to save/load an ML model by SparkR.
+The following example shows how to save/load an ML model in SparkR.
 ```{r}
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
@@ -1079,19 +1079,19 @@ There are three main object classes in SparkR you may be working with.
     + `sdf` stores a reference to the corresponding Spark Dataset in the Spark JVM backend.
     + `env` saves the meta-information of the object such as `isCached`.
 
-It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
+    It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
 
-* `Column`: an S4 class representing column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding Column object in the Spark JVM backend.
+* `Column`: an S4 class representing a column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding `Column` object in the Spark JVM backend.
 
-It can be obtained from a `SparkDataFrame` by `$` operator, `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
+    It can be obtained from a `SparkDataFrame` by `$` operator, e.g., `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
 
-* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a RelationalGroupedDataset object in the backend.
+* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a `RelationalGroupedDataset` object in the backend.
 
-This is often an intermediate object with group information and followed up by aggregation operations.
+    This is often an intermediate object with group information and followed up by aggregation operations.
 
 ### Architecture
 
-A complete description of architecture can be seen in reference, in particular the paper *SparkR: Scaling R Programs with Spark*.
+A complete description of architecture can be seen in the references, in particular the paper *SparkR: Scaling R Programs with Spark*.
 
 Under the hood of SparkR is Spark SQL engine. This avoids the overheads of running interpreted R code, and the optimized SQL execution engine in Spark uses structural information about data and computation flow to perform a bunch of optimizations to speed up the computation.
 
diff --git a/appveyor.yml b/appveyor.yml
index bbb27589cad09..58c2e98289e96 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -48,12 +48,14 @@ install:
 build_script:
   - cmd: mvn -DskipTests -Psparkr -Phive -Phive-thriftserver package
 
+environment:
+  NOT_CRAN: true
+
 test_script:
-  - cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
+  - cmd: .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configuration=file:///%CD:\=/%/R/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 
 notifications:
   - provider: Email
     on_build_success: false
     on_build_failure: false
     on_build_status_changed: false
-
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 742a4a1531e71..464af16e46f6e 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -226,5 +226,19 @@
         <parquet.deps.scope>provided</parquet.deps.scope>
       </properties>
     </profile>
+
+    <!--
+     Pull in spark-hadoop-cloud and its associated JARs,
+    -->
+    <profile>
+      <id>hadoop-cloud</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-hadoop-cloud_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
   </profiles>
 </project>
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
new file mode 100644
index 0000000000000..ab296c5b5fb9b
--- /dev/null
+++ b/common/kvstore/pom.xml
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>spark-parent_2.11</artifactId>
+    <version>2.3.0-SNAPSHOT</version>
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>spark-kvstore_2.11</artifactId>
+  <packaging>jar</packaging>
+  <name>Spark Project Local DB</name>
+  <url>http://spark.apache.org/</url>
+  <properties>
+    <sbt.project.name>kvstore</sbt.project.name>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.fusesource.leveldbjni</groupId>
+      <artifactId>leveldbjni-all</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>log4j</groupId>
+      <artifactId>log4j</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>io.dropwizard.metrics</groupId>
+      <artifactId>metrics-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+  </build>
+</project>
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVIndex.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVIndex.java
new file mode 100644
index 0000000000000..bf5e4a66e510f
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVIndex.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Tags a field to be indexed when storing an object.
+ *
+ * <p>
+ * Types are required to have a natural index that uniquely identifies instances in the store.
+ * The default value of the annotation identifies the natural index for the type.
+ * </p>
+ *
+ * <p>
+ * Indexes allow for more efficient sorting of data read from the store. By annotating a field or
+ * "getter" method with this annotation, an index will be created that will provide sorting based on
+ * the string value of that field.
+ * </p>
+ *
+ * <p>
+ * Note that creating indices means more space will be needed, and maintenance operations like
+ * updating or deleting a value will become more expensive.
+ * </p>
+ *
+ * <p>
+ * Indices are restricted to String, integral types (byte, short, int, long, boolean), and arrays
+ * of those values.
+ * </p>
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD, ElementType.METHOD})
+public @interface KVIndex {
+
+  public static final String NATURAL_INDEX_NAME = "__main__";
+
+  /**
+   * The name of the index to be created for the annotated entity. Must be unique within
+   * the class. Index names are not allowed to start with an underscore (that's reserved for
+   * internal use). The default value is the natural index name (which is always a copy index
+   * regardless of the annotation's values).
+   */
+  String value() default NATURAL_INDEX_NAME;
+
+  /**
+   * Whether to copy the instance's data to the index, instead of just storing a pointer to the
+   * data. The default behavior is to just store a reference; that saves disk space but is slower
+   * to read, since there's a level of indirection.
+   */
+  boolean copy() default false;
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStore.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStore.java
new file mode 100644
index 0000000000000..667fccccd5428
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStore.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.Closeable;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Abstraction for a local key/value store for storing app data.
+ *
+ * <p>
+ * There are two main features provided by the implementations of this interface:
+ * </p>
+ *
+ * <h3>Serialization</h3>
+ *
+ * <p>
+ * Data will be serialized to and deserialized from the underlying data store using a
+ * {@link KVStoreSerializer}, which can be customized by the application. The serializer is
+ * based on Jackson, so it supports all the Jackson annotations for controlling the serialization
+ * of app-defined types.
+ * </p>
+ *
+ * <p>
+ * Data is also automatically compressed to save disk space.
+ * </p>
+ *
+ * <h3>Automatic Key Management</h3>
+ *
+ * <p>
+ * When using the built-in key management, the implementation will automatically create unique
+ * keys for each type written to the store. Keys are based on the type name, and always start
+ * with the "+" prefix character (so that it's easy to use both manual and automatic key
+ * management APIs without conflicts).
+ * </p>
+ *
+ * <p>
+ * Another feature of automatic key management is indexing; by annotating fields or methods of
+ * objects written to the store with {@link KVIndex}, indices are created to sort the data
+ * by the values of those properties. This makes it possible to provide sorting without having
+ * to load all instances of those types from the store.
+ * </p>
+ *
+ * <p>
+ * KVStore instances are thread-safe for both reads and writes.
+ * </p>
+ */
+public interface KVStore extends Closeable {
+
+  /**
+   * Returns app-specific metadata from the store, or null if it's not currently set.
+   *
+   * <p>
+   * The metadata type is application-specific. This is a convenience method so that applications
+   * don't need to define their own keys for this information.
+   * </p>
+   */
+  <T> T getMetadata(Class<T> klass) throws Exception;
+
+  /**
+   * Writes the given value in the store metadata key.
+   */
+  void setMetadata(Object value) throws Exception;
+
+  /**
+   * Read a specific instance of an object.
+   */
+  <T> T read(Class<T> klass, Object naturalKey) throws Exception;
+
+  /**
+   * Writes the given object to the store, including indexed fields. Indices are updated based
+   * on the annotated fields of the object's class.
+   *
+   * <p>
+   * Writes may be slower when the object already exists in the store, since it will involve
+   * updating existing indices.
+   * </p>
+   *
+   * @param value The object to write.
+   */
+  void write(Object value) throws Exception;
+
+  /**
+   * Removes an object and all data related to it, like index entries, from the store.
+   *
+   * @param type The object's type.
+   * @param naturalKey The object's "natural key", which uniquely identifies it.
+   */
+  void delete(Class<?> type, Object naturalKey) throws Exception;
+
+  /**
+   * Returns a configurable view for iterating over entities of the given type.
+   */
+  <T> KVStoreView<T> view(Class<T> type) throws Exception;
+
+  /**
+   * Returns the number of items of the given type currently in the store.
+   */
+  long count(Class<?> type) throws Exception;
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreIterator.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreIterator.java
new file mode 100644
index 0000000000000..3efdec9ed32be
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreIterator.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * An iterator for KVStore.
+ *
+ * <p>
+ * Iterators may keep references to resources that need to be closed. It's recommended that users
+ * explicitly close iterators after they're used.
+ * </p>
+ */
+public interface KVStoreIterator<T> extends Iterator<T>, AutoCloseable {
+
+  /**
+   * Retrieve multiple elements from the store.
+   *
+   * @param max Maximum number of elements to retrieve.
+   */
+  List<T> next(int max);
+
+  /**
+   * Skip in the iterator.
+   *
+   * @return Whether there are items left after skipping.
+   */
+  boolean skip(long n);
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreSerializer.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreSerializer.java
new file mode 100644
index 0000000000000..b84ec91cf67a0
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreSerializer.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Serializer used to translate between app-defined types and the LevelDB store.
+ *
+ * <p>
+ * The serializer is based on Jackson, so values are written as JSON. It also allows "naked strings"
+ * and integers to be written as values directly, which will be written as UTF-8 strings.
+ * </p>
+ */
+public class KVStoreSerializer {
+
+  /**
+   * Object mapper used to process app-specific types. If an application requires a specific
+   * configuration of the mapper, it can subclass this serializer and add custom configuration
+   * to this object.
+   */
+  protected final ObjectMapper mapper;
+
+  public KVStoreSerializer() {
+    this.mapper = new ObjectMapper();
+  }
+
+  public final byte[] serialize(Object o) throws Exception {
+    if (o instanceof String) {
+      return ((String) o).getBytes(UTF_8);
+    } else {
+      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+      GZIPOutputStream out = new GZIPOutputStream(bytes);
+      try {
+        mapper.writeValue(out, o);
+      } finally {
+        out.close();
+      }
+      return bytes.toByteArray();
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  public final <T> T deserialize(byte[] data, Class<T> klass) throws Exception {
+    if (klass.equals(String.class)) {
+      return (T) new String(data, UTF_8);
+    } else {
+      GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(data));
+      try {
+        return mapper.readValue(in, klass);
+      } finally {
+        in.close();
+      }
+    }
+  }
+
+  final byte[] serialize(long value) {
+    return String.valueOf(value).getBytes(UTF_8);
+  }
+
+  final long deserializeLong(byte[] data) {
+    return Long.parseLong(new String(data, UTF_8));
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreView.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreView.java
new file mode 100644
index 0000000000000..65edc0149b438
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVStoreView.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A configurable view that allows iterating over values in a {@link KVStore}.
+ *
+ * <p>
+ * The different methods can be used to configure the behavior of the iterator. Calling the same
+ * method multiple times is allowed; the most recent value will be used.
+ * </p>
+ *
+ * <p>
+ * The iterators returns by this view are of type {@link KVStoreIterator}; they auto-close
+ * when used in a for loop that exhausts their contents, but when used manually, they need
+ * to be closed explicitly unless all elements are read.
+ * </p>
+ */
+public abstract class KVStoreView<T> implements Iterable<T> {
+
+  final Class<T> type;
+
+  boolean ascending = true;
+  String index = KVIndex.NATURAL_INDEX_NAME;
+  Object first = null;
+  Object last = null;
+  long skip = 0L;
+  long max = Long.MAX_VALUE;
+
+  public KVStoreView(Class<T> type) {
+    this.type = type;
+  }
+
+  /**
+   * Reverses the order of iteration. By default, iterates in ascending order.
+   */
+  public KVStoreView<T> reverse() {
+    ascending = !ascending;
+    return this;
+  }
+
+  /**
+   * Iterates according to the given index.
+   */
+  public KVStoreView<T> index(String name) {
+    this.index = Preconditions.checkNotNull(name);
+    return this;
+  }
+
+  /**
+   * Iterates starting at the given value of the chosen index.
+   */
+  public KVStoreView<T> first(Object value) {
+    this.first = value;
+    return this;
+  }
+
+  /**
+   * Stops iteration at the given value of the chosen index.
+   */
+  public KVStoreView<T> last(Object value) {
+    this.last = value;
+    return this;
+  }
+
+  /**
+   * Stops iteration after a number of elements has been retrieved.
+   */
+  public KVStoreView<T> max(long max) {
+    Preconditions.checkArgument(max > 0L, "max must be positive.");
+    this.max = max;
+    return this;
+  }
+
+  /**
+   * Skips a number of elements at the start of iteration. Skipped elements are not accounted
+   * when using {@link #max(long)}.
+   */
+  public KVStoreView<T> skip(long n) {
+    this.skip = n;
+    return this;
+  }
+
+  /**
+   * Returns an iterator for the current configuration.
+   */
+  public KVStoreIterator<T> closeableIterator() throws Exception {
+    return (KVStoreIterator<T>) iterator();
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/KVTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVTypeInfo.java
new file mode 100644
index 0000000000000..1a0bee958d482
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/KVTypeInfo.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.Stream;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Wrapper around types managed in a KVStore, providing easy access to their indexed fields.
+ */
+public class KVTypeInfo {
+
+  private final Class<?> type;
+  private final Collection<KVIndex> indices;
+  private final Map<String, Accessor> accessors;
+
+  public KVTypeInfo(Class<?> type) throws Exception {
+    this.type = type;
+    this.indices = new ArrayList<>();
+    this.accessors = new HashMap<>();
+
+    for (Field f : type.getFields()) {
+      KVIndex idx = f.getAnnotation(KVIndex.class);
+      if (idx != null) {
+        checkIndex(idx);
+        indices.add(idx);
+        accessors.put(idx.value(), new FieldAccessor(f));
+      }
+    }
+
+    for (Method m : type.getMethods()) {
+      KVIndex idx = m.getAnnotation(KVIndex.class);
+      if (idx != null) {
+        checkIndex(idx);
+        Preconditions.checkArgument(m.getParameterTypes().length == 0,
+          "Annotated method %s::%s should not have any parameters.", type.getName(), m.getName());
+        indices.add(idx);
+        accessors.put(idx.value(), new MethodAccessor(m));
+      }
+    }
+
+    Preconditions.checkArgument(accessors.containsKey(KVIndex.NATURAL_INDEX_NAME),
+        "No natural index defined for type %s.", type.getName());
+  }
+
+  private void checkIndex(KVIndex idx) {
+    Preconditions.checkArgument(idx.value() != null && !idx.value().isEmpty(),
+      "No name provided for index in type %s.", type.getName());
+    Preconditions.checkArgument(
+      !idx.value().startsWith("_") || idx.value().equals(KVIndex.NATURAL_INDEX_NAME),
+      "Index name %s (in type %s) is not allowed.", idx.value(), type.getName());
+    Preconditions.checkArgument(!indices.contains(idx.value()),
+      "Duplicate index %s for type %s.", idx.value(), type.getName());
+  }
+
+  public Class<?> getType() {
+    return type;
+  }
+
+  public Object getIndexValue(String indexName, Object instance) throws Exception {
+    return getAccessor(indexName).get(instance);
+  }
+
+  public Stream<KVIndex> indices() {
+    return indices.stream();
+  }
+
+  Accessor getAccessor(String indexName) {
+    Accessor a = accessors.get(indexName);
+    Preconditions.checkArgument(a != null, "No index %s.", indexName);
+    return a;
+  }
+
+  /**
+   * Abstracts the difference between invoking a Field and a Method.
+   */
+  interface Accessor {
+
+    Object get(Object instance) throws Exception;
+
+  }
+
+  private class FieldAccessor implements Accessor {
+
+    private final Field field;
+
+    FieldAccessor(Field field) {
+      this.field = field;
+    }
+
+    @Override
+    public Object get(Object instance) throws Exception {
+      return field.get(instance);
+    }
+
+  }
+
+  private class MethodAccessor implements Accessor {
+
+    private final Method method;
+
+    MethodAccessor(Method method) {
+      this.method = method;
+    }
+
+    @Override
+    public Object get(Object instance) throws Exception {
+      return method.invoke(instance);
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDB.java
new file mode 100644
index 0000000000000..b40c7950d1d11
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDB.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicReference;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Throwables;
+import org.fusesource.leveldbjni.JniDBFactory;
+import org.iq80.leveldb.DB;
+import org.iq80.leveldb.Options;
+import org.iq80.leveldb.WriteBatch;
+import org.iq80.leveldb.WriteOptions;
+
+/**
+ * Implementation of KVStore that uses LevelDB as the underlying data store.
+ */
+public class LevelDB implements KVStore {
+
+  @VisibleForTesting
+  static final long STORE_VERSION = 1L;
+
+  @VisibleForTesting
+  static final byte[] STORE_VERSION_KEY = "__version__".getBytes(UTF_8);
+
+  /** DB key where app metadata is stored. */
+  private static final byte[] METADATA_KEY = "__meta__".getBytes(UTF_8);
+
+  /** DB key where type aliases are stored. */
+  private static final byte[] TYPE_ALIASES_KEY = "__types__".getBytes(UTF_8);
+
+  final AtomicReference<DB> _db;
+  final KVStoreSerializer serializer;
+
+  private final ConcurrentMap<String, byte[]> typeAliases;
+  private final ConcurrentMap<Class<?>, LevelDBTypeInfo> types;
+
+  public LevelDB(File path) throws Exception {
+    this(path, new KVStoreSerializer());
+  }
+
+  public LevelDB(File path, KVStoreSerializer serializer) throws Exception {
+    this.serializer = serializer;
+    this.types = new ConcurrentHashMap<>();
+
+    Options options = new Options();
+    options.createIfMissing(!path.exists());
+    this._db = new AtomicReference<>(JniDBFactory.factory.open(path, options));
+
+    byte[] versionData = db().get(STORE_VERSION_KEY);
+    if (versionData != null) {
+      long version = serializer.deserializeLong(versionData);
+      if (version != STORE_VERSION) {
+        throw new UnsupportedStoreVersionException();
+      }
+    } else {
+      db().put(STORE_VERSION_KEY, serializer.serialize(STORE_VERSION));
+    }
+
+    Map<String, byte[]> aliases;
+    try {
+      aliases = get(TYPE_ALIASES_KEY, TypeAliases.class).aliases;
+    } catch (NoSuchElementException e) {
+      aliases = new HashMap<>();
+    }
+    typeAliases = new ConcurrentHashMap<>(aliases);
+  }
+
+  @Override
+  public <T> T getMetadata(Class<T> klass) throws Exception {
+    try {
+      return get(METADATA_KEY, klass);
+    } catch (NoSuchElementException nsee) {
+      return null;
+    }
+  }
+
+  @Override
+  public void setMetadata(Object value) throws Exception {
+    if (value != null) {
+      put(METADATA_KEY, value);
+    } else {
+      db().delete(METADATA_KEY);
+    }
+  }
+
+  <T> T get(byte[] key, Class<T> klass) throws Exception {
+    byte[] data = db().get(key);
+    if (data == null) {
+      throw new NoSuchElementException(new String(key, UTF_8));
+    }
+    return serializer.deserialize(data, klass);
+  }
+
+  private void put(byte[] key, Object value) throws Exception {
+    Preconditions.checkArgument(value != null, "Null values are not allowed.");
+    db().put(key, serializer.serialize(value));
+  }
+
+  @Override
+  public <T> T read(Class<T> klass, Object naturalKey) throws Exception {
+    Preconditions.checkArgument(naturalKey != null, "Null keys are not allowed.");
+    byte[] key = getTypeInfo(klass).naturalIndex().start(naturalKey);
+    return get(key, klass);
+  }
+
+  @Override
+  public void write(Object value) throws Exception {
+    write(value, false);
+  }
+
+  public void write(Object value, boolean sync) throws Exception {
+    Preconditions.checkArgument(value != null, "Null values are not allowed.");
+    LevelDBTypeInfo ti = getTypeInfo(value.getClass());
+
+    LevelDBWriteBatch batch = new LevelDBWriteBatch(this);
+    try {
+      byte[] data = serializer.serialize(value);
+      synchronized (ti) {
+        try {
+          Object existing = get(ti.naturalIndex().entityKey(value), value.getClass());
+          removeInstance(ti, batch, existing);
+        } catch (NoSuchElementException e) {
+          // Ignore. No previous value.
+        }
+        for (LevelDBTypeInfo.Index idx : ti.indices()) {
+          idx.add(batch, value, data);
+        }
+        batch.write(sync);
+      }
+    } finally {
+      batch.close();
+    }
+  }
+
+  @Override
+  public void delete(Class<?> type, Object naturalKey) throws Exception {
+    delete(type, naturalKey, false);
+  }
+
+  public void delete(Class<?> type, Object naturalKey, boolean sync) throws Exception {
+    Preconditions.checkArgument(naturalKey != null, "Null keys are not allowed.");
+    LevelDBWriteBatch batch = new LevelDBWriteBatch(this);
+    try {
+      LevelDBTypeInfo ti = getTypeInfo(type);
+      byte[] key = ti.naturalIndex().start(naturalKey);
+      byte[] data = db().get(key);
+      if (data != null) {
+        Object existing = serializer.deserialize(data, type);
+        synchronized (ti) {
+          removeInstance(ti, batch, existing);
+          batch.write(sync);
+        }
+      }
+    } finally {
+      batch.close();
+    }
+  }
+
+  @Override
+  public <T> KVStoreView<T> view(Class<T> type) throws Exception {
+    return new KVStoreView<T>(type) {
+      @Override
+      public Iterator<T> iterator() {
+        try {
+          return new LevelDBIterator<>(LevelDB.this, this);
+        } catch (Exception e) {
+          throw Throwables.propagate(e);
+        }
+      }
+    };
+  }
+
+  @Override
+  public long count(Class<?> type) throws Exception {
+    LevelDBTypeInfo.Index idx = getTypeInfo(type).naturalIndex();
+    return idx.getCount(idx.end());
+  }
+
+  @Override
+  public void close() throws IOException {
+    DB _db = this._db.getAndSet(null);
+    if (_db == null) {
+      return;
+    }
+
+    try {
+      _db.close();
+    } catch (IOException ioe) {
+      throw ioe;
+    } catch (Exception e) {
+      throw new IOException(e.getMessage(), e);
+    }
+  }
+
+  /** Returns metadata about indices for the given type. */
+  LevelDBTypeInfo getTypeInfo(Class<?> type) throws Exception {
+    LevelDBTypeInfo ti = types.get(type);
+    if (ti == null) {
+      LevelDBTypeInfo tmp = new LevelDBTypeInfo(this, type, getTypeAlias(type));
+      ti = types.putIfAbsent(type, tmp);
+      if (ti == null) {
+        ti = tmp;
+      }
+    }
+    return ti;
+  }
+
+  /**
+   * Try to avoid use-after close since that has the tendency of crashing the JVM. This doesn't
+   * prevent methods that retrieved the instance from using it after close, but hopefully will
+   * catch most cases; otherwise, we'll need some kind of locking.
+   */
+  DB db() {
+    DB _db = this._db.get();
+    if (_db == null) {
+      throw new IllegalStateException("DB is closed.");
+    }
+    return _db;
+  }
+
+  private void removeInstance(LevelDBTypeInfo ti, LevelDBWriteBatch batch, Object instance)
+      throws Exception {
+    for (LevelDBTypeInfo.Index idx : ti.indices()) {
+      idx.remove(batch, instance);
+    }
+  }
+
+  private byte[] getTypeAlias(Class<?> klass) throws Exception {
+    byte[] alias = typeAliases.get(klass.getName());
+    if (alias == null) {
+      synchronized (typeAliases) {
+        byte[] tmp = String.valueOf(typeAliases.size()).getBytes(UTF_8);
+        alias = typeAliases.putIfAbsent(klass.getName(), tmp);
+        if (alias == null) {
+          alias = tmp;
+          put(TYPE_ALIASES_KEY, new TypeAliases(typeAliases));
+        }
+      }
+    }
+    return alias;
+  }
+
+  /** Needs to be public for Jackson. */
+  public static class TypeAliases {
+
+    public Map<String, byte[]> aliases;
+
+    TypeAliases(Map<String, byte[]> aliases) {
+      this.aliases = aliases;
+    }
+
+    TypeAliases() {
+      this(null);
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBIterator.java
new file mode 100644
index 0000000000000..73ca8afc9eb28
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBIterator.java
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Throwables;
+import org.iq80.leveldb.DBIterator;
+
+class LevelDBIterator<T> implements KVStoreIterator<T> {
+
+  private final LevelDB db;
+  private final boolean ascending;
+  private final DBIterator it;
+  private final Class<T> type;
+  private final LevelDBTypeInfo ti;
+  private final LevelDBTypeInfo.Index index;
+  private final byte[] indexKeyPrefix;
+  private final byte[] end;
+  private final long max;
+
+  private boolean checkedNext;
+  private T next;
+  private boolean closed;
+  private long count;
+
+  LevelDBIterator(LevelDB db, KVStoreView<T> params) throws Exception {
+    this.db = db;
+    this.ascending = params.ascending;
+    this.it = db.db().iterator();
+    this.type = params.type;
+    this.ti = db.getTypeInfo(type);
+    this.index = ti.index(params.index);
+    this.indexKeyPrefix = index.keyPrefix();
+    this.max = params.max;
+
+    byte[] firstKey;
+    if (params.first != null) {
+      if (ascending) {
+        firstKey = index.start(params.first);
+      } else {
+        firstKey = index.end(params.first);
+      }
+    } else if (ascending) {
+      firstKey = index.keyPrefix();
+    } else {
+      firstKey = index.end();
+    }
+    it.seek(firstKey);
+
+    byte[] end = null;
+    if (ascending) {
+      end = params.last != null ? index.end(params.last) : index.end();
+    } else {
+      if (params.last != null) {
+        end = index.start(params.last);
+      }
+      if (it.hasNext()) {
+        // When descending, the caller may have set up the start of iteration at a non-existant
+        // entry that is guaranteed to be after the desired entry. For example, if you have a
+        // compound key (a, b) where b is a, integer, you may seek to the end of the elements that
+        // have the same "a" value by specifying Integer.MAX_VALUE for "b", and that value may not
+        // exist in the database. So need to check here whether the next value actually belongs to
+        // the set being returned by the iterator before advancing.
+        byte[] nextKey = it.peekNext().getKey();
+        if (compare(nextKey, indexKeyPrefix) <= 0) {
+          it.next();
+        }
+      }
+    }
+    this.end = end;
+
+    if (params.skip > 0) {
+      skip(params.skip);
+    }
+  }
+
+  @Override
+  public boolean hasNext() {
+    if (!checkedNext && !closed) {
+      next = loadNext();
+      checkedNext = true;
+    }
+    if (!closed && next == null) {
+      try {
+        close();
+      } catch (IOException ioe) {
+        throw Throwables.propagate(ioe);
+      }
+    }
+    return next != null;
+  }
+
+  @Override
+  public T next() {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    checkedNext = false;
+    return next;
+  }
+
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public List<T> next(int max) {
+    List<T> list = new ArrayList<>(max);
+    while (hasNext() && list.size() < max) {
+      list.add(next());
+    }
+    return list;
+  }
+
+  @Override
+  public boolean skip(long n) {
+    long skipped = 0;
+    while (skipped < n) {
+      next = null;
+      boolean hasNext = ascending ? it.hasNext() : it.hasPrev();
+      if (!hasNext) {
+        return false;
+      }
+
+      Map.Entry<byte[], byte[]> e = ascending ? it.next() : it.prev();
+      if (!isEndMarker(e.getKey())) {
+        skipped++;
+      }
+    }
+
+    return true;
+  }
+
+  @Override
+  public synchronized void close() throws IOException {
+    if (!closed) {
+      it.close();
+      closed = true;
+    }
+  }
+
+  private T loadNext() {
+    if (count >= max) {
+      return null;
+    }
+
+    try {
+      while (true) {
+        boolean hasNext = ascending ? it.hasNext() : it.hasPrev();
+        if (!hasNext) {
+          return null;
+        }
+
+        Map.Entry<byte[], byte[]> nextEntry;
+        try {
+          // Avoid races if another thread is updating the DB.
+          nextEntry = ascending ? it.next() : it.prev();
+        } catch (NoSuchElementException e) {
+          return null;
+        }
+        byte[] nextKey = nextEntry.getKey();
+
+        // If the next key is an end marker, then skip it.
+        if (isEndMarker(nextKey)) {
+          continue;
+        }
+
+        // Next key is not part of the index, stop.
+        if (!startsWith(nextKey, indexKeyPrefix)) {
+          return null;
+        }
+
+        // If there's a known end key and iteration has gone past it, stop.
+        if (end != null) {
+          int comp = compare(nextKey, end) * (ascending ? 1 : -1);
+          if (comp > 0) {
+            return null;
+          }
+        }
+
+        count++;
+
+        // Next element is part of the iteration, return it.
+        if (index == null || index.isCopy()) {
+          return db.serializer.deserialize(nextEntry.getValue(), type);
+        } else {
+          byte[] key = stitch(ti.naturalIndex().keyPrefix(), nextEntry.getValue());
+          return db.get(key, type);
+        }
+      }
+    } catch (Exception e) {
+      throw Throwables.propagate(e);
+    }
+  }
+
+  @VisibleForTesting
+  static boolean startsWith(byte[] key, byte[] prefix) {
+    if (key.length < prefix.length) {
+      return false;
+    }
+
+    for (int i = 0; i < prefix.length; i++) {
+      if (key[i] != prefix[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  private boolean isEndMarker(byte[] key) {
+    return (key.length > 2 &&
+        key[key.length - 2] == LevelDBTypeInfo.KEY_SEPARATOR &&
+        key[key.length - 1] == (byte) LevelDBTypeInfo.END_MARKER.charAt(0));
+  }
+
+  private byte[] stitch(byte[]... comps) {
+    int len = 0;
+    for (byte[] comp : comps) {
+      len += comp.length;
+    }
+
+    byte[] dest = new byte[len];
+    int written = 0;
+    for (byte[] comp : comps) {
+      System.arraycopy(comp, 0, dest, written, comp.length);
+      written += comp.length;
+    }
+
+    return dest;
+  }
+
+  private int compare(byte[] a, byte[] b) {
+    int diff = 0;
+    int minLen = Math.min(a.length, b.length);
+    for (int i = 0; i < minLen; i++) {
+      diff += (a[i] - b[i]);
+      if (diff != 0) {
+        return diff;
+      }
+    }
+
+    return a.length - b.length;
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBTypeInfo.java
new file mode 100644
index 0000000000000..826e6cf068fd9
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBTypeInfo.java
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.lang.reflect.Array;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Throwables;
+
+/**
+ * Holds metadata about app-specific types stored in LevelDB. Serves as a cache for data collected
+ * via reflection, to make it cheaper to access it multiple times.
+ */
+class LevelDBTypeInfo {
+
+  static final String ENTRY_PREFIX = "+";
+  static final String END_MARKER = "-";
+  static final byte KEY_SEPARATOR = 0x0;
+
+  // These constants are used in the Index.toKey() method below when encoding numbers into keys.
+  // See javadoc for that method for details.
+  private static final char POSITIVE_FILL = '.';
+  private static final char NEGATIVE_FILL = '~';
+  private static final char POSITIVE_MARKER = '=';
+  private static final char NEGATIVE_MARKER = '*';
+
+  @VisibleForTesting
+  static final int BYTE_ENCODED_LEN = String.valueOf(Byte.MAX_VALUE).length() + 1;
+  @VisibleForTesting
+  static final int INT_ENCODED_LEN = String.valueOf(Integer.MAX_VALUE).length() + 1;
+  @VisibleForTesting
+  static final int LONG_ENCODED_LEN = String.valueOf(Long.MAX_VALUE).length() + 1;
+  @VisibleForTesting
+  static final int SHORT_ENCODED_LEN = String.valueOf(Short.MAX_VALUE).length() + 1;
+
+  private final LevelDB db;
+  private final Class<?> type;
+  private final Map<String, Index> indices;
+  private final byte[] typePrefix;
+
+  LevelDBTypeInfo(LevelDB db, Class<?> type, byte[] alias) throws Exception {
+    this.db = db;
+    this.type = type;
+    this.indices = new HashMap<>();
+
+    KVTypeInfo ti = new KVTypeInfo(type);
+    ti.indices().forEach(idx -> {
+      indices.put(idx.value(), new Index(idx.value(), idx.copy(), ti.getAccessor(idx.value())));
+    });
+
+    ByteArrayOutputStream typePrefix = new ByteArrayOutputStream();
+    typePrefix.write(utf8(ENTRY_PREFIX));
+    typePrefix.write(alias);
+    this.typePrefix = typePrefix.toByteArray();
+  }
+
+  Class<?> type() {
+    return type;
+  }
+
+  byte[] keyPrefix() {
+    return buildKey(false);
+  }
+
+  Index naturalIndex() {
+    return index(KVIndex.NATURAL_INDEX_NAME);
+  }
+
+  Index index(String name) {
+    Index i = indices.get(name);
+    Preconditions.checkArgument(i != null, "Index %s does not exist for type %s.", name,
+      type.getName());
+    return i;
+  }
+
+  Collection<Index> indices() {
+    return indices.values();
+  }
+
+  private byte[] utf8(String s) {
+    return s.getBytes(UTF_8);
+  }
+
+  private byte[] buildKey(boolean trim, String... components) {
+    try {
+      ByteArrayOutputStream kos = new ByteArrayOutputStream(typePrefix.length * 2);
+      kos.write(typePrefix);
+      for (int i = 0; i < components.length; i++) {
+        kos.write(utf8(components[i]));
+        if (!trim || i < components.length - 1) {
+          kos.write(KEY_SEPARATOR);
+        }
+      }
+      return kos.toByteArray();
+    } catch (IOException ioe) {
+      throw Throwables.propagate(ioe);
+    }
+  }
+
+  /**
+   * Models a single index in LevelDB. Keys are stored under the type's prefix, in sequential
+   * order according to the indexed value. For non-natural indices, the key also contains the
+   * entity's natural key after the indexed value, so that it's possible for multiple entities
+   * to have the same indexed value.
+   *
+   * <p>
+   * An end marker is used to mark where the index ends, and the boundaries of each indexed value
+   * within the index, to make descending iteration faster, at the expense of some disk space and
+   * minor overhead when iterating. A count of the number of indexed entities is kept at the end
+   * marker, so that it can be cleaned up when all entries are removed from the index.
+   * </p>
+   */
+  class Index {
+
+    private final boolean copy;
+    private final boolean isNatural;
+    private final String name;
+    private final KVTypeInfo.Accessor accessor;
+
+    private Index(String name, boolean copy, KVTypeInfo.Accessor accessor) {
+      this.name = name;
+      this.isNatural = name.equals(KVIndex.NATURAL_INDEX_NAME);
+      this.copy = isNatural || copy;
+      this.accessor = accessor;
+    }
+
+    boolean isCopy() {
+      return copy;
+    }
+
+    /** The prefix for all keys that belong to this index. */
+    byte[] keyPrefix() {
+      return buildKey(false, name);
+    }
+
+    /** The key where to start ascending iteration for entries that match the given value. */
+    byte[] start(Object value) {
+      return buildKey(isNatural, name, toKey(value));
+    }
+
+    /** The key for the index's end marker. */
+    byte[] end() {
+      return buildKey(true, name, END_MARKER);
+    }
+
+    /** The key for the end marker for index entries with the given value. */
+    byte[] end(Object value) throws Exception {
+      return buildKey(true, name, toKey(value), END_MARKER);
+    }
+
+    /** The key in the index that identifies the given entity. */
+    byte[] entityKey(Object entity) throws Exception {
+      Object indexValue = accessor.get(entity);
+      Preconditions.checkNotNull(indexValue, "Null index value for %s in type %s.",
+        name, type.getName());
+      if (isNatural) {
+        return buildKey(true, name, toKey(indexValue));
+      } else {
+        Object naturalKey = naturalIndex().accessor.get(entity);
+        return buildKey(true, name, toKey(accessor.get(entity)), toKey(naturalKey));
+      }
+    }
+
+    /**
+     * Add an entry to the index.
+     *
+     * @param batch Write batch with other related changes.
+     * @param entity The entity being added to the index.
+     * @param data Serialized entity to store (when storing the entity, not a reference).
+     * @param naturalKey The value's key.
+     */
+    void add(LevelDBWriteBatch batch, Object entity, byte[] data) throws Exception {
+      byte[] stored = data;
+      if (!copy) {
+        stored = db.serializer.serialize(toKey(naturalIndex().accessor.get(entity)));
+      }
+      batch.put(entityKey(entity), stored);
+      batch.updateCount(end(accessor.get(entity)), 1L);
+      batch.updateCount(end(), 1L);
+    }
+
+    /**
+     * Remove a value from the index.
+     *
+     * @param batch Write batch with other related changes.
+     * @param entity The entity being removed, to identify the index entry to modify.
+     * @param naturalKey The value's key.
+     */
+    void remove(LevelDBWriteBatch batch, Object entity) throws Exception {
+      batch.delete(entityKey(entity));
+      batch.updateCount(end(accessor.get(entity)), -1L);
+      batch.updateCount(end(), -1L);
+    }
+
+    long getCount(byte[] key) throws Exception {
+      byte[] data = db.db().get(key);
+      return data != null ? db.serializer.deserializeLong(data) : 0;
+    }
+
+    /**
+     * Translates a value to be used as part of the store key.
+     *
+     * Integral numbers are encoded as a string in a way that preserves lexicographical
+     * ordering. The string is always as long as the maximum value for the given type (e.g.
+     * 11 characters for integers, including the character for the sign). The first character
+     * represents the sign (with the character for negative coming before the one for positive,
+     * which means you cannot use '-'...). The rest of the value is padded with a value that is
+     * "greater than 9" for negative values, so that for example "-123" comes before "-12" (the
+     * encoded value would look like "*~~~~~~~123"). For positive values, similarly, a value that
+     * is "lower than 0" (".") is used for padding. The fill characters were chosen for readability
+     * when looking at the encoded keys.
+     *
+     * Arrays are encoded by encoding each element separately, separated by KEY_SEPARATOR.
+     */
+    @VisibleForTesting
+    String toKey(Object value) {
+      StringBuilder sb = new StringBuilder(ENTRY_PREFIX);
+
+      if (value instanceof String) {
+        sb.append(value);
+      } else if (value instanceof Boolean) {
+        sb.append(((Boolean) value).toString().toLowerCase());
+      } else if (value.getClass().isArray()) {
+        int length = Array.getLength(value);
+        for (int i = 0; i < length; i++) {
+          sb.append(toKey(Array.get(value, i)));
+          sb.append(KEY_SEPARATOR);
+        }
+        if (length > 0) {
+          sb.setLength(sb.length() - 1);
+        }
+      } else {
+        int encodedLen;
+
+        if (value instanceof Integer) {
+          encodedLen = INT_ENCODED_LEN;
+        } else if (value instanceof Long) {
+          encodedLen = LONG_ENCODED_LEN;
+        } else if (value instanceof Short) {
+          encodedLen = SHORT_ENCODED_LEN;
+        } else if (value instanceof Byte) {
+          encodedLen = BYTE_ENCODED_LEN;
+        } else {
+          throw new IllegalArgumentException(String.format("Type %s not allowed as key.",
+            value.getClass().getName()));
+        }
+
+        long longValue = ((Number) value).longValue();
+        String strVal;
+        if (longValue == Long.MIN_VALUE) {
+          // Math.abs() overflows for Long.MIN_VALUE.
+          strVal = String.valueOf(longValue).substring(1);
+        } else {
+          strVal = String.valueOf(Math.abs(longValue));
+        }
+
+        sb.append(longValue >= 0 ? POSITIVE_MARKER : NEGATIVE_MARKER);
+
+        char fill = longValue >= 0 ? POSITIVE_FILL : NEGATIVE_FILL;
+        for (int i = 0; i < encodedLen - strVal.length() - 1; i++) {
+          sb.append(fill);
+        }
+
+        sb.append(strVal);
+      }
+
+      return sb.toString();
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBWriteBatch.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBWriteBatch.java
new file mode 100644
index 0000000000000..f3de251de554f
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/LevelDBWriteBatch.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.iq80.leveldb.DB;
+import org.iq80.leveldb.WriteBatch;
+import org.iq80.leveldb.WriteOptions;
+
+/**
+ * A wrapper around the LevelDB library's WriteBatch with some extra functionality for keeping
+ * track of counts.
+ */
+class LevelDBWriteBatch {
+
+  private final LevelDB db;
+  private final Map<KeyWrapper, Long> deltas;
+  private final WriteBatch batch;
+
+  LevelDBWriteBatch(LevelDB db) {
+    this.db = db;
+    this.batch = db.db().createWriteBatch();
+    this.deltas = new HashMap<>();
+  }
+
+  void updateCount(byte[] key, long delta) {
+    KeyWrapper kw = new KeyWrapper(key);
+    Long fullDelta = deltas.get(kw);
+    if (fullDelta != null) {
+      fullDelta += delta;
+    } else {
+      fullDelta = delta;
+    }
+    deltas.put(kw, fullDelta);
+  }
+
+  void put(byte[] key, byte[] value) {
+    batch.put(key, value);
+  }
+
+  void delete(byte[] key) {
+    batch.delete(key);
+  }
+
+  void write(boolean sync) {
+    for (Map.Entry<KeyWrapper, Long> e : deltas.entrySet()) {
+      long delta = e.getValue();
+      if (delta == 0) {
+        continue;
+      }
+
+      byte[] key = e.getKey().key;
+      byte[] data = db.db().get(key);
+      long count = data != null ? db.serializer.deserializeLong(data) : 0L;
+      long newCount = count + delta;
+
+      if (newCount > 0) {
+        batch.put(key, db.serializer.serialize(newCount));
+      } else {
+        batch.delete(key);
+      }
+    }
+
+    db.db().write(batch, new WriteOptions().sync(sync));
+  }
+
+  void close() throws IOException {
+    batch.close();
+  }
+
+  private static class KeyWrapper {
+
+    private final byte[] key;
+
+    KeyWrapper(byte[] key) {
+      this.key = key;
+    }
+
+    @Override
+    public boolean equals(Object other) {
+      if (other instanceof KeyWrapper) {
+        return Arrays.equals(key, ((KeyWrapper) other).key);
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return Arrays.hashCode(key);
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/main/java/org/apache/spark/kvstore/UnsupportedStoreVersionException.java b/common/kvstore/src/main/java/org/apache/spark/kvstore/UnsupportedStoreVersionException.java
new file mode 100644
index 0000000000000..2ed246e4f4c97
--- /dev/null
+++ b/common/kvstore/src/main/java/org/apache/spark/kvstore/UnsupportedStoreVersionException.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.IOException;
+
+/**
+ * Exception thrown when the store implementation is not compatible with the underlying data.
+ */
+public class UnsupportedStoreVersionException extends IOException {
+
+}
diff --git a/common/kvstore/src/test/java/org/apache/spark/kvstore/CustomType1.java b/common/kvstore/src/test/java/org/apache/spark/kvstore/CustomType1.java
new file mode 100644
index 0000000000000..2bea5b560681f
--- /dev/null
+++ b/common/kvstore/src/test/java/org/apache/spark/kvstore/CustomType1.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import com.google.common.base.Objects;
+
+public class CustomType1 {
+
+  @KVIndex
+  public String key;
+
+  @KVIndex("id")
+  public String id;
+
+  @KVIndex(value = "name", copy = true)
+  public String name;
+
+  @KVIndex("int")
+  public int num;
+
+  @Override
+  public boolean equals(Object o) {
+    if (o instanceof CustomType1) {
+      CustomType1 other = (CustomType1) o;
+      return id.equals(other.id) && name.equals(other.name);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return id.hashCode();
+  }
+
+  @Override
+  public String toString() {
+    return Objects.toStringHelper(this)
+      .add("key", key)
+      .add("id", id)
+      .add("name", name)
+      .add("num", num)
+      .toString();
+  }
+
+}
diff --git a/common/kvstore/src/test/java/org/apache/spark/kvstore/DBIteratorSuite.java b/common/kvstore/src/test/java/org/apache/spark/kvstore/DBIteratorSuite.java
new file mode 100644
index 0000000000000..6c4469e1ed5d0
--- /dev/null
+++ b/common/kvstore/src/test/java/org/apache/spark/kvstore/DBIteratorSuite.java
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import org.apache.commons.io.FileUtils;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+/**
+ * This class should really be called "LevelDBIteratorSuite" but for some reason I don't know,
+ * sbt does not run the tests if it has that name.
+ */
+public class DBIteratorSuite {
+
+  private static final int MIN_ENTRIES = 42;
+  private static final int MAX_ENTRIES = 1024;
+  private static final Random RND = new Random();
+
+  private static List<CustomType1> allEntries;
+  private static List<CustomType1> clashingEntries;
+  private static LevelDB db;
+  private static File dbpath;
+
+  private interface BaseComparator extends Comparator<CustomType1> {
+    /**
+     * Returns a comparator that falls back to natural order if this comparator's ordering
+     * returns equality for two elements. Used to mimic how the index sorts things internally.
+     */
+    default BaseComparator fallback() {
+      return (t1, t2) -> {
+        int result = BaseComparator.this.compare(t1, t2);
+        if (result != 0) {
+          return result;
+        }
+
+        return t1.key.compareTo(t2.key);
+      };
+    }
+
+    /** Reverses the order of this comparator. */
+    default BaseComparator reverse() {
+      return (t1, t2) -> -BaseComparator.this.compare(t1, t2);
+    }
+  }
+
+  private final BaseComparator NATURAL_ORDER = (t1, t2) -> t1.key.compareTo(t2.key);
+  private final BaseComparator REF_INDEX_ORDER = (t1, t2) -> t1.id.compareTo(t2.id);
+  private final BaseComparator COPY_INDEX_ORDER = (t1, t2) -> t1.name.compareTo(t2.name);
+  private final BaseComparator NUMERIC_INDEX_ORDER = (t1, t2) -> t1.num - t2.num;
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    dbpath = File.createTempFile("test.", ".ldb");
+    dbpath.delete();
+    db = new LevelDB(dbpath);
+
+    int count = RND.nextInt(MAX_ENTRIES) + MIN_ENTRIES;
+
+    // Instead of generating sequential IDs, generate random unique IDs to avoid the insertion
+    // order matching the natural ordering. Just in case.
+    boolean[] usedIDs = new boolean[count];
+
+    allEntries = new ArrayList<>(count);
+    for (int i = 0; i < count; i++) {
+      CustomType1 t = new CustomType1();
+
+      int id;
+      do {
+        id = RND.nextInt(count);
+      } while (usedIDs[id]);
+
+      usedIDs[id] = true;
+      t.key = "key" + id;
+      t.id = "id" + i;
+      t.name = "name" + RND.nextInt(MAX_ENTRIES);
+      t.num = RND.nextInt(MAX_ENTRIES);
+      allEntries.add(t);
+      db.write(t);
+    }
+
+    // Pick the first generated value, and forcefully create a few entries that will clash
+    // with the indexed values (id and name), to make sure the index behaves correctly when
+    // multiple entities are indexed by the same value.
+    //
+    // This also serves as a test for the test code itself, to make sure it's sorting indices
+    // the same way the store is expected to.
+    CustomType1 first = allEntries.get(0);
+    clashingEntries = new ArrayList<>();
+    for (int i = 0; i < RND.nextInt(MIN_ENTRIES) + 1; i++) {
+      CustomType1 t = new CustomType1();
+      t.key = "n-key" + (count + i);
+      t.id = first.id;
+      t.name = first.name;
+      t.num = first.num;
+      allEntries.add(t);
+      clashingEntries.add(t);
+      db.write(t);
+    }
+
+    // Create another entry that could cause problems: take the first entry, and make its indexed
+    // name be an extension of the existing ones, to make sure the implementation sorts these
+    // correctly even considering the separator character (shorter strings first).
+    CustomType1 t = new CustomType1();
+    t.key = "extended-key-0";
+    t.id = first.id;
+    t.name = first.name + "a";
+    t.num = first.num;
+    allEntries.add(t);
+    db.write(t);
+  }
+
+  @AfterClass
+  public static void cleanup() throws Exception {
+    allEntries = null;
+    if (db != null) {
+      db.close();
+    }
+    if (dbpath != null) {
+      FileUtils.deleteQuietly(dbpath);
+    }
+  }
+
+  @Test
+  public void naturalIndex() throws Exception {
+    testIteration(NATURAL_ORDER, view(), null, null);
+  }
+
+  @Test
+  public void refIndex() throws Exception {
+    testIteration(REF_INDEX_ORDER, view().index("id"), null, null);
+  }
+
+  @Test
+  public void copyIndex() throws Exception {
+    testIteration(COPY_INDEX_ORDER, view().index("name"), null, null);
+  }
+
+  @Test
+  public void numericIndex() throws Exception {
+    testIteration(NUMERIC_INDEX_ORDER, view().index("int"), null, null);
+  }
+
+  @Test
+  public void naturalIndexDescending() throws Exception {
+    testIteration(NATURAL_ORDER, view().reverse(), null, null);
+  }
+
+  @Test
+  public void refIndexDescending() throws Exception {
+    testIteration(REF_INDEX_ORDER, view().index("id").reverse(), null, null);
+  }
+
+  @Test
+  public void copyIndexDescending() throws Exception {
+    testIteration(COPY_INDEX_ORDER, view().index("name").reverse(), null, null);
+  }
+
+  @Test
+  public void numericIndexDescending() throws Exception {
+    testIteration(NUMERIC_INDEX_ORDER, view().index("int").reverse(), null, null);
+  }
+
+  @Test
+  public void naturalIndexWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(NATURAL_ORDER, view().first(first.key), first, null);
+  }
+
+  @Test
+  public void refIndexWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(REF_INDEX_ORDER, view().index("id").first(first.id), first, null);
+  }
+
+  @Test
+  public void copyIndexWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(COPY_INDEX_ORDER, view().index("name").first(first.name), first, null);
+  }
+
+  @Test
+  public void numericIndexWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(NUMERIC_INDEX_ORDER, view().index("int").first(first.num), first, null);
+  }
+
+  @Test
+  public void naturalIndexDescendingWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(NATURAL_ORDER, view().reverse().first(first.key), first, null);
+  }
+
+  @Test
+  public void refIndexDescendingWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(REF_INDEX_ORDER, view().reverse().index("id").first(first.id), first, null);
+  }
+
+  @Test
+  public void copyIndexDescendingWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(COPY_INDEX_ORDER, view().reverse().index("name").first(first.name), first, null);
+  }
+
+  @Test
+  public void numericIndexDescendingWithStart() throws Exception {
+    CustomType1 first = pickLimit();
+    testIteration(NUMERIC_INDEX_ORDER, view().reverse().index("int").first(first.num), first, null);
+  }
+
+  @Test
+  public void naturalIndexWithSkip() throws Exception {
+    testIteration(NATURAL_ORDER, view().skip(RND.nextInt(allEntries.size() / 2)), null, null);
+  }
+
+  @Test
+  public void refIndexWithSkip() throws Exception {
+    testIteration(REF_INDEX_ORDER, view().index("id").skip(RND.nextInt(allEntries.size() / 2)),
+      null, null);
+  }
+
+  @Test
+  public void copyIndexWithSkip() throws Exception {
+    testIteration(COPY_INDEX_ORDER, view().index("name").skip(RND.nextInt(allEntries.size() / 2)),
+      null, null);
+  }
+
+  @Test
+  public void naturalIndexWithMax() throws Exception {
+    testIteration(NATURAL_ORDER, view().max(RND.nextInt(allEntries.size() / 2)), null, null);
+  }
+
+  @Test
+  public void copyIndexWithMax() throws Exception {
+    testIteration(COPY_INDEX_ORDER, view().index("name").max(RND.nextInt(allEntries.size() / 2)),
+      null, null);
+  }
+
+  @Test
+  public void naturalIndexWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(NATURAL_ORDER, view().last(last.key), null, last);
+  }
+
+  @Test
+  public void refIndexWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(REF_INDEX_ORDER, view().index("id").last(last.id), null, last);
+  }
+
+  @Test
+  public void copyIndexWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(COPY_INDEX_ORDER, view().index("name").last(last.name), null, last);
+  }
+
+  @Test
+  public void numericIndexWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(NUMERIC_INDEX_ORDER, view().index("int").last(last.num), null, last);
+  }
+
+  @Test
+  public void naturalIndexDescendingWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(NATURAL_ORDER, view().reverse().last(last.key), null, last);
+  }
+
+  @Test
+  public void refIndexDescendingWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(REF_INDEX_ORDER, view().reverse().index("id").last(last.id), null, last);
+  }
+
+  @Test
+  public void copyIndexDescendingWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(COPY_INDEX_ORDER, view().reverse().index("name").last(last.name),
+      null, last);
+  }
+
+  @Test
+  public void numericIndexDescendingWithLast() throws Exception {
+    CustomType1 last = pickLimit();
+    testIteration(NUMERIC_INDEX_ORDER, view().reverse().index("int").last(last.num),
+      null, last);
+   }
+
+  @Test
+  public void testRefWithIntNaturalKey() throws Exception {
+    LevelDBSuite.IntKeyType i = new LevelDBSuite.IntKeyType();
+    i.key = 1;
+    i.id = "1";
+    i.values = Arrays.asList("1");
+
+    db.write(i);
+
+    try(KVStoreIterator<?> it = db.view(i.getClass()).closeableIterator()) {
+      Object read = it.next();
+      assertEquals(i, read);
+    }
+  }
+
+  private CustomType1 pickLimit() {
+    // Picks an element that has clashes with other elements in the given index.
+    return clashingEntries.get(RND.nextInt(clashingEntries.size()));
+  }
+
+  /**
+   * Compares the two values and falls back to comparing the natural key of CustomType1
+   * if they're the same, to mimic the behavior of the indexing code.
+   */
+  private <T extends Comparable<T>> int compareWithFallback(
+      T v1,
+      T v2,
+      CustomType1 ct1,
+      CustomType1 ct2) {
+    int result = v1.compareTo(v2);
+    if (result != 0) {
+      return result;
+    }
+
+    return ct1.key.compareTo(ct2.key);
+  }
+
+  private void testIteration(
+      final BaseComparator order,
+      final KVStoreView<CustomType1> params,
+      final CustomType1 first,
+      final CustomType1 last) throws Exception {
+    List<CustomType1> indexOrder = sortBy(order.fallback());
+    if (!params.ascending) {
+      indexOrder = Lists.reverse(indexOrder);
+    }
+
+    Iterable<CustomType1> expected = indexOrder;
+    BaseComparator expectedOrder = params.ascending ? order : order.reverse();
+
+    if (first != null) {
+      expected = Iterables.filter(expected, v -> expectedOrder.compare(first, v) <= 0);
+    }
+
+    if (last != null) {
+      expected = Iterables.filter(expected, v -> expectedOrder.compare(v, last) <= 0);
+    }
+
+    if (params.skip > 0) {
+      expected = Iterables.skip(expected, (int) params.skip);
+    }
+
+    if (params.max != Long.MAX_VALUE) {
+      expected = Iterables.limit(expected, (int) params.max);
+    }
+
+    List<CustomType1> actual = collect(params);
+    compareLists(expected, actual);
+  }
+
+  /** Could use assertEquals(), but that creates hard to read errors for large lists. */
+  private void compareLists(Iterable<?> expected, List<?> actual) {
+    Iterator<?> expectedIt = expected.iterator();
+    Iterator<?> actualIt = actual.iterator();
+
+    int count = 0;
+    while (expectedIt.hasNext()) {
+      if (!actualIt.hasNext()) {
+        break;
+      }
+      count++;
+      assertEquals(expectedIt.next(), actualIt.next());
+    }
+
+    String message;
+    Object[] remaining;
+    int expectedCount = count;
+    int actualCount = count;
+
+    if (expectedIt.hasNext()) {
+      remaining = Iterators.toArray(expectedIt, Object.class);
+      expectedCount += remaining.length;
+      message = "missing";
+    } else {
+      remaining = Iterators.toArray(actualIt, Object.class);
+      actualCount += remaining.length;
+      message = "stray";
+    }
+
+    assertEquals(String.format("Found %s elements: %s", message, Arrays.asList(remaining)),
+      expectedCount, actualCount);
+  }
+
+  private KVStoreView<CustomType1> view() throws Exception {
+    return db.view(CustomType1.class);
+  }
+
+  private List<CustomType1> collect(KVStoreView<CustomType1> view) throws Exception {
+    return Arrays.asList(Iterables.toArray(view, CustomType1.class));
+  }
+
+  private List<CustomType1> sortBy(Comparator<CustomType1> comp) {
+    List<CustomType1> copy = new ArrayList<>(allEntries);
+    Collections.sort(copy, comp);
+    return copy;
+  }
+
+}
diff --git a/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBBenchmark.java b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBBenchmark.java
new file mode 100644
index 0000000000000..aecea26ec82f3
--- /dev/null
+++ b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBBenchmark.java
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.codahale.metrics.MetricRegistry;
+import com.codahale.metrics.Slf4jReporter;
+import com.codahale.metrics.Snapshot;
+import com.codahale.metrics.Timer;
+import org.apache.commons.io.FileUtils;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.LoggerFactory;
+import static org.junit.Assert.*;
+
+/**
+ * A set of small benchmarks for the LevelDB implementation.
+ *
+ * The benchmarks are run over two different types (one with just a natural index, and one
+ * with a ref index), over a set of 2^20 elements, and the following tests are performed:
+ *
+ * - write (then update) elements in sequential natural key order
+ * - write (then update) elements in random natural key order
+ * - iterate over natural index, ascending and descending
+ * - iterate over ref index, ascending and descending
+ */
+@Ignore
+public class LevelDBBenchmark {
+
+  private static final int COUNT = 1024;
+  private static final AtomicInteger IDGEN = new AtomicInteger();
+  private static final MetricRegistry metrics = new MetricRegistry();
+  private static final Timer dbCreation = metrics.timer("dbCreation");
+  private static final Timer dbClose = metrics.timer("dbClose");
+
+  private LevelDB db;
+  private File dbpath;
+
+  @Before
+  public void setup() throws Exception {
+    dbpath = File.createTempFile("test.", ".ldb");
+    dbpath.delete();
+    try(Timer.Context ctx = dbCreation.time()) {
+      db = new LevelDB(dbpath);
+    }
+  }
+
+  @After
+  public void cleanup() throws Exception {
+    if (db != null) {
+      try(Timer.Context ctx = dbClose.time()) {
+        db.close();
+      }
+    }
+    if (dbpath != null) {
+      FileUtils.deleteQuietly(dbpath);
+    }
+  }
+
+  @AfterClass
+  public static void report() {
+    if (metrics.getTimers().isEmpty()) {
+      return;
+    }
+
+    int headingPrefix = 0;
+    for (Map.Entry<String, Timer> e : metrics.getTimers().entrySet()) {
+      headingPrefix = Math.max(e.getKey().length(), headingPrefix);
+    }
+    headingPrefix += 4;
+
+    StringBuilder heading = new StringBuilder();
+    for (int i = 0; i < headingPrefix; i++) {
+      heading.append(" ");
+    }
+    heading.append("\tcount");
+    heading.append("\tmean");
+    heading.append("\tmin");
+    heading.append("\tmax");
+    heading.append("\t95th");
+    System.out.println(heading);
+
+    for (Map.Entry<String, Timer> e : metrics.getTimers().entrySet()) {
+      StringBuilder row = new StringBuilder();
+      row.append(e.getKey());
+      for (int i = 0; i < headingPrefix - e.getKey().length(); i++) {
+        row.append(" ");
+      }
+
+      Snapshot s = e.getValue().getSnapshot();
+      row.append("\t").append(e.getValue().getCount());
+      row.append("\t").append(toMs(s.getMean()));
+      row.append("\t").append(toMs(s.getMin()));
+      row.append("\t").append(toMs(s.getMax()));
+      row.append("\t").append(toMs(s.get95thPercentile()));
+
+      System.out.println(row);
+    }
+
+    Slf4jReporter.forRegistry(metrics).outputTo(LoggerFactory.getLogger(LevelDBBenchmark.class))
+      .build().report();
+  }
+
+  private static String toMs(double nanos) {
+    return String.format("%.3f", nanos / 1000 / 1000);
+  }
+
+  @Test
+  public void sequentialWritesNoIndex() throws Exception {
+    List<SimpleType> entries = createSimpleType();
+    writeAll(entries, false, "sequentialWritesNoIndex");
+    writeAll(entries, false, "sequentialUpdatesNoIndex");
+    deleteNoIndex(entries, false, "sequentialDeleteNoIndex");
+  }
+
+  @Test
+  public void sequentialSyncWritesNoIndex() throws Exception {
+    List<SimpleType> entries = createSimpleType();
+    writeAll(entries, true, "sequentialSyncWritesNoIndex");
+    writeAll(entries, true, "sequentialSyncUpdatesNoIndex");
+    deleteNoIndex(entries, true, "sequentialSyncDeleteNoIndex");
+  }
+
+  @Test
+  public void randomWritesNoIndex() throws Exception {
+    List<SimpleType> entries = createSimpleType();
+
+    Collections.shuffle(entries);
+    writeAll(entries, false, "randomWritesNoIndex");
+
+    Collections.shuffle(entries);
+    writeAll(entries, false, "randomUpdatesNoIndex");
+
+    Collections.shuffle(entries);
+    deleteNoIndex(entries, false, "randomDeletesNoIndex");
+  }
+
+  @Test
+  public void randomSyncWritesNoIndex() throws Exception {
+    List<SimpleType> entries = createSimpleType();
+
+    Collections.shuffle(entries);
+    writeAll(entries, true, "randomSyncWritesNoIndex");
+
+    Collections.shuffle(entries);
+    writeAll(entries, true, "randomSyncUpdatesNoIndex");
+
+    Collections.shuffle(entries);
+    deleteNoIndex(entries, true, "randomSyncDeletesNoIndex");
+  }
+
+  @Test
+  public void sequentialWritesIndexedType() throws Exception {
+    List<IndexedType> entries = createIndexedType();
+    writeAll(entries, false, "sequentialWritesIndexed");
+    writeAll(entries, false, "sequentialUpdatesIndexed");
+    deleteIndexed(entries, false, "sequentialDeleteIndexed");
+  }
+
+  @Test
+  public void sequentialSyncWritesIndexedType() throws Exception {
+    List<IndexedType> entries = createIndexedType();
+    writeAll(entries, true, "sequentialSyncWritesIndexed");
+    writeAll(entries, true, "sequentialSyncUpdatesIndexed");
+    deleteIndexed(entries, true, "sequentialSyncDeleteIndexed");
+  }
+
+  @Test
+  public void randomWritesIndexedTypeAndIteration() throws Exception {
+    List<IndexedType> entries = createIndexedType();
+
+    Collections.shuffle(entries);
+    writeAll(entries, false, "randomWritesIndexed");
+
+    Collections.shuffle(entries);
+    writeAll(entries, false, "randomUpdatesIndexed");
+
+    // Run iteration benchmarks here since we've gone through the trouble of writing all
+    // the data already.
+    KVStoreView<?> view = db.view(IndexedType.class);
+    iterate(view, "naturalIndex");
+    iterate(view.reverse(), "naturalIndexDescending");
+    iterate(view.index("name"), "refIndex");
+    iterate(view.index("name").reverse(), "refIndexDescending");
+
+    Collections.shuffle(entries);
+    deleteIndexed(entries, false, "randomDeleteIndexed");
+  }
+
+  @Test
+  public void randomSyncWritesIndexedTypeAndIteration() throws Exception {
+    List<IndexedType> entries = createIndexedType();
+
+    Collections.shuffle(entries);
+    writeAll(entries, true, "randomSyncWritesIndexed");
+
+    Collections.shuffle(entries);
+    deleteIndexed(entries, true, "randomSyncDeleteIndexed");
+  }
+
+  private void iterate(KVStoreView<?> view, String name) throws Exception {
+    Timer create = metrics.timer(name + "CreateIterator");
+    Timer iter = metrics.timer(name + "Iteration");
+    KVStoreIterator<?> it = null;
+    {
+      // Create the iterator several times, just to have multiple data points.
+      for (int i = 0; i < 1024; i++) {
+        if (it != null) {
+          it.close();
+        }
+        try(Timer.Context ctx = create.time()) {
+          it = view.closeableIterator();
+        }
+      }
+    }
+
+    for (; it.hasNext(); ) {
+      try(Timer.Context ctx = iter.time()) {
+        it.next();
+      }
+    }
+  }
+
+  private void writeAll(List<?> entries, boolean sync, String timerName) throws Exception {
+    Timer timer = newTimer(timerName);
+    for (Object o : entries) {
+      try(Timer.Context ctx = timer.time()) {
+        db.write(o, sync);
+      }
+    }
+  }
+
+  private void deleteNoIndex(List<SimpleType> entries, boolean sync, String timerName)
+      throws Exception {
+    Timer delete = newTimer(timerName);
+    for (SimpleType i : entries) {
+      try(Timer.Context ctx = delete.time()) {
+        db.delete(i.getClass(), i.key, sync);
+      }
+    }
+  }
+
+  private void deleteIndexed(List<IndexedType> entries, boolean sync, String timerName)
+      throws Exception {
+    Timer delete = newTimer(timerName);
+    for (IndexedType i : entries) {
+      try(Timer.Context ctx = delete.time()) {
+        db.delete(i.getClass(), i.key, sync);
+      }
+    }
+  }
+
+  private List<SimpleType> createSimpleType() {
+    List<SimpleType> entries = new ArrayList<>();
+    for (int i = 0; i < COUNT; i++) {
+      SimpleType t = new SimpleType();
+      t.key = IDGEN.getAndIncrement();
+      t.name = "name" + (t.key % 1024);
+      entries.add(t);
+    }
+    return entries;
+  }
+
+  private List<IndexedType> createIndexedType() {
+    List<IndexedType> entries = new ArrayList<>();
+    for (int i = 0; i < COUNT; i++) {
+      IndexedType t = new IndexedType();
+      t.key = IDGEN.getAndIncrement();
+      t.name = "name" + (t.key % 1024);
+      entries.add(t);
+    }
+    return entries;
+  }
+
+  private Timer newTimer(String name) {
+    assertNull("Timer already exists: " + name, metrics.getTimers().get(name));
+    return metrics.timer(name);
+  }
+
+  public static class SimpleType {
+
+    @KVIndex
+    public int key;
+
+    public String name;
+
+  }
+
+  public static class IndexedType {
+
+    @KVIndex
+    public int key;
+
+    @KVIndex("name")
+    public String name;
+
+  }
+
+}
diff --git a/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBSuite.java
new file mode 100644
index 0000000000000..1f88aae0be2aa
--- /dev/null
+++ b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBSuite.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import org.apache.commons.io.FileUtils;
+import org.iq80.leveldb.DBIterator;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class LevelDBSuite {
+
+  private LevelDB db;
+  private File dbpath;
+
+  @After
+  public void cleanup() throws Exception {
+    if (db != null) {
+      db.close();
+    }
+    if (dbpath != null) {
+      FileUtils.deleteQuietly(dbpath);
+    }
+  }
+
+  @Before
+  public void setup() throws Exception {
+    dbpath = File.createTempFile("test.", ".ldb");
+    dbpath.delete();
+    db = new LevelDB(dbpath);
+  }
+
+  @Test
+  public void testReopenAndVersionCheckDb() throws Exception {
+    db.close();
+    db = null;
+    assertTrue(dbpath.exists());
+
+    db = new LevelDB(dbpath);
+    assertEquals(LevelDB.STORE_VERSION,
+      db.serializer.deserializeLong(db.db().get(LevelDB.STORE_VERSION_KEY)));
+    db.db().put(LevelDB.STORE_VERSION_KEY, db.serializer.serialize(LevelDB.STORE_VERSION + 1));
+    db.close();
+    db = null;
+
+    try {
+      db = new LevelDB(dbpath);
+      fail("Should have failed version check.");
+    } catch (UnsupportedStoreVersionException e) {
+      // Expected.
+    }
+  }
+
+  @Test
+  public void testObjectWriteReadDelete() throws Exception {
+    CustomType1 t = new CustomType1();
+    t.key = "key";
+    t.id = "id";
+    t.name = "name";
+
+    try {
+      db.read(CustomType1.class, t.key);
+      fail("Expected exception for non-existant object.");
+    } catch (NoSuchElementException nsee) {
+      // Expected.
+    }
+
+    db.write(t);
+    assertEquals(t, db.read(t.getClass(), t.key));
+    assertEquals(1L, db.count(t.getClass()));
+
+    db.delete(t.getClass(), t.key);
+    try {
+      db.read(t.getClass(), t.key);
+      fail("Expected exception for deleted object.");
+    } catch (NoSuchElementException nsee) {
+      // Expected.
+    }
+
+    // Look into the actual DB and make sure that all the keys related to the type have been
+    // removed.
+    assertEquals(0, countKeys(t.getClass()));
+  }
+
+  @Test
+  public void testMultipleObjectWriteReadDelete() throws Exception {
+    CustomType1 t1 = new CustomType1();
+    t1.key = "key1";
+    t1.id = "id";
+    t1.name = "name1";
+
+    CustomType1 t2 = new CustomType1();
+    t2.key = "key2";
+    t2.id = "id";
+    t2.name = "name2";
+
+    db.write(t1);
+    db.write(t2);
+
+    assertEquals(t1, db.read(t1.getClass(), t1.key));
+    assertEquals(t2, db.read(t2.getClass(), t2.key));
+    assertEquals(2L, db.count(t1.getClass()));
+
+    // There should be one "id" index entry with two values.
+    assertEquals(2, countIndexEntries(t1.getClass(), "id", t1.id));
+
+    // Delete the first entry; now there should be 3 remaining keys, since one of the "name"
+    // index entries should have been removed.
+    db.delete(t1.getClass(), t1.key);
+
+    // Make sure there's a single entry in the "id" index now.
+    assertEquals(1, countIndexEntries(t2.getClass(), "id", t2.id));
+
+    // Delete the remaining entry, make sure all data is gone.
+    db.delete(t2.getClass(), t2.key);
+    assertEquals(0, countKeys(t2.getClass()));
+  }
+
+  @Test
+  public void testMultipleTypesWriteReadDelete() throws Exception {
+    CustomType1 t1 = new CustomType1();
+    t1.key = "1";
+    t1.id = "id";
+    t1.name = "name1";
+
+    IntKeyType t2 = new IntKeyType();
+    t2.key = 2;
+    t2.id = "2";
+    t2.values = Arrays.asList("value1", "value2");
+
+    ArrayKeyIndexType t3 = new ArrayKeyIndexType();
+    t3.key = new int[] { 42, 84 };
+    t3.id = new String[] { "id1", "id2" };
+
+    db.write(t1);
+    db.write(t2);
+    db.write(t3);
+
+    assertEquals(t1, db.read(t1.getClass(), t1.key));
+    assertEquals(t2, db.read(t2.getClass(), t2.key));
+    assertEquals(t3, db.read(t3.getClass(), t3.key));
+
+    // There should be one "id" index with a single entry for each type.
+    assertEquals(1, countIndexEntries(t1.getClass(), "id", t1.id));
+    assertEquals(1, countIndexEntries(t2.getClass(), "id", t2.id));
+    assertEquals(1, countIndexEntries(t3.getClass(), "id", t3.id));
+
+    // Delete the first entry; this should not affect the entries for the second type.
+    db.delete(t1.getClass(), t1.key);
+    assertEquals(0, countKeys(t1.getClass()));
+    assertEquals(1, countIndexEntries(t2.getClass(), "id", t2.id));
+    assertEquals(1, countIndexEntries(t3.getClass(), "id", t3.id));
+
+    // Delete the remaining entries, make sure all data is gone.
+    db.delete(t2.getClass(), t2.key);
+    assertEquals(0, countKeys(t2.getClass()));
+
+    db.delete(t3.getClass(), t3.key);
+    assertEquals(0, countKeys(t3.getClass()));
+  }
+
+  @Test
+  public void testMetadata() throws Exception {
+    assertNull(db.getMetadata(CustomType1.class));
+
+    CustomType1 t = new CustomType1();
+    t.id = "id";
+    t.name = "name";
+
+    db.setMetadata(t);
+    assertEquals(t, db.getMetadata(CustomType1.class));
+
+    db.setMetadata(null);
+    assertNull(db.getMetadata(CustomType1.class));
+  }
+
+  private long countIndexEntries(Class<?> type, String index, Object value) throws Exception {
+    LevelDBTypeInfo.Index idx = db.getTypeInfo(type).index(index);
+    return idx.getCount(idx.end());
+  }
+
+  @Test
+  public void testUpdate() throws Exception {
+    CustomType1 t = new CustomType1();
+    t.key = "key";
+    t.id = "id";
+    t.name = "name";
+
+    db.write(t);
+
+    t.name = "anotherName";
+
+    db.write(t);
+
+    assertEquals(1, db.count(t.getClass()));
+
+    LevelDBTypeInfo.Index ni = db.getTypeInfo(t.getClass()).index("name");
+    assertEquals(1, ni.getCount(ni.end()));
+    assertEquals(1, ni.getCount(ni.end("anotherName")));
+    try {
+      db.get(ni.end("name"), Integer.class);
+      fail("Should have gotten an exception.");
+    } catch (NoSuchElementException nsee) {
+      // Expected.
+    }
+  }
+
+  private int countKeys(Class<?> type) throws Exception {
+    byte[] prefix = db.getTypeInfo(type).keyPrefix();
+    int count = 0;
+
+    DBIterator it = db.db().iterator();
+    it.seek(prefix);
+
+    while (it.hasNext()) {
+      byte[] key = it.next().getKey();
+      if (LevelDBIterator.startsWith(key, prefix)) {
+        count++;
+      }
+    }
+
+    return count;
+  }
+
+  public static class IntKeyType {
+
+    @KVIndex
+    public int key;
+
+    @KVIndex("id")
+    public String id;
+
+    public List<String> values;
+
+    @Override
+    public boolean equals(Object o) {
+      if (o instanceof IntKeyType) {
+        IntKeyType other = (IntKeyType) o;
+        return key == other.key && id.equals(other.id) && values.equals(other.values);
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return id.hashCode();
+    }
+
+  }
+
+  public static class ArrayKeyIndexType {
+
+    @KVIndex
+    public int[] key;
+
+    @KVIndex("id")
+    public String[] id;
+
+    @Override
+    public boolean equals(Object o) {
+      if (o instanceof ArrayKeyIndexType) {
+        ArrayKeyIndexType other = (ArrayKeyIndexType) o;
+        return Arrays.equals(key, other.key) && Arrays.equals(id, other.id);
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return key.hashCode();
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBTypeInfoSuite.java b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBTypeInfoSuite.java
new file mode 100644
index 0000000000000..cf69f32dfb354
--- /dev/null
+++ b/common/kvstore/src/test/java/org/apache/spark/kvstore/LevelDBTypeInfoSuite.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kvstore;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class LevelDBTypeInfoSuite {
+
+  @Test
+  public void testIndexAnnotation() throws Exception {
+    KVTypeInfo ti = new KVTypeInfo(CustomType1.class);
+    assertEquals(4, ti.indices().count());
+
+    CustomType1 t1 = new CustomType1();
+    t1.key = "key";
+    t1.id = "id";
+    t1.name = "name";
+    t1.num = 42;
+
+    assertEquals(t1.key, ti.getIndexValue(KVIndex.NATURAL_INDEX_NAME, t1));
+    assertEquals(t1.id, ti.getIndexValue("id", t1));
+    assertEquals(t1.name, ti.getIndexValue("name", t1));
+    assertEquals(t1.num, ti.getIndexValue("int", t1));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testNoNaturalIndex() throws Exception {
+    newTypeInfo(NoNaturalIndex.class);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDuplicateIndex() throws Exception {
+    newTypeInfo(DuplicateIndex.class);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testEmptyIndexName() throws Exception {
+    newTypeInfo(EmptyIndexName.class);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testIllegalIndexName() throws Exception {
+    newTypeInfo(IllegalIndexName.class);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testIllegalIndexMethod() throws Exception {
+    newTypeInfo(IllegalIndexMethod.class);
+  }
+
+  @Test
+  public void testKeyClashes() throws Exception {
+    LevelDBTypeInfo ti = newTypeInfo(CustomType1.class);
+
+    CustomType1 t1 = new CustomType1();
+    t1.key = "key1";
+    t1.name = "a";
+
+    CustomType1 t2 = new CustomType1();
+    t2.key = "key2";
+    t2.name = "aa";
+
+    CustomType1 t3 = new CustomType1();
+    t3.key = "key3";
+    t3.name = "aaa";
+
+    // Make sure entries with conflicting names are sorted correctly.
+    assertBefore(ti.index("name").entityKey(t1), ti.index("name").entityKey(t2));
+    assertBefore(ti.index("name").entityKey(t1), ti.index("name").entityKey(t3));
+    assertBefore(ti.index("name").entityKey(t2), ti.index("name").entityKey(t3));
+  }
+
+  @Test
+  public void testNumEncoding() throws Exception {
+    LevelDBTypeInfo.Index idx = newTypeInfo(CustomType1.class).indices().iterator().next();
+
+    assertBefore(idx.toKey(1), idx.toKey(2));
+    assertBefore(idx.toKey(-1), idx.toKey(2));
+    assertBefore(idx.toKey(-11), idx.toKey(2));
+    assertBefore(idx.toKey(-11), idx.toKey(-1));
+    assertBefore(idx.toKey(1), idx.toKey(11));
+    assertBefore(idx.toKey(Integer.MIN_VALUE), idx.toKey(Integer.MAX_VALUE));
+    assertEquals(LevelDBTypeInfo.INT_ENCODED_LEN + LevelDBTypeInfo.ENTRY_PREFIX.length(),
+      idx.toKey(Integer.MIN_VALUE).length());
+
+    assertBefore(idx.toKey(1L), idx.toKey(2L));
+    assertBefore(idx.toKey(-1L), idx.toKey(2L));
+    assertBefore(idx.toKey(Long.MIN_VALUE), idx.toKey(Long.MAX_VALUE));
+    assertEquals(LevelDBTypeInfo.LONG_ENCODED_LEN + LevelDBTypeInfo.ENTRY_PREFIX.length(),
+      idx.toKey(Long.MIN_VALUE).length());
+
+    assertBefore(idx.toKey((short) 1), idx.toKey((short) 2));
+    assertBefore(idx.toKey((short) -1), idx.toKey((short) 2));
+    assertBefore(idx.toKey(Short.MIN_VALUE), idx.toKey(Short.MAX_VALUE));
+    assertEquals(LevelDBTypeInfo.SHORT_ENCODED_LEN + LevelDBTypeInfo.ENTRY_PREFIX.length(),
+      idx.toKey(Short.MIN_VALUE).length());
+
+    assertBefore(idx.toKey((byte) 1), idx.toKey((byte) 2));
+    assertBefore(idx.toKey((byte) -1), idx.toKey((byte) 2));
+    assertBefore(idx.toKey(Byte.MIN_VALUE), idx.toKey(Byte.MAX_VALUE));
+    assertEquals(LevelDBTypeInfo.BYTE_ENCODED_LEN + LevelDBTypeInfo.ENTRY_PREFIX.length(),
+      idx.toKey(Byte.MIN_VALUE).length());
+
+    assertEquals(LevelDBTypeInfo.ENTRY_PREFIX + "false", idx.toKey(false));
+    assertEquals(LevelDBTypeInfo.ENTRY_PREFIX + "true", idx.toKey(true));
+  }
+
+  @Test
+  public void testArrayIndices() throws Exception {
+    LevelDBTypeInfo.Index idx = newTypeInfo(CustomType1.class).indices().iterator().next();
+
+    assertBefore(idx.toKey(new String[] { "str1" }), idx.toKey(new String[] { "str2" }));
+    assertBefore(idx.toKey(new String[] { "str1", "str2" }),
+      idx.toKey(new String[] { "str1", "str3" }));
+
+    assertBefore(idx.toKey(new int[] { 1 }), idx.toKey(new int[] { 2 }));
+    assertBefore(idx.toKey(new int[] { 1, 2 }), idx.toKey(new int[] { 1, 3 }));
+  }
+
+  private LevelDBTypeInfo newTypeInfo(Class<?> type) throws Exception {
+    return new LevelDBTypeInfo(null, type, type.getName().getBytes(UTF_8));
+  }
+
+  private void assertBefore(byte[] key1, byte[] key2) {
+    assertBefore(new String(key1, UTF_8), new String(key2, UTF_8));
+  }
+
+  private void assertBefore(String str1, String str2) {
+    assertTrue(String.format("%s < %s failed", str1, str2), str1.compareTo(str2) < 0);
+  }
+
+  public static class NoNaturalIndex {
+
+    public String id;
+
+  }
+
+  public static class DuplicateIndex {
+
+    @KVIndex("id")
+    public String id;
+
+    @KVIndex("id")
+    public String id2;
+
+  }
+
+  public static class EmptyIndexName {
+
+    @KVIndex("")
+    public String id;
+
+  }
+
+  public static class IllegalIndexName {
+
+    @KVIndex("__invalid")
+    public String id;
+
+  }
+
+  public static class IllegalIndexMethod {
+
+    @KVIndex("id")
+    public String id(boolean illegalParam) {
+      return null;
+    }
+
+  }
+
+}
diff --git a/common/kvstore/src/test/resources/log4j.properties b/common/kvstore/src/test/resources/log4j.properties
new file mode 100644
index 0000000000000..e8da774f7ca9e
--- /dev/null
+++ b/common/kvstore/src/test/resources/log4j.properties
@@ -0,0 +1,27 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the file target/unit-tests.log
+log4j.rootCategory=DEBUG, file
+log4j.appender.file=org.apache.log4j.FileAppender
+log4j.appender.file.append=true
+log4j.appender.file.file=target/unit-tests.log
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
+
+# Silence verbose logs from 3rd-party libraries.
+log4j.logger.io.netty=INFO
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index b0dd2fc187baf..fb0405b1a69c6 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -879,7 +879,7 @@ private[spark] class PythonAccumulatorV2(
     private val serverPort: Int)
   extends CollectionAccumulator[Array[Byte]] {
 
-  Utils.checkHost(serverHost, "Expected hostname")
+  Utils.checkHost(serverHost)
 
   val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
index ac09c6c497f8b..b5cb3f0a0f9dc 100644
--- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
@@ -43,7 +43,7 @@ private[deploy] object DeployMessages {
       memory: Int,
       workerWebUiUrl: String)
     extends DeployMessage {
-    Utils.checkHost(host, "Required hostname")
+    Utils.checkHost(host)
     assert (port > 0)
   }
 
@@ -131,7 +131,7 @@ private[deploy] object DeployMessages {
 
   // TODO(matei): replace hostPort with host
   case class ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) {
-    Utils.checkHostPort(hostPort, "Required hostport")
+    Utils.checkHostPort(hostPort)
   }
 
   case class ExecutorUpdated(id: Int, state: ExecutorState, message: Option[String],
@@ -183,7 +183,7 @@ private[deploy] object DeployMessages {
       completedDrivers: Array[DriverInfo],
       status: MasterState) {
 
-    Utils.checkHost(host, "Required hostname")
+    Utils.checkHost(host)
     assert (port > 0)
 
     def uri: String = "spark://" + host + ":" + port
@@ -201,7 +201,7 @@ private[deploy] object DeployMessages {
     drivers: List[DriverRunner], finishedDrivers: List[DriverRunner], masterUrl: String,
     cores: Int, memory: Int, coresUsed: Int, memoryUsed: Int, masterWebUiUrl: String) {
 
-    Utils.checkHost(host, "Required hostname")
+    Utils.checkHost(host)
     assert (port > 0)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 816bf37e39fee..e061939623cbb 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -80,7 +80,7 @@ private[deploy] class Master(
   private val waitingDrivers = new ArrayBuffer[DriverInfo]
   private var nextDriverNumber = 0
 
-  Utils.checkHost(address.host, "Expected hostname")
+  Utils.checkHost(address.host)
 
   private val masterMetricsSystem = MetricsSystem.createMetricsSystem("master", conf, securityMgr)
   private val applicationMetricsSystem = MetricsSystem.createMetricsSystem("applications", conf,
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
index c63793c16dcef..615d2533cf085 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
@@ -60,12 +60,12 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) exte
   @tailrec
   private def parse(args: List[String]): Unit = args match {
     case ("--ip" | "-i") :: value :: tail =>
-      Utils.checkHost(value, "ip no longer supported, please use hostname " + value)
+      Utils.checkHost(value)
       host = value
       parse(tail)
 
     case ("--host" | "-h") :: value :: tail =>
-      Utils.checkHost(value, "Please use hostname " + value)
+      Utils.checkHost(value)
       host = value
       parse(tail)
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
index 4e20c10fd1427..c87d6e24b78c6 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
@@ -32,7 +32,7 @@ private[spark] class WorkerInfo(
     val webUiAddress: String)
   extends Serializable {
 
-  Utils.checkHost(host, "Expected hostname")
+  Utils.checkHost(host)
   assert (port > 0)
 
   @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index 00b9d1af373db..34e3a4c020c80 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -55,7 +55,7 @@ private[deploy] class Worker(
   private val host = rpcEnv.address.host
   private val port = rpcEnv.address.port
 
-  Utils.checkHost(host, "Expected hostname")
+  Utils.checkHost(host)
   assert (port > 0)
 
   // A scheduled executor used to send messages at the specified time.
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
index 777020d4d5c84..bd07d342e04ac 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
@@ -68,12 +68,12 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
   @tailrec
   private def parse(args: List[String]): Unit = args match {
     case ("--ip" | "-i") :: value :: tail =>
-      Utils.checkHost(value, "ip no longer supported, please use hostname " + value)
+      Utils.checkHost(value)
       host = value
       parse(tail)
 
     case ("--host" | "-h") :: value :: tail =>
-      Utils.checkHost(value, "Please use hostname " + value)
+      Utils.checkHost(value)
       host = value
       parse(tail)
 
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 51b6c373c4daf..3bc47b670305b 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -71,7 +71,7 @@ private[spark] class Executor(
   private val conf = env.conf
 
   // No ip or host:port - just hostname
-  Utils.checkHost(executorHostname, "Expected executed slave to be a hostname")
+  Utils.checkHost(executorHostname)
   // must not have port specified.
   assert (0 == Utils.parseHostPort(executorHostname)._2)
 
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 3219969bcd06f..33ce30c58e1ad 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -612,12 +612,19 @@ private[spark] class BlockManager(
 
   /**
    * Return a list of locations for the given block, prioritizing the local machine since
-   * multiple block managers can share the same host.
+   * multiple block managers can share the same host, followed by hosts on the same rack.
    */
   private def getLocations(blockId: BlockId): Seq[BlockManagerId] = {
     val locs = Random.shuffle(master.getLocations(blockId))
     val (preferredLocs, otherLocs) = locs.partition { loc => blockManagerId.host == loc.host }
-    preferredLocs ++ otherLocs
+    blockManagerId.topologyInfo match {
+      case None => preferredLocs ++ otherLocs
+      case Some(_) =>
+        val (sameRackLocs, differentRackLocs) = otherLocs.partition {
+          loc => blockManagerId.topologyInfo == loc.topologyInfo
+        }
+        preferredLocs ++ sameRackLocs ++ differentRackLocs
+    }
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala
index c37a3604d28fa..2c3da0ee85e06 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala
@@ -46,7 +46,7 @@ class BlockManagerId private (
   def executorId: String = executorId_
 
   if (null != host_) {
-    Utils.checkHost(host_, "Expected hostname")
+    Utils.checkHost(host_)
     assert (port_ > 0)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
index 46a5cb2cff5a5..e5cccf39f9455 100644
--- a/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/RpcUtils.scala
@@ -28,7 +28,7 @@ private[spark] object RpcUtils {
   def makeDriverRef(name: String, conf: SparkConf, rpcEnv: RpcEnv): RpcEndpointRef = {
     val driverHost: String = conf.get("spark.driver.host", "localhost")
     val driverPort: Int = conf.getInt("spark.driver.port", 7077)
-    Utils.checkHost(driverHost, "Expected hostname")
+    Utils.checkHost(driverHost)
     rpcEnv.setupEndpointRef(RpcAddress(driverHost, driverPort), name)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 4d37db96dfc37..edfe229792323 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -937,12 +937,13 @@ private[spark] object Utils extends Logging {
     customHostname.getOrElse(InetAddresses.toUriString(localIpAddress))
   }
 
-  def checkHost(host: String, message: String = "") {
-    assert(host.indexOf(':') == -1, message)
+  def checkHost(host: String) {
+    assert(host != null && host.indexOf(':') == -1, s"Expected hostname (not IP) but got $host")
   }
 
-  def checkHostPort(hostPort: String, message: String = "") {
-    assert(hostPort.indexOf(':') != -1, message)
+  def checkHostPort(hostPort: String) {
+    assert(hostPort != null && hostPort.indexOf(':') != -1,
+      s"Expected host and port but got $hostPort")
   }
 
   // Typically, this will be of order of number of nodes in cluster
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
index 89ed031b6fcd1..f0c521b00b583 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.storage
 
+import java.util.UUID
+
 import org.apache.spark.SparkFunSuite
 
 class BlockIdSuite extends SparkFunSuite {
@@ -67,6 +69,32 @@ class BlockIdSuite extends SparkFunSuite {
     assertSame(id, BlockId(id.toString))
   }
 
+  test("shuffle data") {
+    val id = ShuffleDataBlockId(4, 5, 6)
+    assertSame(id, ShuffleDataBlockId(4, 5, 6))
+    assertDifferent(id, ShuffleDataBlockId(6, 5, 6))
+    assert(id.name === "shuffle_4_5_6.data")
+    assert(id.asRDDId === None)
+    assert(id.shuffleId === 4)
+    assert(id.mapId === 5)
+    assert(id.reduceId === 6)
+    assert(!id.isShuffle)
+    assertSame(id, BlockId(id.toString))
+  }
+
+  test("shuffle index") {
+    val id = ShuffleIndexBlockId(7, 8, 9)
+    assertSame(id, ShuffleIndexBlockId(7, 8, 9))
+    assertDifferent(id, ShuffleIndexBlockId(9, 8, 9))
+    assert(id.name === "shuffle_7_8_9.index")
+    assert(id.asRDDId === None)
+    assert(id.shuffleId === 7)
+    assert(id.mapId === 8)
+    assert(id.reduceId === 9)
+    assert(!id.isShuffle)
+    assertSame(id, BlockId(id.toString))
+  }
+
   test("broadcast") {
     val id = BroadcastBlockId(42)
     assertSame(id, BroadcastBlockId(42))
@@ -101,6 +129,30 @@ class BlockIdSuite extends SparkFunSuite {
     assertSame(id, BlockId(id.toString))
   }
 
+  test("temp local") {
+    val id = TempLocalBlockId(new UUID(5, 2))
+    assertSame(id, TempLocalBlockId(new UUID(5, 2)))
+    assertDifferent(id, TempLocalBlockId(new UUID(5, 3)))
+    assert(id.name === "temp_local_00000000-0000-0005-0000-000000000002")
+    assert(id.asRDDId === None)
+    assert(id.isBroadcast === false)
+    assert(id.id.getMostSignificantBits() === 5)
+    assert(id.id.getLeastSignificantBits() === 2)
+    assert(!id.isShuffle)
+  }
+
+  test("temp shuffle") {
+    val id = TempShuffleBlockId(new UUID(1, 2))
+    assertSame(id, TempShuffleBlockId(new UUID(1, 2)))
+    assertDifferent(id, TempShuffleBlockId(new UUID(1, 3)))
+    assert(id.name === "temp_shuffle_00000000-0000-0001-0000-000000000002")
+    assert(id.asRDDId === None)
+    assert(id.isBroadcast === false)
+    assert(id.id.getMostSignificantBits() === 1)
+    assert(id.id.getLeastSignificantBits() === 2)
+    assert(!id.isShuffle)
+  }
+
   test("test") {
     val id = TestBlockId("abc")
     assertSame(id, TestBlockId("abc"))
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
index a8b9604899838..1e7bcdb6740f6 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
@@ -496,8 +496,8 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(list2DiskGet.get.readMethod === DataReadMethod.Disk)
   }
 
-  test("optimize a location order of blocks") {
-    val localHost = Utils.localHostName()
+  test("optimize a location order of blocks without topology information") {
+    val localHost = "localhost"
     val otherHost = "otherHost"
     val bmMaster = mock(classOf[BlockManagerMaster])
     val bmId1 = BlockManagerId("id1", localHost, 1)
@@ -508,7 +508,32 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     val blockManager = makeBlockManager(128, "exec", bmMaster)
     val getLocations = PrivateMethod[Seq[BlockManagerId]]('getLocations)
     val locations = blockManager invokePrivate getLocations(BroadcastBlockId(0))
-    assert(locations.map(_.host).toSet === Set(localHost, localHost, otherHost))
+    assert(locations.map(_.host) === Seq(localHost, localHost, otherHost))
+  }
+
+  test("optimize a location order of blocks with topology information") {
+    val localHost = "localhost"
+    val otherHost = "otherHost"
+    val localRack = "localRack"
+    val otherRack = "otherRack"
+
+    val bmMaster = mock(classOf[BlockManagerMaster])
+    val bmId1 = BlockManagerId("id1", localHost, 1, Some(localRack))
+    val bmId2 = BlockManagerId("id2", localHost, 2, Some(localRack))
+    val bmId3 = BlockManagerId("id3", otherHost, 3, Some(otherRack))
+    val bmId4 = BlockManagerId("id4", otherHost, 4, Some(otherRack))
+    val bmId5 = BlockManagerId("id5", otherHost, 5, Some(localRack))
+    when(bmMaster.getLocations(mc.any[BlockId]))
+      .thenReturn(Seq(bmId1, bmId2, bmId5, bmId3, bmId4))
+
+    val blockManager = makeBlockManager(128, "exec", bmMaster)
+    blockManager.blockManagerId =
+      BlockManagerId(SparkContext.DRIVER_IDENTIFIER, localHost, 1, Some(localRack))
+    val getLocations = PrivateMethod[Seq[BlockManagerId]]('getLocations)
+    val locations = blockManager invokePrivate getLocations(BroadcastBlockId(0))
+    assert(locations.map(_.host) === Seq(localHost, localHost, otherHost, otherHost, otherHost))
+    assert(locations.flatMap(_.topologyInfo)
+      === Seq(localRack, localRack, localRack, otherRack, otherRack))
   }
 
   test("SPARK-9591: getRemoteBytes from another location when Exception throw") {
diff --git a/docs/cloud-integration.md b/docs/cloud-integration.md
new file mode 100644
index 0000000000000..751a192da4ffd
--- /dev/null
+++ b/docs/cloud-integration.md
@@ -0,0 +1,200 @@
+---
+layout: global
+displayTitle: Integration with Cloud Infrastructures
+title: Integration with Cloud Infrastructures
+description: Introduction to cloud storage support in Apache Spark SPARK_VERSION_SHORT
+---
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+## Introduction
+
+
+All major cloud providers offer persistent data storage in *object stores*.
+These are not classic "POSIX" file systems.
+In order to store hundreds of petabytes of data without any single points of failure,
+object stores replace the classic filesystem directory tree
+with a simpler model of `object-name => data`. To enable remote access, operations
+on objects are usually offered as (slow) HTTP REST operations.
+
+Spark can read and write data in object stores through filesystem connectors implemented
+in Hadoop or provided by the infrastructure suppliers themselves.
+These connectors make the object stores look *almost* like filesystems, with directories and files
+and the classic operations on them such as list, delete and rename.
+
+
+### Important: Cloud Object Stores are Not Real Filesystems
+
+While the stores appear to be filesystems, underneath
+they are still object stores, [and the difference is significant](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/filesystem/introduction.html)
+
+They cannot be used as a direct replacement for a cluster filesystem such as HDFS
+*except where this is explicitly stated*.
+
+Key differences are:
+
+* Changes to stored objects may not be immediately visible, both in directory listings and actual data access.
+* The means by which directories are emulated may make working with them slow.
+* Rename operations may be very slow and, on failure, leave the store in an unknown state.
+* Seeking within a file may require new HTTP calls, hurting performance. 
+
+How does this affect Spark? 
+
+1. Reading and writing data can be significantly slower than working with a normal filesystem.
+1. Some directory structures may be very inefficient to scan during query split calculation.
+1. The output of work may not be immediately visible to a follow-on query.
+1. The rename-based algorithm by which Spark normally commits work when saving an RDD, DataFrame or Dataset
+ is potentially both slow and unreliable.
+
+For these reasons, it is not always safe to use an object store as a direct destination of queries, or as
+an intermediate store in a chain of queries. Consult the documentation of the object store and its
+connector to determine which uses are considered safe.
+
+In particular: *without some form of consistency layer, Amazon S3 cannot
+be safely used as the direct destination of work with the normal rename-based committer.*
+
+### Installation
+
+With the relevant libraries on the classpath and Spark configured with valid credentials,
+objects can be can be read or written by using their URLs as the path to data.
+For example `sparkContext.textFile("s3a://landsat-pds/scene_list.gz")` will create
+an RDD of the file `scene_list.gz` stored in S3, using the s3a connector.
+
+To add the relevant libraries to an application's classpath, include the `hadoop-cloud` 
+module and its dependencies.
+
+In Maven, add the following to the `pom.xml` file, assuming `spark.version`
+is set to the chosen version of Spark:
+
+{% highlight xml %}
+<dependencyManagement>
+  ...
+  <dependency>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>hadoop-cloud_2.11</artifactId>
+    <version>${spark.version}</version>
+  </dependency>
+  ...
+</dependencyManagement>
+{% endhighlight %}
+
+Commercial products based on Apache Spark generally directly set up the classpath
+for talking to cloud infrastructures, in which case this module may not be needed.
+
+### Authenticating
+
+Spark jobs must authenticate with the object stores to access data within them.
+
+1. When Spark is running in a cloud infrastructure, the credentials are usually automatically set up.
+1. `spark-submit` reads the `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`
+and `AWS_SESSION_TOKEN` environment variables and sets the associated authentication options
+for the `s3n` and `s3a` connectors to Amazon S3.
+1. In a Hadoop cluster, settings may be set in the `core-site.xml` file.
+1. Authentication details may be manually added to the Spark configuration in `spark-default.conf`
+1. Alternatively, they can be programmatically set in the `SparkConf` instance used to configure 
+the application's `SparkContext`.
+
+*Important: never check authentication secrets into source code repositories,
+especially public ones*
+
+Consult [the Hadoop documentation](https://hadoop.apache.org/docs/current/) for the relevant
+configuration and security options.
+
+## Configuring
+
+Each cloud connector has its own set of configuration parameters, again, 
+consult the relevant documentation.
+
+### Recommended settings for writing to object stores
+
+For object stores whose consistency model means that rename-based commits are safe
+use the `FileOutputCommitter` v2 algorithm for performance:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2
+```
+
+This does less renaming at the end of a job than the "version 1" algorithm.
+As it still uses `rename()` to commit files, it is unsafe to use
+when the object store does not have consistent metadata/listings.
+
+The committer can also be set to ignore failures when cleaning up temporary
+files; this reduces the risk that a transient network problem is escalated into a 
+job failure:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored true
+```
+
+As storing temporary files can run up charges; delete
+directories called `"_temporary"` on a regular basis to avoid this.
+
+### Parquet I/O Settings
+
+For optimal performance when working with Parquet data use the following settings:
+
+```
+spark.hadoop.parquet.enable.summary-metadata false
+spark.sql.parquet.mergeSchema false
+spark.sql.parquet.filterPushdown true
+spark.sql.hive.metastorePartitionPruning true
+```
+
+These minimise the amount of data read during queries.
+
+### ORC I/O Settings
+
+For best performance when working with ORC data, use these settings:
+
+```
+spark.sql.orc.filterPushdown true
+spark.sql.orc.splits.include.file.footer true
+spark.sql.orc.cache.stripe.details.size 10000
+spark.sql.hive.metastorePartitionPruning true
+```
+
+Again, these minimise the amount of data read during queries.
+
+## Spark Streaming and Object Storage
+
+Spark Streaming can monitor files added to object stores, by
+creating a `FileInputDStream` to monitor a path in the store through a call to
+`StreamingContext.textFileStream()`.
+
+1. The time to scan for new files is proportional to the number of files
+under the path, not the number of *new* files, so it can become a slow operation.
+The size of the window needs to be set to handle this.
+
+1. Files only appear in an object store once they are completely written; there
+is no need for a worklow of write-then-rename to ensure that files aren't picked up
+while they are still being written. Applications can write straight to the monitored directory.
+
+1. Streams should only be checkpointed to an store implementing a fast and
+atomic `rename()` operation Otherwise the checkpointing may be slow and potentially unreliable.
+
+## Further Reading
+
+Here is the documentation on the standard connectors both from Apache and the cloud providers.
+
+* [OpenStack Swift](https://hadoop.apache.org/docs/current/hadoop-openstack/index.html). Hadoop 2.6+
+* [Azure Blob Storage](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html). Since Hadoop 2.7
+* [Azure Data Lake](https://hadoop.apache.org/docs/current/hadoop-azure-datalake/index.html). Since Hadoop 2.8
+* [Amazon S3 via S3A and S3N](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html). Hadoop 2.6+
+* [Amazon EMR File System (EMRFS)](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-fs.html). From Amazon
+* [Google Cloud Storage Connector for Spark and Hadoop](https://cloud.google.com/hadoop/google-cloud-storage-connector). From Google
+
+
diff --git a/docs/index.md b/docs/index.md
index ad4f24ff1a5d1..960b968454d0e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -126,6 +126,7 @@ options for deployment:
 * [Security](security.html): Spark security support
 * [Hardware Provisioning](hardware-provisioning.html): recommendations for cluster hardware
 * Integration with other storage systems:
+  * [Cloud Infrastructures](cloud-integration.html)
   * [OpenStack Swift](storage-openstack-swift.html)
 * [Building Spark](building-spark.html): build Spark using the Maven system
 * [Contributing to Spark](http://spark.apache.org/contributing.html)
diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md
index e2bf2d7ca77ca..52e59df9990e9 100644
--- a/docs/rdd-programming-guide.md
+++ b/docs/rdd-programming-guide.md
@@ -323,7 +323,7 @@ One important parameter for parallel collections is the number of *partitions* t
 
 Spark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase, [Amazon S3](http://wiki.apache.org/hadoop/AmazonS3), etc. Spark supports text files, [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), and any other Hadoop [InputFormat](http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/InputFormat.html).
 
-Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3n://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
+Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3a://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
 
 {% highlight scala %}
 scala> val distFile = sc.textFile("data.txt")
@@ -356,7 +356,7 @@ Apart from text files, Spark's Scala API also supports several other data format
 
 Spark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase, [Amazon S3](http://wiki.apache.org/hadoop/AmazonS3), etc. Spark supports text files, [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), and any other Hadoop [InputFormat](http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/InputFormat.html).
 
-Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3n://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
+Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3a://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
 
 {% highlight java %}
 JavaRDD<String> distFile = sc.textFile("data.txt");
@@ -388,7 +388,7 @@ Apart from text files, Spark's Java API also supports several other data formats
 
 PySpark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase, [Amazon S3](http://wiki.apache.org/hadoop/AmazonS3), etc. Spark supports text files, [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), and any other Hadoop [InputFormat](http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/InputFormat.html).
 
-Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3n://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
+Text file RDDs can be created using `SparkContext`'s `textFile` method. This method takes an URI for the file (either a local path on the machine, or a `hdfs://`, `s3a://`, etc URI) and reads it as a collection of lines. Here is an example invocation:
 
 {% highlight python %}
 >>> distFile = sc.textFile("data.txt")
diff --git a/docs/storage-openstack-swift.md b/docs/storage-openstack-swift.md
index c39ef1ce59e1c..f4bb2353e3c49 100644
--- a/docs/storage-openstack-swift.md
+++ b/docs/storage-openstack-swift.md
@@ -8,7 +8,8 @@ same URI formats as in Hadoop. You can specify a path in Swift as input through
 URI of the form <code>swift://container.PROVIDER/path</code>. You will also need to set your 
 Swift security credentials, through <code>core-site.xml</code> or via
 <code>SparkContext.hadoopConfiguration</code>.
-Current Swift driver requires Swift to use Keystone authentication method.
+The current Swift driver requires Swift to use the Keystone authentication method, or
+its Rackspace-specific predecessor.
 
 # Configuring Swift for Better Data Locality
 
@@ -19,41 +20,30 @@ Although not mandatory, it is recommended to configure the proxy server of Swift
 
 # Dependencies
 
-The Spark application should include <code>hadoop-openstack</code> dependency.
+The Spark application should include <code>hadoop-openstack</code> dependency, which can
+be done by including the `hadoop-cloud` module for the specific version of spark used.
 For example, for Maven support, add the following to the <code>pom.xml</code> file:
 
 {% highlight xml %}
 <dependencyManagement>
   ...
   <dependency>
-    <groupId>org.apache.hadoop</groupId>
-    <artifactId>hadoop-openstack</artifactId>
-    <version>2.3.0</version>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>hadoop-cloud_2.11</artifactId>
+    <version>${spark.version}</version>
   </dependency>
   ...
 </dependencyManagement>
 {% endhighlight %}
 
-
 # Configuration Parameters
 
 Create <code>core-site.xml</code> and place it inside Spark's <code>conf</code> directory.
-There are two main categories of parameters that should to be configured: declaration of the
-Swift driver and the parameters that are required by Keystone. 
+The main category of parameters that should be configured are the authentication parameters
+required by Keystone.
 
-Configuration of Hadoop to use Swift File system achieved via 
-
-<table class="table">
-<tr><th>Property Name</th><th>Value</th></tr>
-<tr>
-  <td>fs.swift.impl</td>
-  <td>org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem</td>
-</tr>
-</table>
-
-Additional parameters required by Keystone (v2.0) and should be provided to the Swift driver. Those 
-parameters will be used to perform authentication in Keystone to access Swift. The following table 
-contains a list of Keystone mandatory parameters. <code>PROVIDER</code> can be any name.
+The following table  contains a list of Keystone mandatory parameters. <code>PROVIDER</code> can be
+any (alphanumeric) name.
 
 <table class="table">
 <tr><th>Property Name</th><th>Meaning</th><th>Required</th></tr>
@@ -94,7 +84,7 @@ contains a list of Keystone mandatory parameters. <code>PROVIDER</code> can be a
 </tr>
 <tr>
   <td><code>fs.swift.service.PROVIDER.public</code></td>
-  <td>Indicates if all URLs are public</td>
+  <td>Indicates whether to use the public (off cloud) or private (in cloud; no transfer fees) endpoints</td>
   <td>Mandatory</td>
 </tr>
 </table>
@@ -104,10 +94,6 @@ defined for tenant <code>test</code>. Then <code>core-site.xml</code> should inc
 
 {% highlight xml %}
 <configuration>
-  <property>
-    <name>fs.swift.impl</name>
-    <value>org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem</value>
-  </property>
   <property>
     <name>fs.swift.service.SparkTest.auth.url</name>
     <value>http://127.0.0.1:5000/v2.0/tokens</value>
diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
index 1bb89a361ca75..f7b1ec34ced76 100644
--- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
+++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
@@ -70,6 +70,12 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo
       """.stripMargin.replaceAll("\n", " ")).executeUpdate()
     conn.commit()
 
+    conn.prepareStatement("CREATE TABLE ts_with_timezone (id NUMBER(10), t TIMESTAMP WITH TIME ZONE)")
+        .executeUpdate()
+    conn.prepareStatement("INSERT INTO ts_with_timezone VALUES (1, to_timestamp_tz('1999-12-01 11:00:00 UTC','YYYY-MM-DD HH:MI:SS TZR'))")
+        .executeUpdate()
+    conn.commit()
+
     sql(
       s"""
         |CREATE TEMPORARY VIEW datetime
@@ -185,4 +191,11 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo
     sql("INSERT INTO TABLE datetime1 SELECT * FROM datetime where id = 1")
     checkRow(sql("SELECT * FROM datetime1 where id = 1").head())
   }
+
+  test("SPARK-20557: column type TIMESTAMP with TIME ZONE should be recognized") {
+    val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties)
+    val rows = dfRead.collect()
+    val types = rows(0).toSeq.map(x => x.getClass.toString)
+    assert(types(1).equals("class java.sql.Timestamp"))
+  }
 }
diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala
index a1a065a443e67..eb3c458360e7b 100644
--- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala
+++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala
@@ -55,6 +55,13 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
       + "null, null, null, null, null, "
       + "null, null, null, null, null, null, null)"
     ).executeUpdate()
+
+    conn.prepareStatement("CREATE TABLE ts_with_timezone " +
+      "(id integer, tstz TIMESTAMP WITH TIME ZONE, ttz TIME WITH TIME ZONE)")
+      .executeUpdate()
+    conn.prepareStatement("INSERT INTO ts_with_timezone VALUES " +
+      "(1, TIMESTAMP WITH TIME ZONE '2016-08-12 10:22:31.949271-07', TIME WITH TIME ZONE '17:22:31.949271+00')")
+      .executeUpdate()
   }
 
   test("Type mapping for various types") {
@@ -126,4 +133,12 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
     assert(schema(0).dataType == FloatType)
     assert(schema(1).dataType == ShortType)
   }
+
+  test("SPARK-20557: column type TIMESTAMP with TIME ZONE and TIME with TIME ZONE should be recognized") {
+    val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties)
+    val rows = dfRead.collect()
+    val types = rows(0).toSeq.map(x => x.getClass.toString)
+    assert(types(1).equals("class java.sql.Timestamp"))
+    assert(types(2).equals("class java.sql.Timestamp"))
+  }
 }
diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
index 2ce2760b7f463..f86b8f586d2a0 100644
--- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
+++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala
@@ -292,6 +292,7 @@ class KafkaTestUtils(withBrokerProps: Map[String, Object] = Map.empty) extends L
     props.put("log.flush.interval.messages", "1")
     props.put("replica.socket.timeout.ms", "1500")
     props.put("delete.topic.enable", "true")
+    props.put("offsets.topic.num.partitions", "1")
     props.putAll(withBrokerProps.asJava)
     props
   }
diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
new file mode 100644
index 0000000000000..aa36dd4774d86
--- /dev/null
+++ b/hadoop-cloud/pom.xml
@@ -0,0 +1,185 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>spark-parent_2.11</artifactId>
+    <version>2.3.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>spark-hadoop-cloud_2.11</artifactId>
+  <packaging>jar</packaging>
+  <name>Spark Project Cloud Integration through Hadoop Libraries</name>
+  <description>
+    Contains support for cloud infrastructures, specifically the Hadoop JARs and
+    transitive dependencies needed to interact with the infrastructures,
+    making everything consistent with Spark's other dependencies.
+  </description>
+  <properties>
+    <sbt.project.name>hadoop-cloud</sbt.project.name>
+  </properties>
+
+  <dependencies>
+    <!--
+      the AWS module pulls in jackson; its transitive dependencies can create
+      intra-jackson-module version problems.
+      -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-aws</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>${hadoop.deps.scope}</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-mapper-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-core-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-databind</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-annotations</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-openstack</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>${hadoop.deps.scope}</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mockito</groupId>
+          <artifactId>mockito-all</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <!--
+    Add joda time to ensure that anything downstream which doesn't pull in spark-hive
+    gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs
+    -->
+    <dependency>
+      <groupId>joda-time</groupId>
+      <artifactId>joda-time</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
+    <!-- explicitly declare the jackson artifacts desired -->
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.dataformat</groupId>
+      <artifactId>jackson-dataformat-cbor</artifactId>
+      <version>${fasterxml.jackson.version}</version>
+    </dependency>
+    <!--Explicit declaration to force in Spark version into transitive dependencies -->
+    <dependency>
+      <groupId>org.apache.httpcomponents</groupId>
+      <artifactId>httpclient</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
+    <!--Explicit declaration to force in Spark version into transitive dependencies -->
+    <dependency>
+      <groupId>org.apache.httpcomponents</groupId>
+      <artifactId>httpcore</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
+  </dependencies>
+
+  <profiles>
+
+    <profile>
+      <id>hadoop-2.7</id>
+      <!-- Hadoop Azure is a new Jar with -->
+      <dependencies>
+
+        <!--
+        Hadoop WASB client only arrived in Hadoop 2.7
+        -->
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-azure</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-mapper-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-core</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.google.guava</groupId>
+              <artifactId>guava</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+      </dependencies>
+    </profile>
+
+  </profiles>
+
+</project>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index a20ef72446661..1562bf1beb7e1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -774,6 +774,28 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
   /**
    * :: DeveloperApi ::
    * Implementation of the ALS algorithm.
+   *
+   * This implementation of the ALS factorization algorithm partitions the two sets of factors among
+   * Spark workers so as to reduce network communication by only sending one copy of each factor
+   * vector to each Spark worker on each iteration, and only if needed.  This is achieved by
+   * precomputing some information about the ratings matrix to determine which users require which
+   * item factors and vice versa.  See the Scaladoc for `InBlock` for a detailed explanation of how
+   * the precomputation is done.
+   *
+   * In addition, since each iteration of calculating the factor matrices depends on the known
+   * ratings, which are spread across Spark partitions, a naive implementation would incur
+   * significant network communication overhead between Spark workers, as the ratings RDD would be
+   * repeatedly shuffled during each iteration.  This implementation reduces that overhead by
+   * performing the shuffling operation up front, precomputing each partition's ratings dependencies
+   * and duplicating those values to the appropriate workers before starting iterations to solve for
+   * the factor matrices.  See the Scaladoc for `OutBlock` for a detailed explanation of how the
+   * precomputation is done.
+   *
+   * Note that the term "rating block" is a bit of a misnomer, as the ratings are not partitioned by
+   * contiguous blocks from the ratings matrix but by a hash function on the rating's location in
+   * the matrix.  If it helps you to visualize the partitions, it is easier to think of the term
+   * "block" as referring to a subset of an RDD containing the ratings rather than a contiguous
+   * submatrix of the ratings matrix.
    */
   @DeveloperApi
   def train[ID: ClassTag]( // scalastyle:ignore
@@ -791,32 +813,43 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
       checkpointInterval: Int = 10,
       seed: Long = 0L)(
       implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+
     require(!ratings.isEmpty(), s"No ratings available from $ratings")
     require(intermediateRDDStorageLevel != StorageLevel.NONE,
       "ALS is not designed to run without persisting intermediate RDDs.")
+
     val sc = ratings.sparkContext
+
+    // Precompute the rating dependencies of each partition
     val userPart = new ALSPartitioner(numUserBlocks)
     val itemPart = new ALSPartitioner(numItemBlocks)
-    val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions)
-    val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions)
-    val solver = if (nonnegative) new NNLSSolver else new CholeskySolver
     val blockRatings = partitionRatings(ratings, userPart, itemPart)
       .persist(intermediateRDDStorageLevel)
     val (userInBlocks, userOutBlocks) =
       makeBlocks("user", blockRatings, userPart, itemPart, intermediateRDDStorageLevel)
-    // materialize blockRatings and user blocks
-    userOutBlocks.count()
+    userOutBlocks.count()    // materialize blockRatings and user blocks
     val swappedBlockRatings = blockRatings.map {
       case ((userBlockId, itemBlockId), RatingBlock(userIds, itemIds, localRatings)) =>
         ((itemBlockId, userBlockId), RatingBlock(itemIds, userIds, localRatings))
     }
     val (itemInBlocks, itemOutBlocks) =
       makeBlocks("item", swappedBlockRatings, itemPart, userPart, intermediateRDDStorageLevel)
-    // materialize item blocks
-    itemOutBlocks.count()
+    itemOutBlocks.count()    // materialize item blocks
+
+    // Encoders for storing each user/item's partition ID and index within its partition using a
+    // single integer; used as an optimization
+    val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions)
+    val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions)
+
+    // These are the user and item factor matrices that, once trained, are multiplied together to
+    // estimate the rating matrix.  The two matrices are stored in RDDs, partitioned by column such
+    // that each factor column resides on the same Spark worker as its corresponding user or item.
     val seedGen = new XORShiftRandom(seed)
     var userFactors = initialize(userInBlocks, rank, seedGen.nextLong())
     var itemFactors = initialize(itemInBlocks, rank, seedGen.nextLong())
+
+    val solver = if (nonnegative) new NNLSSolver else new CholeskySolver
+
     var previousCheckpointFile: Option[String] = None
     val shouldCheckpoint: Int => Boolean = (iter) =>
       sc.checkpointDir.isDefined && checkpointInterval != -1 && (iter % checkpointInterval == 0)
@@ -830,6 +863,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
             logWarning(s"Cannot delete checkpoint file $file:", e)
         }
       }
+
     if (implicitPrefs) {
       for (iter <- 1 to maxIter) {
         userFactors.setName(s"userFactors-$iter").persist(intermediateRDDStorageLevel)
@@ -910,26 +944,154 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
   private type FactorBlock = Array[Array[Float]]
 
   /**
-   * Out-link block that stores, for each dst (item/user) block, which src (user/item) factors to
-   * send. For example, outLinkBlock(0) contains the local indices (not the original src IDs) of the
-   * src factors in this block to send to dst block 0.
+   * A mapping of the columns of the items factor matrix that are needed when calculating each row
+   * of the users factor matrix, and vice versa.
+   *
+   * Specifically, when calculating a user factor vector, since only those columns of the items
+   * factor matrix that correspond to the items that that user has rated are needed, we can avoid
+   * having to repeatedly copy the entire items factor matrix to each worker later in the algorithm
+   * by precomputing these dependencies for all users, storing them in an RDD of `OutBlock`s.  The
+   * items' dependencies on the columns of the users factor matrix is computed similarly.
+   *
+   * =Example=
+   *
+   * Using the example provided in the `InBlock` Scaladoc, `userOutBlocks` would look like the
+   * following:
+   *
+   * {{{
+   *     userOutBlocks.collect() == Seq(
+   *       0 -> Array(Array(0, 1), Array(0, 1)),
+   *       1 -> Array(Array(0), Array(0))
+   *     )
+   * }}}
+   *
+   * Each value in this map-like sequence is of type `Array[Array[Int]]`.  The values in the
+   * inner array are the ranks of the sorted user IDs in that partition; so in the example above,
+   * `Array(0, 1)` in partition 0 refers to user IDs 0 and 6, since when all unique user IDs in
+   * partition 0 are sorted, 0 is the first ID and 6 is the second.  The position of each inner
+   * array in its enclosing outer array denotes the partition number to which item IDs map; in the
+   * example, the first `Array(0, 1)` is in position 0 of its outer array, denoting item IDs that
+   * map to partition 0.
+   *
+   * In summary, the data structure encodes the following information:
+   *
+   *   *  There are ratings with user IDs 0 and 6 (encoded in `Array(0, 1)`, where 0 and 1 are the
+   *   indices of the user IDs 0 and 6 on partition 0) whose item IDs map to partitions 0 and 1
+   *   (represented by the fact that `Array(0, 1)` appears in both the 0th and 1st positions).
+   *
+   *   *  There are ratings with user ID 3 (encoded in `Array(0)`, where 0 is the index of the user
+   *   ID 3 on partition 1) whose item IDs map to partitions 0 and 1 (represented by the fact that
+   *   `Array(0)` appears in both the 0th and 1st positions).
    */
   private type OutBlock = Array[Array[Int]]
 
   /**
-   * In-link block for computing src (user/item) factors. This includes the original src IDs
-   * of the elements within this block as well as encoded dst (item/user) indices and corresponding
-   * ratings. The dst indices are in the form of (blockId, localIndex), which are not the original
-   * dst IDs. To compute src factors, we expect receiving dst factors that match the dst indices.
-   * For example, if we have an in-link record
+   * In-link block for computing user and item factor matrices.
+   *
+   * The ALS algorithm partitions the columns of the users factor matrix evenly among Spark workers.
+   * Since each column of the factor matrix is calculated using the known ratings of the correspond-
+   * ing user, and since the ratings don't change across iterations, the ALS algorithm preshuffles
+   * the ratings to the appropriate partitions, storing them in `InBlock` objects.
+   *
+   * The ratings shuffled by item ID are computed similarly and also stored in `InBlock` objects.
+   * Note that this means every rating is stored twice, once as shuffled by user ID and once by item
+   * ID.  This is a necessary tradeoff, since in general a rating will not be on the same worker
+   * when partitioned by user as by item.
+   *
+   * =Example=
+   *
+   * Say we have a small collection of eight items to offer the seven users in our application.  We
+   * have some known ratings given by the users, as seen in the matrix below:
+   *
+   * {{{
+   *                       Items
+   *            0   1   2   3   4   5   6   7
+   *          +---+---+---+---+---+---+---+---+
+   *        0 |   |0.1|   |   |0.4|   |   |0.7|
+   *          +---+---+---+---+---+---+---+---+
+   *        1 |   |   |   |   |   |   |   |   |
+   *          +---+---+---+---+---+---+---+---+
+   *     U  2 |   |   |   |   |   |   |   |   |
+   *     s    +---+---+---+---+---+---+---+---+
+   *     e  3 |   |3.1|   |   |3.4|   |   |3.7|
+   *     r    +---+---+---+---+---+---+---+---+
+   *     s  4 |   |   |   |   |   |   |   |   |
+   *          +---+---+---+---+---+---+---+---+
+   *        5 |   |   |   |   |   |   |   |   |
+   *          +---+---+---+---+---+---+---+---+
+   *        6 |   |6.1|   |   |6.4|   |   |6.7|
+   *          +---+---+---+---+---+---+---+---+
+   * }}}
+   *
+   * The ratings are represented as an RDD, passed to the `partitionRatings` method as the `ratings`
+   * parameter:
+   *
+   * {{{
+   *     ratings.collect() == Seq(
+   *       Rating(0, 1, 0.1f),
+   *       Rating(0, 4, 0.4f),
+   *       Rating(0, 7, 0.7f),
+   *       Rating(3, 1, 3.1f),
+   *       Rating(3, 4, 3.4f),
+   *       Rating(3, 7, 3.7f),
+   *       Rating(6, 1, 6.1f),
+   *       Rating(6, 4, 6.4f),
+   *       Rating(6, 7, 6.7f)
+   *     )
+   * }}}
    *
-   * {srcId: 0, dstBlockId: 2, dstLocalIndex: 3, rating: 5.0},
+   * Say that we are using two partitions to calculate each factor matrix:
    *
-   * and assume that the dst factors are stored as dstFactors: Map[Int, Array[Array[Float]]], which
-   * is a blockId to dst factors map, the corresponding dst factor of the record is dstFactor(2)(3).
+   * {{{
+   *     val userPart = new ALSPartitioner(2)
+   *     val itemPart = new ALSPartitioner(2)
+   *     val blockRatings = partitionRatings(ratings, userPart, itemPart)
+   * }}}
    *
-   * We use a CSC-like (compressed sparse column) format to store the in-link information. So we can
-   * compute src factors one after another using only one normal equation instance.
+   * Ratings are mapped to partitions using the user/item IDs modulo the number of partitions.  With
+   * two partitions, ratings with even-valued user IDs are shuffled to partition 0 while those with
+   * odd-valued user IDs are shuffled to partition 1:
+   *
+   * {{{
+   *     userInBlocks.collect() == Seq(
+   *       0 -> Seq(
+   *              // Internally, the class stores the ratings in a more optimized format than
+   *              // a sequence of `Rating`s, but for clarity we show it as such here.
+   *              Rating(0, 1, 0.1f),
+   *              Rating(0, 4, 0.4f),
+   *              Rating(0, 7, 0.7f),
+   *              Rating(6, 1, 6.1f),
+   *              Rating(6, 4, 6.4f),
+   *              Rating(6, 7, 6.7f)
+   *            ),
+   *       1 -> Seq(
+   *              Rating(3, 1, 3.1f),
+   *              Rating(3, 4, 3.4f),
+   *              Rating(3, 7, 3.7f)
+   *            )
+   *     )
+   * }}}
+   *
+   * Similarly, ratings with even-valued item IDs are shuffled to partition 0 while those with
+   * odd-valued item IDs are shuffled to partition 1:
+   *
+   * {{{
+   *     itemInBlocks.collect() == Seq(
+   *       0 -> Seq(
+   *              Rating(0, 4, 0.4f),
+   *              Rating(3, 4, 3.4f),
+   *              Rating(6, 4, 6.4f)
+   *            ),
+   *       1 -> Seq(
+   *              Rating(0, 1, 0.1f),
+   *              Rating(0, 7, 0.7f),
+   *              Rating(3, 1, 3.1f),
+   *              Rating(3, 7, 3.7f),
+   *              Rating(6, 1, 6.1f),
+   *              Rating(6, 7, 6.7f)
+   *            )
+   *     )
+   * }}}
    *
    * @param srcIds src ids (ordered)
    * @param dstPtrs dst pointers. Elements in range [dstPtrs(i), dstPtrs(i+1)) of dst indices and
@@ -1026,7 +1188,24 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
   }
 
   /**
-   * Partitions raw ratings into blocks.
+   * Groups an RDD of [[Rating]]s by the user partition and item partition to which each `Rating`
+   * maps according to the given partitioners.  The returned pair RDD holds the ratings, encoded in
+   * a memory-efficient format but otherwise unchanged, keyed by the (user partition ID, item
+   * partition ID) pair.
+   *
+   * Performance note: This is an expensive operation that performs an RDD shuffle.
+   *
+   * Implementation note: This implementation produces the same result as the following but
+   * generates fewer intermediate objects:
+   *
+   * {{{
+   *     ratings.map { r =>
+   *       ((srcPart.getPartition(r.user), dstPart.getPartition(r.item)), r)
+   *     }.aggregateByKey(new RatingBlockBuilder)(
+   *         seqOp = (b, r) => b.add(r),
+   *         combOp = (b0, b1) => b0.merge(b1.build()))
+   *       .mapValues(_.build())
+   * }}}
    *
    * @param ratings raw ratings
    * @param srcPart partitioner for src IDs
@@ -1037,17 +1216,6 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
       ratings: RDD[Rating[ID]],
       srcPart: Partitioner,
       dstPart: Partitioner): RDD[((Int, Int), RatingBlock[ID])] = {
-
-     /* The implementation produces the same result as the following but generates less objects.
-
-     ratings.map { r =>
-       ((srcPart.getPartition(r.user), dstPart.getPartition(r.item)), r)
-     }.aggregateByKey(new RatingBlockBuilder)(
-         seqOp = (b, r) => b.add(r),
-         combOp = (b0, b1) => b0.merge(b1.build()))
-       .mapValues(_.build())
-     */
-
     val numPartitions = srcPart.numPartitions * dstPart.numPartitions
     ratings.mapPartitions { iter =>
       val builders = Array.fill(numPartitions)(new RatingBlockBuilder[ID])
@@ -1135,8 +1303,8 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
     def length: Int = srcIds.length
 
     /**
-     * Compresses the block into an [[InBlock]]. The algorithm is the same as converting a
-     * sparse matrix from coordinate list (COO) format into compressed sparse column (CSC) format.
+     * Compresses the block into an `InBlock`. The algorithm is the same as converting a sparse
+     * matrix from coordinate list (COO) format into compressed sparse column (CSC) format.
      * Sorting is done using Spark's built-in Timsort to avoid generating too many objects.
      */
     def compress(): InBlock[ID] = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index 7574af3d77ea8..9d31e792633cd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -671,58 +671,45 @@ class ALSSuite
       .setItemCol("item")
   }
 
-  test("recommendForAllUsers with k < num_items") {
-    val topItems = getALSModel.recommendForAllUsers(2)
-    assert(topItems.count() == 3)
-    assert(topItems.columns.contains("user"))
-
-    val expected = Map(
-      0 -> Array((3, 54f), (4, 44f)),
-      1 -> Array((3, 39f), (5, 33f)),
-      2 -> Array((3, 51f), (5, 45f))
-    )
-    checkRecommendations(topItems, expected, "item")
-  }
-
-  test("recommendForAllUsers with k = num_items") {
-    val topItems = getALSModel.recommendForAllUsers(4)
-    assert(topItems.count() == 3)
-    assert(topItems.columns.contains("user"))
-
+  test("recommendForAllUsers with k <, = and > num_items") {
+    val model = getALSModel
+    val numUsers = model.userFactors.count
+    val numItems = model.itemFactors.count
     val expected = Map(
       0 -> Array((3, 54f), (4, 44f), (5, 42f), (6, 28f)),
       1 -> Array((3, 39f), (5, 33f), (4, 26f), (6, 16f)),
       2 -> Array((3, 51f), (5, 45f), (4, 30f), (6, 18f))
     )
-    checkRecommendations(topItems, expected, "item")
-  }
 
-  test("recommendForAllItems with k < num_users") {
-    val topUsers = getALSModel.recommendForAllItems(2)
-    assert(topUsers.count() == 4)
-    assert(topUsers.columns.contains("item"))
-
-    val expected = Map(
-      3 -> Array((0, 54f), (2, 51f)),
-      4 -> Array((0, 44f), (2, 30f)),
-      5 -> Array((2, 45f), (0, 42f)),
-      6 -> Array((0, 28f), (2, 18f))
-    )
-    checkRecommendations(topUsers, expected, "user")
+    Seq(2, 4, 6).foreach { k =>
+      val n = math.min(k, numItems).toInt
+      val expectedUpToN = expected.mapValues(_.slice(0, n))
+      val topItems = model.recommendForAllUsers(k)
+      assert(topItems.count() == numUsers)
+      assert(topItems.columns.contains("user"))
+      checkRecommendations(topItems, expectedUpToN, "item")
+    }
   }
 
-  test("recommendForAllItems with k = num_users") {
-    val topUsers = getALSModel.recommendForAllItems(3)
-    assert(topUsers.count() == 4)
-    assert(topUsers.columns.contains("item"))
-
+  test("recommendForAllItems with k <, = and > num_users") {
+    val model = getALSModel
+    val numUsers = model.userFactors.count
+    val numItems = model.itemFactors.count
     val expected = Map(
       3 -> Array((0, 54f), (2, 51f), (1, 39f)),
       4 -> Array((0, 44f), (2, 30f), (1, 26f)),
       5 -> Array((2, 45f), (0, 42f), (1, 33f)),
       6 -> Array((0, 28f), (2, 18f), (1, 16f))
     )
-    checkRecommendations(topUsers, expected, "user")
+
+    Seq(2, 3, 4).foreach { k =>
+      val n = math.min(k, numUsers).toInt
+      val expectedUpToN = expected.mapValues(_.slice(0, n))
+      val topUsers = getALSModel.recommendForAllItems(k)
+      assert(topUsers.count() == numItems)
+      assert(topUsers.columns.contains("item"))
+      checkRecommendations(topUsers, expectedUpToN, "user")
+    }
   }
 
   private def checkRecommendations(
diff --git a/pom.xml b/pom.xml
index a1a1817e2f7d3..6835ea14cd42b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -83,6 +83,7 @@
 
   <modules>
     <module>common/sketch</module>
+    <module>common/kvstore</module>
     <module>common/network-common</module>
     <module>common/network-shuffle</module>
     <module>common/unsafe</module>
@@ -441,6 +442,11 @@
         <artifactId>httpcore</artifactId>
         <version>${commons.httpcore.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.fusesource.leveldbjni</groupId>
+        <artifactId>leveldbjni-all</artifactId>
+        <version>1.8</version>
+      </dependency>
       <dependency>
         <groupId>org.seleniumhq.selenium</groupId>
         <artifactId>selenium-java</artifactId>
@@ -588,6 +594,11 @@
         <artifactId>metrics-graphite</artifactId>
         <version>${codahale.metrics.version}</version>
       </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-core</artifactId>
+        <version>${fasterxml.jackson.version}</version>
+      </dependency>
       <dependency>
         <groupId>com.fasterxml.jackson.core</groupId>
         <artifactId>jackson-databind</artifactId>
@@ -2546,6 +2557,13 @@
       </modules>
     </profile>
 
+    <profile>
+      <id>hadoop-cloud</id>
+      <modules>
+        <module>hadoop-cloud</module>
+      </modules>
+    </profile>
+
     <profile>
       <id>scala-2.10</id>
       <activation>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index e52baf51aed1a..89b0c7a3ab7b0 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -50,16 +50,16 @@ object BuildCommons {
   ).map(ProjectRef(buildLocation, _))
 
   val allProjects@Seq(
-    core, graphx, mllib, mllibLocal, repl, networkCommon, networkShuffle, launcher, unsafe, tags, sketch, _*
+    core, graphx, mllib, mllibLocal, repl, networkCommon, networkShuffle, launcher, unsafe, tags, sketch, kvstore, _*
   ) = Seq(
     "core", "graphx", "mllib", "mllib-local", "repl", "network-common", "network-shuffle", "launcher", "unsafe",
-    "tags", "sketch"
+    "tags", "sketch", "kvstore"
   ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects
 
   val optionallyEnabledProjects@Seq(mesos, yarn, sparkGangliaLgpl,
-    streamingKinesisAsl, dockerIntegrationTests) =
+    streamingKinesisAsl, dockerIntegrationTests, hadoopCloud) =
     Seq("mesos", "yarn", "ganglia-lgpl", "streaming-kinesis-asl",
-      "docker-integration-tests").map(ProjectRef(buildLocation, _))
+      "docker-integration-tests", "hadoop-cloud").map(ProjectRef(buildLocation, _))
 
   val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKafka010Assembly, streamingKinesisAslAssembly) =
     Seq("network-yarn", "streaming-flume-assembly", "streaming-kafka-0-8-assembly", "streaming-kafka-0-10-assembly", "streaming-kinesis-asl-assembly")
@@ -310,7 +310,7 @@ object SparkBuild extends PomBuild {
   val mimaProjects = allProjects.filterNot { x =>
     Seq(
       spark, hive, hiveThriftServer, catalyst, repl, networkCommon, networkShuffle, networkYarn,
-      unsafe, tags, sqlKafka010
+      unsafe, tags, sqlKafka010, kvstore
     ).contains(x)
   }
 
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 41e68a45a6159..5f25dce161963 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -237,23 +237,28 @@ def registerFunction(self, name, f, returnType=StringType()):
         :param name: name of the UDF
         :param f: python function
         :param returnType: a :class:`pyspark.sql.types.DataType` object
+        :return: a wrapped :class:`UserDefinedFunction`
 
-        >>> spark.catalog.registerFunction("stringLengthString", lambda x: len(x))
+        >>> strlen = spark.catalog.registerFunction("stringLengthString", len)
         >>> spark.sql("SELECT stringLengthString('test')").collect()
         [Row(stringLengthString(test)=u'4')]
 
+        >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
+        [Row(stringLengthString(text)=u'3')]
+
         >>> from pyspark.sql.types import IntegerType
-        >>> spark.catalog.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
+        >>> _ = spark.catalog.registerFunction("stringLengthInt", len, IntegerType())
         >>> spark.sql("SELECT stringLengthInt('test')").collect()
         [Row(stringLengthInt(test)=4)]
 
         >>> from pyspark.sql.types import IntegerType
-        >>> spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
+        >>> _ = spark.udf.register("stringLengthInt", len, IntegerType())
         >>> spark.sql("SELECT stringLengthInt('test')").collect()
         [Row(stringLengthInt(test)=4)]
         """
         udf = UserDefinedFunction(f, returnType, name)
         self._jsparkSession.udf().registerPython(name, udf._judf)
+        return udf._wrapped()
 
     @since(2.0)
     def isCached(self, tableName):
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index fdb7abbad4e5f..5197a9e004610 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -185,22 +185,26 @@ def registerFunction(self, name, f, returnType=StringType()):
         :param name: name of the UDF
         :param f: python function
         :param returnType: a :class:`pyspark.sql.types.DataType` object
+        :return: a wrapped :class:`UserDefinedFunction`
 
-        >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
+        >>> strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x))
         >>> sqlContext.sql("SELECT stringLengthString('test')").collect()
         [Row(stringLengthString(test)=u'4')]
 
+        >>> sqlContext.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
+        [Row(stringLengthString(text)=u'3')]
+
         >>> from pyspark.sql.types import IntegerType
-        >>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
+        >>> _ = sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
         >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
         [Row(stringLengthInt(test)=4)]
 
         >>> from pyspark.sql.types import IntegerType
-        >>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
+        >>> _ = sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
         >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
         [Row(stringLengthInt(test)=4)]
         """
-        self.sparkSession.catalog.registerFunction(name, f, returnType)
+        return self.sparkSession.catalog.registerFunction(name, f, returnType)
 
     @ignore_unicode_prefix
     @since(2.1)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 843ae3816f061..8b3487c3f1083 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1917,6 +1917,19 @@ def __call__(self, *cols):
         sc = SparkContext._active_spark_context
         return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
 
+    def _wrapped(self):
+        """
+        Wrap this udf with a function and attach docstring from func
+        """
+        @functools.wraps(self.func)
+        def wrapper(*args):
+            return self(*args)
+
+        wrapper.func = self.func
+        wrapper.returnType = self.returnType
+
+        return wrapper
+
 
 @since(1.3)
 def udf(f=None, returnType=StringType()):
@@ -1951,15 +1964,7 @@ def udf(f=None, returnType=StringType()):
     """
     def _udf(f, returnType=StringType()):
         udf_obj = UserDefinedFunction(f, returnType)
-
-        @functools.wraps(f)
-        def wrapper(*args):
-            return udf_obj(*args)
-
-        wrapper.func = udf_obj.func
-        wrapper.returnType = udf_obj.returnType
-
-        return wrapper
+        return udf_obj._wrapped()
 
     # decorator @udf, @udf() or @udf(dataType())
     if f is None or isinstance(f, (str, DataType)):
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 960fb882cf901..90ce8f81eb7fd 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -563,6 +563,63 @@ def partitionBy(self, *cols):
         self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
         return self
 
+    @since(2.3)
+    def bucketBy(self, numBuckets, col, *cols):
+        """Buckets the output by the given columns.If specified,
+        the output is laid out on the file system similar to Hive's bucketing scheme.
+
+        :param numBuckets: the number of buckets to save
+        :param col: a name of a column, or a list of names.
+        :param cols: additional names (optional). If `col` is a list it should be empty.
+
+        .. note:: Applicable for file-based data sources in combination with
+                  :py:meth:`DataFrameWriter.saveAsTable`.
+
+        >>> (df.write.format('parquet')
+        ...     .bucketBy(100, 'year', 'month')
+        ...     .mode("overwrite")
+        ...     .saveAsTable('bucketed_table'))
+        """
+        if not isinstance(numBuckets, int):
+            raise TypeError("numBuckets should be an int, got {0}.".format(type(numBuckets)))
+
+        if isinstance(col, (list, tuple)):
+            if cols:
+                raise ValueError("col is a {0} but cols are not empty".format(type(col)))
+
+            col, cols = col[0], col[1:]
+
+        if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
+            raise TypeError("all names should be `str`")
+
+        self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
+        return self
+
+    @since(2.3)
+    def sortBy(self, col, *cols):
+        """Sorts the output in each bucket by the given columns on the file system.
+
+        :param col: a name of a column, or a list of names.
+        :param cols: additional names (optional). If `col` is a list it should be empty.
+
+        >>> (df.write.format('parquet')
+        ...     .bucketBy(100, 'year', 'month')
+        ...     .sortBy('day')
+        ...     .mode("overwrite")
+        ...     .saveAsTable('sorted_bucketed_table'))
+        """
+        if isinstance(col, (list, tuple)):
+            if cols:
+                raise ValueError("col is a {0} but cols are not empty".format(type(col)))
+
+            col, cols = col[0], col[1:]
+
+        if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
+            raise TypeError("all names should be `str`")
+
+        self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
+        return self
+
     @since(1.4)
     def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
         """Saves the contents of the :class:`DataFrame` to a data source.
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index f644624f7f317..e3fe01eae243f 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -211,6 +211,12 @@ def test_sqlcontext_reuses_sparksession(self):
         sqlContext2 = SQLContext(self.sc)
         self.assertTrue(sqlContext1.sparkSession is sqlContext2.sparkSession)
 
+    def tearDown(self):
+        super(SQLTests, self).tearDown()
+
+        # tear down test_bucketed_write state
+        self.spark.sql("DROP TABLE IF EXISTS pyspark_bucket")
+
     def test_row_should_be_read_only(self):
         row = Row(a=1, b=2)
         self.assertEqual(1, row.a)
@@ -436,6 +442,15 @@ def test_udf_with_order_by_and_limit(self):
         res.explain(True)
         self.assertEqual(res.collect(), [Row(id=0, copy=0)])
 
+    def test_udf_registration_returns_udf(self):
+        df = self.spark.range(10)
+        add_three = self.spark.udf.register("add_three", lambda x: x + 3, IntegerType())
+
+        self.assertListEqual(
+            df.selectExpr("add_three(id) AS plus_three").collect(),
+            df.select(add_three("id").alias("plus_three")).collect()
+        )
+
     def test_wholefile_json(self):
         people1 = self.spark.read.json("python/test_support/sql/people.json")
         people_array = self.spark.read.json("python/test_support/sql/people_array.json",
@@ -2187,6 +2202,54 @@ def test_BinaryType_serialization(self):
         df = self.spark.createDataFrame(data, schema=schema)
         df.collect()
 
+    def test_bucketed_write(self):
+        data = [
+            (1, "foo", 3.0), (2, "foo", 5.0),
+            (3, "bar", -1.0), (4, "bar", 6.0),
+        ]
+        df = self.spark.createDataFrame(data, ["x", "y", "z"])
+
+        def count_bucketed_cols(names, table="pyspark_bucket"):
+            """Given a sequence of column names and a table name
+            query the catalog and return number o columns which are
+            used for bucketing
+            """
+            cols = self.spark.catalog.listColumns(table)
+            num = len([c for c in cols if c.name in names and c.isBucket])
+            return num
+
+        # Test write with one bucketing column
+        df.write.bucketBy(3, "x").mode("overwrite").saveAsTable("pyspark_bucket")
+        self.assertEqual(count_bucketed_cols(["x"]), 1)
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
+        # Test write two bucketing columns
+        df.write.bucketBy(3, "x", "y").mode("overwrite").saveAsTable("pyspark_bucket")
+        self.assertEqual(count_bucketed_cols(["x", "y"]), 2)
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
+        # Test write with bucket and sort
+        df.write.bucketBy(2, "x").sortBy("z").mode("overwrite").saveAsTable("pyspark_bucket")
+        self.assertEqual(count_bucketed_cols(["x"]), 1)
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
+        # Test write with a list of columns
+        df.write.bucketBy(3, ["x", "y"]).mode("overwrite").saveAsTable("pyspark_bucket")
+        self.assertEqual(count_bucketed_cols(["x", "y"]), 2)
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
+        # Test write with bucket and sort with a list of columns
+        (df.write.bucketBy(2, "x")
+            .sortBy(["y", "z"])
+            .mode("overwrite").saveAsTable("pyspark_bucket"))
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
+        # Test write with bucket and sort with multiple columns
+        (df.write.bucketBy(2, "x")
+            .sortBy("y", "z")
+            .mode("overwrite").saveAsTable("pyspark_bucket"))
+        self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect()))
+
 
 class HiveSparkSubmitTests(SparkSubmitTests):
 
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
index ef08502ec8dd6..ddea762fdb919 100644
--- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
@@ -59,7 +59,7 @@ private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf:
   @tailrec
   private def parse(args: List[String]): Unit = args match {
     case ("--host" | "-h") :: value :: tail =>
-      Utils.checkHost(value, "Please use hostname " + value)
+      Utils.checkHost(value)
       host = value
       parse(tail)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
index 81dd8efc0015f..8a5319bebe54e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
@@ -216,8 +216,8 @@ class InMemoryCatalog(
       } else {
         tableDefinition
       }
-
-      catalog(db).tables.put(table, new TableDesc(tableWithLocation))
+      val tableProp = tableWithLocation.properties.filter(_._1 != "comment")
+      catalog(db).tables.put(table, new TableDesc(tableWithLocation.copy(properties = tableProp)))
     }
   }
 
@@ -298,7 +298,9 @@ class InMemoryCatalog(
     assert(tableDefinition.identifier.database.isDefined)
     val db = tableDefinition.identifier.database.get
     requireTableExists(db, tableDefinition.identifier.table)
-    catalog(db).tables(tableDefinition.identifier.table).table = tableDefinition
+    val updatedProperties = tableDefinition.properties.filter(kv => kv._1 != "comment")
+    val newTableDefinition = tableDefinition.copy(properties = updatedProperties)
+    catalog(db).tables(tableDefinition.identifier.table).table = newTableDefinition
   }
 
   override def alterTableSchema(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index cc0cbba275b81..c39017ebbfe60 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -132,10 +132,10 @@ case class CatalogTablePartition(
   /**
    * Given the partition schema, returns a row with that schema holding the partition values.
    */
-  def toRow(partitionSchema: StructType, defaultTimeZondId: String): InternalRow = {
+  def toRow(partitionSchema: StructType, defaultTimeZoneId: String): InternalRow = {
     val caseInsensitiveProperties = CaseInsensitiveMap(storage.properties)
     val timeZoneId = caseInsensitiveProperties.getOrElse(
-      DateTimeUtils.TIMEZONE_OPTION, defaultTimeZondId)
+      DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)
     InternalRow.fromSeq(partitionSchema.map { field =>
       val partValue = if (spec(field.name) == ExternalCatalogUtils.DEFAULT_PARTITION_NAME) {
         null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index bb8fd5032d63d..a98cd33f2780c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -488,7 +488,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
  * Deterministic version of [[UnixTimestamp]], must have at least one parameter.
  */
 @ExpressionDescription(
-  usage = "_FUNC_(expr[, pattern]) - Returns the UNIX timestamp of the give time.",
+  usage = "_FUNC_(expr[, pattern]) - Returns the UNIX timestamp of the given time.",
   extended = """
     Examples:
       > SELECT _FUNC_('2016-04-08', 'yyyy-MM-dd');
@@ -1225,8 +1225,8 @@ case class ParseToTimestamp(left: Expression, format: Expression, child: Express
   extends RuntimeReplaceable {
 
   def this(left: Expression, format: Expression) = {
-  this(left, format, Cast(UnixTimestamp(left, format), TimestampType))
-}
+    this(left, format, Cast(UnixTimestamp(left, format), TimestampType))
+  }
 
   override def flatArguments: Iterator[Any] = Iterator(left, format)
   override def sql: String = s"$prettyName(${left.sql}, ${format.sql})"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala
index 6fc828f63f152..85b368c862630 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala
@@ -122,7 +122,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging {
         logDebug(
           s"""
           |=== Result of Batch ${batch.name} ===
-          |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")}
+          |${sideBySide(batchStartPlan.treeString, curPlan.treeString).mkString("\n")}
         """.stripMargin)
       } else {
         logTrace(s"Batch ${batch.name} has no effect.")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index eb6aad5b2d2bb..bf596fa0a89db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -423,7 +423,7 @@ object DateTimeUtils {
   }
 
   /**
-   * Parses a given UTF8 date string to the corresponding a corresponding [[Int]] value.
+   * Parses a given UTF8 date string to a corresponding [[Int]] value.
    * The return type is [[Option]] in order to distinguish between 0 and null. The following
    * formats are allowed:
    *
@@ -498,6 +498,11 @@ object DateTimeUtils {
     false
   }
 
+  lazy val validTimezones = TimeZone.getAvailableIDs().toSet
+  def isValidTimezone(timezoneId: String): Boolean = {
+    validTimezones.contains(timezoneId)
+  }
+
   /**
    * Returns the microseconds since year zero (-17999) from microseconds since epoch.
    */
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index e170133f0f0bf..fe4be963e8184 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -115,7 +115,7 @@
     <dependency>
       <groupId>com.h2database</groupId>
       <artifactId>h2</artifactId>
-      <version>1.4.183</version>
+      <version>1.4.195</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
index 9d641b528723a..dabbc2b6387e4 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java
@@ -18,7 +18,9 @@
 package org.apache.spark.sql.execution.datasources.parquet;
 
 import java.io.IOException;
+import java.util.TimeZone;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Dictionary;
@@ -30,6 +32,7 @@
 
 import org.apache.spark.sql.catalyst.util.DateTimeUtils;
 import org.apache.spark.sql.execution.vectorized.ColumnVector;
+import org.apache.spark.sql.internal.SQLConf;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.DecimalType;
 
@@ -90,11 +93,30 @@ public class VectorizedColumnReader {
 
   private final PageReader pageReader;
   private final ColumnDescriptor descriptor;
+  private final TimeZone storageTz;
+  private final TimeZone sessionTz;
 
-  public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader)
+  public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader,
+                                Configuration conf)
       throws IOException {
     this.descriptor = descriptor;
     this.pageReader = pageReader;
+    // If the table has a timezone property, apply the correct conversions.  See SPARK-12297.
+    // The conf is sometimes null in tests.
+    String sessionTzString =
+        conf == null ? null : conf.get(SQLConf.SESSION_LOCAL_TIMEZONE().key());
+    if (sessionTzString == null || sessionTzString.isEmpty()) {
+      sessionTz = DateTimeUtils.defaultTimeZone();
+    } else {
+      sessionTz = TimeZone.getTimeZone(sessionTzString);
+    }
+    String storageTzString =
+        conf == null ? null : conf.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY());
+    if (storageTzString == null || storageTzString.isEmpty()) {
+      storageTz = sessionTz;
+    } else {
+      storageTz = TimeZone.getTimeZone(storageTzString);
+    }
     this.maxDefLevel = descriptor.getMaxDefinitionLevel();
 
     DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
@@ -289,7 +311,7 @@ private void decodeDictionaryIds(int rowId, int num, ColumnVector column,
             // TODO: Convert dictionary of Binaries to dictionary of Longs
             if (!column.isNullAt(i)) {
               Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i));
-              column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
+              column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v, sessionTz, storageTz));
             }
           }
         } else {
@@ -422,7 +444,7 @@ private void readBinaryBatch(int rowId, int num, ColumnVector column) throws IOE
         if (defColumn.readInteger() == maxDefLevel) {
           column.putLong(rowId + i,
               // Read 12 bytes for INT96
-              ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)));
+              ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12), sessionTz, storageTz));
         } else {
           column.putNull(rowId + i);
         }
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java
index 51bdf0f0f2291..d8974ddf24704 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java
@@ -21,6 +21,7 @@
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.parquet.column.ColumnDescriptor;
@@ -95,6 +96,8 @@ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBa
    */
   private boolean returnColumnarBatch;
 
+  private Configuration conf;
+
   /**
    * The default config on whether columnarBatch should be offheap.
    */
@@ -107,6 +110,7 @@ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBa
   public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
       throws IOException, InterruptedException, UnsupportedOperationException {
     super.initialize(inputSplit, taskAttemptContext);
+    this.conf = taskAttemptContext.getConfiguration();
     initializeInternal();
   }
 
@@ -277,7 +281,7 @@ private void checkEndOfRowGroup() throws IOException {
     for (int i = 0; i < columns.size(); ++i) {
       if (missingColumns[i]) continue;
       columnReaders[i] = new VectorizedColumnReader(columns.get(i),
-          pages.getPageReader(columns.get(i)));
+          pages.getPageReader(columns.get(i)), conf);
     }
     totalCountLoadedSoFar += pages.getRowCount();
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 55540563ef911..793fb9b795596 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -231,8 +231,12 @@ case class AlterTableSetPropertiesCommand(
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
     DDLUtils.verifyAlterTableType(catalog, table, isView)
-    // This overrides old properties
-    val newTable = table.copy(properties = table.properties ++ properties)
+    // This overrides old properties and update the comment parameter of CatalogTable
+    // with the newly added/modified comment since CatalogTable also holds comment as its
+    // direct property.
+    val newTable = table.copy(
+      properties = table.properties ++ properties,
+      comment = properties.get("comment"))
     catalog.alterTable(newTable)
     Seq.empty[Row]
   }
@@ -267,8 +271,10 @@ case class AlterTableUnsetPropertiesCommand(
         }
       }
     }
+    // If comment is in the table property, we reset it to None
+    val tableComment = if (propKeys.contains("comment")) None else table.properties.get("comment")
     val newProperties = table.properties.filter { case (k, _) => !propKeys.contains(k) }
-    val newTable = table.copy(properties = newProperties)
+    val newTable = table.copy(properties = newProperties, comment = tableComment)
     catalog.alterTable(newTable)
     Seq.empty[Row]
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index ebf03e1bf8869..5843c5b56d44c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -26,7 +26,6 @@ import scala.collection.mutable.ArrayBuffer
 import scala.util.control.NonFatal
 import scala.util.Try
 
-import org.apache.commons.lang3.StringEscapeUtils
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
@@ -37,7 +36,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTableType._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
-import org.apache.spark.sql.execution.datasources.{DataSource, FileFormat, PartitioningUtils}
+import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils}
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
@@ -74,6 +73,10 @@ case class CreateTableLikeCommand(
       sourceTableDesc.provider
     }
 
+    val properties = sourceTableDesc.properties.filter { case (k, _) =>
+      k == ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    }
+
     // If the location is specified, we create an external table internally.
     // Otherwise create a managed table.
     val tblType = if (location.isEmpty) CatalogTableType.MANAGED else CatalogTableType.EXTERNAL
@@ -86,6 +89,7 @@ case class CreateTableLikeCommand(
           locationUri = location.map(CatalogUtils.stringToURI(_))),
         schema = sourceTableDesc.schema,
         provider = newProvider,
+        properties = properties,
         partitionColumnNames = sourceTableDesc.partitionColumnNames,
         bucketSpec = sourceTableDesc.bucketSpec)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
index 0183805d56257..71eaab119d75d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.jdbc
 
-import java.sql.{Connection, Driver, DriverManager, PreparedStatement, ResultSet, ResultSetMetaData, SQLException}
+import java.sql.{Connection, Driver, DriverManager, JDBCType, PreparedStatement, ResultSet, ResultSetMetaData, SQLException}
 import java.util.Locale
 
 import scala.collection.JavaConverters._
@@ -217,20 +217,29 @@ object JdbcUtils extends Logging {
       case java.sql.Types.OTHER         => null
       case java.sql.Types.REAL          => DoubleType
       case java.sql.Types.REF           => StringType
+      case java.sql.Types.REF_CURSOR    => null
       case java.sql.Types.ROWID         => LongType
       case java.sql.Types.SMALLINT      => IntegerType
       case java.sql.Types.SQLXML        => StringType
       case java.sql.Types.STRUCT        => StringType
       case java.sql.Types.TIME          => TimestampType
+      case java.sql.Types.TIME_WITH_TIMEZONE
+                                        => TimestampType
       case java.sql.Types.TIMESTAMP     => TimestampType
+      case java.sql.Types.TIMESTAMP_WITH_TIMEZONE
+                                        => TimestampType
+      case -101                         => TimestampType // Value for Timestamp with Time Zone in Oracle
       case java.sql.Types.TINYINT       => IntegerType
       case java.sql.Types.VARBINARY     => BinaryType
       case java.sql.Types.VARCHAR       => StringType
-      case _                            => null
+      case _                            =>
+        throw new SQLException("Unrecognized SQL type " + sqlType)
       // scalastyle:on
     }
 
-    if (answer == null) throw new SQLException("Unsupported type " + sqlType)
+    if (answer == null) {
+      throw new SQLException("Unsupported type " + JDBCType.valueOf(sqlType).getName)
+    }
     answer
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
index 2f3a2c62b912c..8113768cd793f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -632,4 +632,6 @@ object ParquetFileFormat extends Logging {
         Failure(cause)
     }.toOption
   }
+
+  val PARQUET_TIMEZONE_TABLE_PROPERTY = "parquet.mr.int96.write.zone"
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
index f1a35dd8a6200..bf395a0bef745 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
@@ -95,7 +95,8 @@ private[parquet] class ParquetReadSupport extends ReadSupport[UnsafeRow] with Lo
     new ParquetRecordMaterializer(
       parquetRequestedSchema,
       ParquetReadSupport.expandUDT(catalystRequestedSchema),
-      new ParquetSchemaConverter(conf))
+      new ParquetSchemaConverter(conf),
+      conf)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala
index 4e49a0dac97c0..df041996cdea9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.parquet
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer}
 import org.apache.parquet.schema.MessageType
 
@@ -29,13 +30,17 @@ import org.apache.spark.sql.types.StructType
  * @param parquetSchema Parquet schema of the records to be read
  * @param catalystSchema Catalyst schema of the rows to be constructed
  * @param schemaConverter A Parquet-Catalyst schema converter that helps initializing row converters
+ * @param hadoopConf hadoop Configuration for passing extra params for parquet conversion
  */
 private[parquet] class ParquetRecordMaterializer(
-    parquetSchema: MessageType, catalystSchema: StructType, schemaConverter: ParquetSchemaConverter)
+    parquetSchema: MessageType,
+    catalystSchema: StructType,
+    schemaConverter: ParquetSchemaConverter,
+    hadoopConf: Configuration)
   extends RecordMaterializer[UnsafeRow] {
 
   private val rootConverter =
-    new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, NoopUpdater)
+    new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, hadoopConf, NoopUpdater)
 
   override def getCurrentRecord: UnsafeRow = rootConverter.currentRecord
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
index 32e6c60cd9766..d52ff62d93b26 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala
@@ -19,10 +19,12 @@ package org.apache.spark.sql.execution.datasources.parquet
 
 import java.math.{BigDecimal, BigInteger}
 import java.nio.ByteOrder
+import java.util.TimeZone
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.column.Dictionary
 import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
 import org.apache.parquet.schema.{GroupType, MessageType, OriginalType, Type}
@@ -34,6 +36,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLTimestamp
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -117,12 +120,14 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd
  * @param parquetType Parquet schema of Parquet records
  * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined
  *        types should have been expanded.
+ * @param hadoopConf a hadoop Configuration for passing any extra parameters for parquet conversion
  * @param updater An updater which propagates converted field values to the parent container
  */
 private[parquet] class ParquetRowConverter(
     schemaConverter: ParquetSchemaConverter,
     parquetType: GroupType,
     catalystType: StructType,
+    hadoopConf: Configuration,
     updater: ParentContainerUpdater)
   extends ParquetGroupConverter(updater) with Logging {
 
@@ -261,18 +266,18 @@ private[parquet] class ParquetRowConverter(
 
       case TimestampType =>
         // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that.
+        // If the table has a timezone property, apply the correct conversions.  See SPARK-12297.
+        val sessionTzString = hadoopConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key)
+        val sessionTz = Option(sessionTzString).map(TimeZone.getTimeZone(_))
+          .getOrElse(DateTimeUtils.defaultTimeZone())
+        val storageTzString = hadoopConf.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
+        val storageTz = Option(storageTzString).map(TimeZone.getTimeZone(_)).getOrElse(sessionTz)
         new ParquetPrimitiveConverter(updater) {
           // Converts nanosecond timestamps stored as INT96
           override def addBinary(value: Binary): Unit = {
-            assert(
-              value.length() == 12,
-              "Timestamps (with nanoseconds) are expected to be stored in 12-byte long binaries, " +
-              s"but got a ${value.length()}-byte binary.")
-
-            val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
-            val timeOfDayNanos = buf.getLong
-            val julianDay = buf.getInt
-            updater.setLong(DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos))
+            val timestamp = ParquetRowConverter.binaryToSQLTimestamp(value, sessionTz = sessionTz,
+              storageTz = storageTz)
+            updater.setLong(timestamp)
           }
         }
 
@@ -302,7 +307,7 @@ private[parquet] class ParquetRowConverter(
 
       case t: StructType =>
         new ParquetRowConverter(
-          schemaConverter, parquetType.asGroupType(), t, new ParentContainerUpdater {
+          schemaConverter, parquetType.asGroupType(), t, hadoopConf, new ParentContainerUpdater {
             override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy())
           })
 
@@ -651,6 +656,7 @@ private[parquet] class ParquetRowConverter(
 }
 
 private[parquet] object ParquetRowConverter {
+
   def binaryToUnscaledLong(binary: Binary): Long = {
     // The underlying `ByteBuffer` implementation is guaranteed to be `HeapByteBuffer`, so here
     // we are using `Binary.toByteBuffer.array()` to steal the underlying byte array without
@@ -673,12 +679,35 @@ private[parquet] object ParquetRowConverter {
     unscaled
   }
 
-  def binaryToSQLTimestamp(binary: Binary): SQLTimestamp = {
+  /**
+   * Converts an int96 to a SQLTimestamp, given both the storage timezone and the local timezone.
+   * The timestamp is really meant to be interpreted as a "floating time", but since we
+   * actually store it as micros since epoch, why we have to apply a conversion when timezones
+   * change.
+   *
+   * @param binary a parquet Binary which holds one int96
+   * @param sessionTz the session timezone.  This will be used to determine how to display the time,
+    *                  and compute functions on the timestamp which involve a timezone, eg. extract
+    *                  the hour.
+   * @param storageTz the timezone which was used to store the timestamp.  This should come from the
+    *                  timestamp table property, or else assume its the same as the sessionTz
+   * @return a timestamp (millis since epoch) which will render correctly in the sessionTz
+   */
+  def binaryToSQLTimestamp(
+      binary: Binary,
+      sessionTz: TimeZone,
+      storageTz: TimeZone): SQLTimestamp = {
     assert(binary.length() == 12, s"Timestamps (with nanoseconds) are expected to be stored in" +
       s" 12-byte long binaries. Found a ${binary.length()}-byte binary instead.")
     val buffer = binary.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
     val timeOfDayNanos = buffer.getLong
     val julianDay = buffer.getInt
-    DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos)
+    val utcEpochMicros = DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos)
+    // avoid expensive time logic if possible.
+    if (sessionTz.getID() != storageTz.getID()) {
+      DateTimeUtils.convertTz(utcEpochMicros, sessionTz, storageTz)
+    } else {
+      utcEpochMicros
+    }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
index 38b0e33937f3c..679ed8e361b74 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet
 
 import java.nio.{ByteBuffer, ByteOrder}
 import java.util
+import java.util.TimeZone
 
 import scala.collection.JavaConverters.mapAsJavaMapConverter
 
@@ -75,6 +76,9 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   // Reusable byte array used to write decimal values
   private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION))
 
+  private var storageTz: TimeZone = _
+  private var sessionTz: TimeZone = _
+
   override def init(configuration: Configuration): WriteContext = {
     val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA)
     this.schema = StructType.fromString(schemaString)
@@ -91,6 +95,19 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
 
 
     this.rootFieldWriters = schema.map(_.dataType).map(makeWriter)
+    // If the table has a timezone property, apply the correct conversions.  See SPARK-12297.
+    val sessionTzString = configuration.get(SQLConf.SESSION_LOCAL_TIMEZONE.key)
+    sessionTz = if (sessionTzString == null || sessionTzString == "") {
+      TimeZone.getDefault()
+    } else {
+      TimeZone.getTimeZone(sessionTzString)
+    }
+    val storageTzString = configuration.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
+    storageTz = if (storageTzString == null || storageTzString == "") {
+      sessionTz
+    } else {
+      TimeZone.getTimeZone(storageTzString)
+    }
 
     val messageType = new ParquetSchemaConverter(configuration).convert(schema)
     val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava
@@ -178,7 +195,13 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
 
           // NOTE: Starting from Spark 1.5, Spark SQL `TimestampType` only has microsecond
           // precision.  Nanosecond parts of timestamp values read from INT96 are simply stripped.
-          val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal))
+          val rawMicros = row.getLong(ordinal)
+          val adjustedMicros = if (sessionTz.getID() == storageTz.getID()) {
+            rawMicros
+          } else {
+            DateTimeUtils.convertTz(rawMicros, storageTz, sessionTz)
+          }
+          val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(adjustedMicros)
           val buf = ByteBuffer.wrap(timestampBuffer)
           buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay)
           recordConsumer.addBinary(Binary.fromReusedByteArray(timestampBuffer))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index f07e04368389f..987011edfe1e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2491,10 +2491,10 @@ object functions {
    * Converts a date/timestamp/string to a value of string in the format specified by the date
    * format given by the second argument.
    *
-   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
-   * pattern letters of `java.text.SimpleDateFormat` can be used.
+   * A pattern `dd.MM.yyyy` would return a string like `18.03.1993`.
+   * All pattern letters of `java.text.SimpleDateFormat` can be used.
    *
-   * @note Use when ever possible specialized functions like [[year]]. These benefit from a
+   * @note Use specialized functions like [[year]] whenever possible as they benefit from a
    * specialized implementation.
    *
    * @group datetime_funcs
@@ -2647,7 +2647,11 @@ object functions {
   }
 
   /**
-   * Gets current Unix timestamp in seconds.
+   * Returns the current Unix timestamp (in seconds).
+   *
+   * @note All calls of `unix_timestamp` within the same query return the same value
+   * (i.e. the current timestamp is calculated at the start of query evaluation).
+   *
    * @group datetime_funcs
    * @since 1.5.0
    */
@@ -2657,7 +2661,9 @@ object functions {
 
   /**
    * Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds),
-   * using the default timezone and the default locale, return null if fail.
+   * using the default timezone and the default locale.
+   * Returns `null` if fails.
+   *
    * @group datetime_funcs
    * @since 1.5.0
    */
@@ -2666,13 +2672,15 @@ object functions {
   }
 
   /**
-   * Convert time string with given pattern
-   * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html])
-   * to Unix time stamp (in seconds), return null if fail.
+   * Converts time string with given pattern to Unix timestamp (in seconds).
+   * Returns `null` if fails.
+   *
+   * @see <a href="http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html">
+   * Customizing Formats</a>
    * @group datetime_funcs
    * @since 1.5.0
    */
-  def unix_timestamp(s: Column, p: String): Column = withExpr {UnixTimestamp(s.expr, Literal(p)) }
+  def unix_timestamp(s: Column, p: String): Column = withExpr { UnixTimestamp(s.expr, Literal(p)) }
 
   /**
    * Convert time string to a Unix timestamp (in seconds).
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index e1049c665a417..142b005850a49 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -33,7 +33,6 @@ import org.apache.spark.sql.types.StructType
 import org.apache.spark.storage.StorageLevel
 
 
-
 /**
  * Internal implementation of the user-facing `Catalog`.
  */
diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe-table-after-alter-table.sql b/sql/core/src/test/resources/sql-tests/inputs/describe-table-after-alter-table.sql
new file mode 100644
index 0000000000000..69bff6656c43a
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-after-alter-table.sql
@@ -0,0 +1,29 @@
+CREATE TABLE table_with_comment (a STRING, b INT, c STRING, d STRING) USING parquet COMMENT 'added';
+
+DESC FORMATTED table_with_comment;
+
+-- ALTER TABLE BY MODIFYING COMMENT
+ALTER TABLE table_with_comment SET TBLPROPERTIES("comment"= "modified comment", "type"= "parquet");
+
+DESC FORMATTED table_with_comment;
+
+-- DROP TEST TABLE
+DROP TABLE table_with_comment;
+
+-- CREATE TABLE WITHOUT COMMENT
+CREATE TABLE table_comment (a STRING, b INT) USING parquet;
+
+DESC FORMATTED table_comment;
+
+-- ALTER TABLE BY ADDING COMMENT
+ALTER TABLE table_comment SET TBLPROPERTIES(comment = "added comment");
+
+DESC formatted table_comment;
+
+-- ALTER UNSET PROPERTIES COMMENT
+ALTER TABLE table_comment UNSET TBLPROPERTIES IF EXISTS ('comment');
+
+DESC FORMATTED table_comment;
+
+-- DROP TEST TABLE
+DROP TABLE table_comment;
diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out
new file mode 100644
index 0000000000000..1cc11c475bc40
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out
@@ -0,0 +1,161 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 12
+
+
+-- !query 0
+CREATE TABLE table_with_comment (a STRING, b INT, c STRING, d STRING) USING parquet COMMENT 'added'
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+DESC FORMATTED table_with_comment
+-- !query 1 schema
+struct<col_name:string,data_type:string,comment:string>
+-- !query 1 output
+# col_name          	data_type           	comment             
+a                   	string              	                    
+b                   	int                 	                    
+c                   	string              	                    
+d                   	string              	                    
+                    	                    	                    
+# Detailed Table Information	                    	                    
+Database            	default             	                    
+Table               	table_with_comment  	                    
+Created [not included in comparison]
+Last Access [not included in comparison]
+Type                	MANAGED             	                    
+Provider            	parquet             	                    
+Comment             	added               	                    
+Location [not included in comparison]sql/core/spark-warehouse/table_with_comment
+
+
+-- !query 2
+ALTER TABLE table_with_comment SET TBLPROPERTIES("comment"= "modified comment", "type"= "parquet")
+-- !query 2 schema
+struct<>
+-- !query 2 output
+
+
+
+-- !query 3
+DESC FORMATTED table_with_comment
+-- !query 3 schema
+struct<col_name:string,data_type:string,comment:string>
+-- !query 3 output
+# col_name          	data_type           	comment             
+a                   	string              	                    
+b                   	int                 	                    
+c                   	string              	                    
+d                   	string              	                    
+                    	                    	                    
+# Detailed Table Information	                    	                    
+Database            	default             	                    
+Table               	table_with_comment  	                    
+Created [not included in comparison]
+Last Access [not included in comparison]
+Type                	MANAGED             	                    
+Provider            	parquet             	                    
+Comment             	modified comment    	                    
+Properties          	[type=parquet]      	                    
+Location [not included in comparison]sql/core/spark-warehouse/table_with_comment
+
+
+-- !query 4
+DROP TABLE table_with_comment
+-- !query 4 schema
+struct<>
+-- !query 4 output
+
+
+
+-- !query 5
+CREATE TABLE table_comment (a STRING, b INT) USING parquet
+-- !query 5 schema
+struct<>
+-- !query 5 output
+
+
+
+-- !query 6
+DESC FORMATTED table_comment
+-- !query 6 schema
+struct<col_name:string,data_type:string,comment:string>
+-- !query 6 output
+# col_name          	data_type           	comment             
+a                   	string              	                    
+b                   	int                 	                    
+                    	                    	                    
+# Detailed Table Information	                    	                    
+Database            	default             	                    
+Table               	table_comment       	                    
+Created [not included in comparison]
+Last Access [not included in comparison]
+Type                	MANAGED             	                    
+Provider            	parquet             	                    
+Location [not included in comparison]sql/core/spark-warehouse/table_comment
+
+
+-- !query 7
+ALTER TABLE table_comment SET TBLPROPERTIES(comment = "added comment")
+-- !query 7 schema
+struct<>
+-- !query 7 output
+
+
+
+-- !query 8
+DESC formatted table_comment
+-- !query 8 schema
+struct<col_name:string,data_type:string,comment:string>
+-- !query 8 output
+# col_name          	data_type           	comment             
+a                   	string              	                    
+b                   	int                 	                    
+                    	                    	                    
+# Detailed Table Information	                    	                    
+Database            	default             	                    
+Table               	table_comment       	                    
+Created [not included in comparison]
+Last Access [not included in comparison]
+Type                	MANAGED             	                    
+Provider            	parquet             	                    
+Comment             	added comment       	                    
+Location [not included in comparison]sql/core/spark-warehouse/table_comment
+
+
+-- !query 9
+ALTER TABLE table_comment UNSET TBLPROPERTIES IF EXISTS ('comment')
+-- !query 9 schema
+struct<>
+-- !query 9 output
+
+
+
+-- !query 10
+DESC FORMATTED table_comment
+-- !query 10 schema
+struct<col_name:string,data_type:string,comment:string>
+-- !query 10 output
+# col_name          	data_type           	comment             
+a                   	string              	                    
+b                   	int                 	                    
+                    	                    	                    
+# Detailed Table Information	                    	                    
+Database            	default             	                    
+Table               	table_comment       	                    
+Created [not included in comparison]
+Last Access [not included in comparison]
+Type                	MANAGED             	                    
+Provider            	parquet             	                    
+Location [not included in comparison]sql/core/spark-warehouse/table_comment
+
+
+-- !query 11
+DROP TABLE table_comment
+-- !query 11 schema
+struct<>
+-- !query 11 output
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 5bd36ec25ccb0..d9f3689411ab7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -18,13 +18,13 @@
 package org.apache.spark.sql.jdbc
 
 import java.math.BigDecimal
-import java.sql.{Date, DriverManager, Timestamp}
+import java.sql.{Date, DriverManager, SQLException, Timestamp}
 import java.util.{Calendar, GregorianCalendar, Properties}
 
 import org.h2.jdbc.JdbcSQLException
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.execution.DataSourceScanExec
@@ -141,6 +141,15 @@ class JDBCSuite extends SparkFunSuite
         |OPTIONS (url '$url', dbtable 'TEST.TIMETYPES', user 'testUser', password 'testPass')
        """.stripMargin.replaceAll("\n", " "))
 
+    conn.prepareStatement("CREATE TABLE test.timezone (tz TIMESTAMP WITH TIME ZONE) " +
+      "AS SELECT '1999-01-08 04:05:06.543543543 GMT-08:00'")
+      .executeUpdate()
+    conn.commit()
+
+    conn.prepareStatement("CREATE TABLE test.array (ar ARRAY) " +
+      "AS SELECT '(1, 2, 3)'")
+      .executeUpdate()
+    conn.commit()
 
     conn.prepareStatement("create table test.flttypes (a DOUBLE, b REAL, c DECIMAL(38, 18))"
         ).executeUpdate()
@@ -919,6 +928,17 @@ class JDBCSuite extends SparkFunSuite
     assert(res === (foobarCnt, 0L, foobarCnt) :: Nil)
   }
 
+  test("unsupported types") {
+    var e = intercept[SparkException] {
+      spark.read.jdbc(urlWithUserAndPass, "TEST.TIMEZONE", new Properties()).collect()
+    }.getMessage
+    assert(e.contains("java.lang.UnsupportedOperationException: unimplemented"))
+    e = intercept[SQLException] {
+      spark.read.jdbc(urlWithUserAndPass, "TEST.ARRAY", new Properties()).collect()
+    }.getMessage
+    assert(e.contains("Unsupported type ARRAY"))
+  }
+
   test("SPARK-19318: Connection properties keys should be case-sensitive.") {
     def testJdbcOptions(options: JDBCOptions): Unit = {
       // Spark JDBC data source options are case-insensitive
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index ba48facff2933..8fef467f5f5cb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -39,9 +39,10 @@ import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
-import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.PartitioningUtils
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
@@ -224,6 +225,14 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       throw new TableAlreadyExistsException(db = db, table = table)
     }
 
+    val tableTz = tableDefinition.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
+    tableTz.foreach { tz =>
+      if (!DateTimeUtils.isValidTimezone(tz)) {
+        throw new AnalysisException(s"Cannot set" +
+          s" ${ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY} to invalid timezone $tz")
+      }
+    }
+
     if (tableDefinition.tableType == VIEW) {
       client.createTable(tableDefinition, ignoreIfExists)
     } else {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 6b98066cb76c8..e0b565c0d79a0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode._
 import org.apache.spark.sql.types._
 
@@ -174,7 +175,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             // We don't support hive bucketed tables, only ones we write out.
             bucketSpec = None,
             fileFormat = fileFormat,
-            options = options)(sparkSession = sparkSession)
+            options = options ++ getStorageTzOptions(relation))(sparkSession = sparkSession)
           val created = LogicalRelation(fsRelation, updatedTable)
           tableRelationCache.put(tableIdentifier, created)
           created
@@ -201,7 +202,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
                 userSpecifiedSchema = Option(dataSchema),
                 // We don't support hive bucketed tables, only ones we write out.
                 bucketSpec = None,
-                options = options,
+                options = options ++ getStorageTzOptions(relation),
                 className = fileType).resolveRelation(),
               table = updatedTable)
 
@@ -222,6 +223,13 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }
 
+  private def getStorageTzOptions(relation: CatalogRelation): Map[String, String] = {
+    // We add the table timezone to the relation options, which automatically gets injected into the
+    // hadoopConf for the Parquet Converters
+    val storageTzKey = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    relation.tableMeta.properties.get(storageTzKey).map(storageTzKey -> _).toMap
+  }
+
   private def inferIfNeeded(
       relation: CatalogRelation,
       options: Map[String, String],
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
index 05b6059472f59..2bfd63d9b56e6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
@@ -17,12 +17,22 @@
 
 package org.apache.spark.sql.hive
 
+import java.io.File
+import java.net.URLDecoder
 import java.sql.Timestamp
+import java.util.TimeZone
 
-import org.apache.spark.sql.Row
-import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.parquet.hadoop.ParquetFileReader
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
+
+import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession}
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompatibilityTest, ParquetFileFormat}
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{StringType, StructType, TimestampType}
 
 class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHiveSingleton {
   /**
@@ -141,4 +151,369 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
       Row(Seq(Row(1))),
       "ARRAY<STRUCT<array_element: INT>>")
   }
+
+  val testTimezones = Seq(
+    "UTC" -> "UTC",
+    "LA" -> "America/Los_Angeles",
+    "Berlin" -> "Europe/Berlin"
+  )
+  // Check creating parquet tables with timestamps, writing data into them, and reading it back out
+  // under a variety of conditions:
+  // * tables with explicit tz and those without
+  // * altering table properties directly
+  // * variety of timezones, local & non-local
+  val sessionTimezones = testTimezones.map(_._2).map(Some(_)) ++ Seq(None)
+  sessionTimezones.foreach { sessionTzOpt =>
+    val sparkSession = spark.newSession()
+    sessionTzOpt.foreach { tz => sparkSession.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, tz) }
+    testCreateWriteRead(sparkSession, "no_tz", None, sessionTzOpt)
+    val localTz = TimeZone.getDefault.getID()
+    testCreateWriteRead(sparkSession, "local", Some(localTz), sessionTzOpt)
+    // check with a variety of timezones.  The unit tests currently are configured to always use
+    // America/Los_Angeles, but even if they didn't, we'd be sure to cover a non-local timezone.
+    testTimezones.foreach { case (tableName, zone) =>
+      if (zone != localTz) {
+        testCreateWriteRead(sparkSession, tableName, Some(zone), sessionTzOpt)
+      }
+    }
+  }
+
+  private def testCreateWriteRead(
+      sparkSession: SparkSession,
+      baseTable: String,
+      explicitTz: Option[String],
+      sessionTzOpt: Option[String]): Unit = {
+    testCreateAlterTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
+    testWriteTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
+    testReadTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
+  }
+
+  private def checkHasTz(spark: SparkSession, table: String, tz: Option[String]): Unit = {
+    val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier(table))
+    assert(tableMetadata.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY) === tz)
+  }
+
+  private def testCreateAlterTablesWithTimezone(
+      spark: SparkSession,
+      baseTable: String,
+      explicitTz: Option[String],
+      sessionTzOpt: Option[String]): Unit = {
+    test(s"SPARK-12297: Create and Alter Parquet tables and timezones; explicitTz = $explicitTz; " +
+      s"sessionTzOpt = $sessionTzOpt") {
+      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+      withTable(baseTable, s"like_$baseTable", s"select_$baseTable", s"partitioned_$baseTable") {
+        // If we ever add a property to set the table timezone by default, defaultTz would change
+        val defaultTz = None
+        // check that created tables have correct TBLPROPERTIES
+        val tblProperties = explicitTz.map {
+          tz => s"""TBLPROPERTIES ($key="$tz")"""
+        }.getOrElse("")
+        spark.sql(
+          s"""CREATE TABLE $baseTable (
+                |  x int
+                | )
+                | STORED AS PARQUET
+                | $tblProperties
+            """.stripMargin)
+        val expectedTableTz = explicitTz.orElse(defaultTz)
+        checkHasTz(spark, baseTable, expectedTableTz)
+        spark.sql(
+          s"""CREATE TABLE partitioned_$baseTable (
+                |  x int
+                | )
+                | PARTITIONED BY (y int)
+                | STORED AS PARQUET
+                | $tblProperties
+            """.stripMargin)
+        checkHasTz(spark, s"partitioned_$baseTable", expectedTableTz)
+        spark.sql(s"CREATE TABLE like_$baseTable LIKE $baseTable")
+        checkHasTz(spark, s"like_$baseTable", expectedTableTz)
+        spark.sql(
+          s"""CREATE TABLE select_$baseTable
+                | STORED AS PARQUET
+                | AS
+                | SELECT * from $baseTable
+            """.stripMargin)
+        checkHasTz(spark, s"select_$baseTable", defaultTz)
+
+        // check alter table, setting, unsetting, resetting the property
+        spark.sql(
+          s"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="America/Los_Angeles")""")
+        checkHasTz(spark, baseTable, Some("America/Los_Angeles"))
+        spark.sql(s"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="UTC")""")
+        checkHasTz(spark, baseTable, Some("UTC"))
+        spark.sql(s"""ALTER TABLE $baseTable UNSET TBLPROPERTIES ($key)""")
+        checkHasTz(spark, baseTable, None)
+        explicitTz.foreach { tz =>
+          spark.sql(s"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="$tz")""")
+          checkHasTz(spark, baseTable, expectedTableTz)
+        }
+      }
+    }
+  }
+
+  val desiredTimestampStrings = Seq(
+    "2015-12-31 22:49:59.123",
+    "2015-12-31 23:50:59.123",
+    "2016-01-01 00:39:59.123",
+    "2016-01-01 01:29:59.123"
+  )
+  // We don't want to mess with timezones inside the tests themselves, since we use a shared
+  // spark context, and then we might be prone to issues from lazy vals for timezones.  Instead,
+  // we manually adjust the timezone just to determine what the desired millis (since epoch, in utc)
+  // is for various "wall-clock" times in different timezones, and then we can compare against those
+  // in our tests.
+  val timestampTimezoneToMillis = {
+    val originalTz = TimeZone.getDefault
+    try {
+      desiredTimestampStrings.flatMap { timestampString =>
+        Seq("America/Los_Angeles", "Europe/Berlin", "UTC").map { tzId =>
+          TimeZone.setDefault(TimeZone.getTimeZone(tzId))
+          val timestamp = Timestamp.valueOf(timestampString)
+          (timestampString, tzId) -> timestamp.getTime()
+        }
+      }.toMap
+    } finally {
+      TimeZone.setDefault(originalTz)
+    }
+  }
+
+  private def createRawData(spark: SparkSession): Dataset[(String, Timestamp)] = {
+    import spark.implicits._
+    val df = desiredTimestampStrings.toDF("display")
+    // this will get the millis corresponding to the display time given the current *session*
+    // timezone.
+    df.withColumn("ts", expr("cast(display as timestamp)")).as[(String, Timestamp)]
+  }
+
+  private def testWriteTablesWithTimezone(
+      spark: SparkSession,
+      baseTable: String,
+      explicitTz: Option[String],
+      sessionTzOpt: Option[String]) : Unit = {
+    val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    test(s"SPARK-12297: Write to Parquet tables with Timestamps; explicitTz = $explicitTz; " +
+        s"sessionTzOpt = $sessionTzOpt") {
+
+      withTable(s"saveAsTable_$baseTable", s"insert_$baseTable", s"partitioned_ts_$baseTable") {
+        val sessionTzId = sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
+        // check that created tables have correct TBLPROPERTIES
+        val tblProperties = explicitTz.map {
+          tz => s"""TBLPROPERTIES ($key="$tz")"""
+        }.getOrElse("")
+
+        val rawData = createRawData(spark)
+        // Check writing data out.
+        // We write data into our tables, and then check the raw parquet files to see whether
+        // the correct conversion was applied.
+        rawData.write.saveAsTable(s"saveAsTable_$baseTable")
+        checkHasTz(spark, s"saveAsTable_$baseTable", None)
+        spark.sql(
+          s"""CREATE TABLE insert_$baseTable (
+                |  display string,
+                |  ts timestamp
+                | )
+                | STORED AS PARQUET
+                | $tblProperties
+               """.stripMargin)
+        checkHasTz(spark, s"insert_$baseTable", explicitTz)
+        rawData.write.insertInto(s"insert_$baseTable")
+        // no matter what, roundtripping via the table should leave the data unchanged
+        val readFromTable = spark.table(s"insert_$baseTable").collect()
+          .map { row => (row.getAs[String](0), row.getAs[Timestamp](1)).toString() }.sorted
+        assert(readFromTable === rawData.collect().map(_.toString()).sorted)
+
+        // Now we load the raw parquet data on disk, and check if it was adjusted correctly.
+        // Note that we only store the timezone in the table property, so when we read the
+        // data this way, we're bypassing all of the conversion logic, and reading the raw
+        // values in the parquet file.
+        val onDiskLocation = spark.sessionState.catalog
+          .getTableMetadata(TableIdentifier(s"insert_$baseTable")).location.getPath
+        // we test reading the data back with and without the vectorized reader, to make sure we
+        // haven't broken reading parquet from non-hive tables, with both readers.
+        Seq(false, true).foreach { vectorized =>
+          spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized)
+          val readFromDisk = spark.read.parquet(onDiskLocation).collect()
+          val storageTzId = explicitTz.getOrElse(sessionTzId)
+          readFromDisk.foreach { row =>
+            val displayTime = row.getAs[String](0)
+            val millis = row.getAs[Timestamp](1).getTime()
+            val expectedMillis = timestampTimezoneToMillis((displayTime, storageTzId))
+            assert(expectedMillis === millis, s"Display time '$displayTime' was stored " +
+              s"incorrectly with sessionTz = ${sessionTzOpt}; Got $millis, expected " +
+              s"$expectedMillis (delta = ${millis - expectedMillis})")
+          }
+        }
+
+        // check tables partitioned by timestamps.  We don't compare the "raw" data in this case,
+        // since they are adjusted even when we bypass the hive table.
+        rawData.write.partitionBy("ts").saveAsTable(s"partitioned_ts_$baseTable")
+        val partitionDiskLocation = spark.sessionState.catalog
+          .getTableMetadata(TableIdentifier(s"partitioned_ts_$baseTable")).location.getPath
+        // no matter what mix of timezones we use, the dirs should specify the value with the
+        // same time we use for display.
+        val parts = new File(partitionDiskLocation).list().collect {
+          case name if name.startsWith("ts=") => URLDecoder.decode(name.stripPrefix("ts="))
+        }.toSet
+        assert(parts === desiredTimestampStrings.toSet)
+      }
+    }
+  }
+
+  private def testReadTablesWithTimezone(
+      spark: SparkSession,
+      baseTable: String,
+      explicitTz: Option[String],
+      sessionTzOpt: Option[String]): Unit = {
+    val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    test(s"SPARK-12297: Read from Parquet tables with Timestamps; explicitTz = $explicitTz; " +
+      s"sessionTzOpt = $sessionTzOpt") {
+      withTable(s"external_$baseTable", s"partitioned_$baseTable") {
+        // we intentionally save this data directly, without creating a table, so we can
+        // see that the data is read back differently depending on table properties.
+        // we'll save with adjusted millis, so that it should be the correct millis after reading
+        // back.
+        val rawData = createRawData(spark)
+        // to avoid closing over entire class
+        val timestampTimezoneToMillis = this.timestampTimezoneToMillis
+        import spark.implicits._
+        val adjustedRawData = (explicitTz match {
+          case Some(tzId) =>
+            rawData.map { case (displayTime, _) =>
+              val storageMillis = timestampTimezoneToMillis((displayTime, tzId))
+              (displayTime, new Timestamp(storageMillis))
+            }
+          case _ =>
+            rawData
+        }).withColumnRenamed("_1", "display").withColumnRenamed("_2", "ts")
+        withTempPath { basePath =>
+          val unpartitionedPath = new File(basePath, "flat")
+          val partitionedPath = new File(basePath, "partitioned")
+          adjustedRawData.write.parquet(unpartitionedPath.getCanonicalPath)
+          val options = Map("path" -> unpartitionedPath.getCanonicalPath) ++
+            explicitTz.map { tz => Map(key -> tz) }.getOrElse(Map())
+
+          spark.catalog.createTable(
+            tableName = s"external_$baseTable",
+            source = "parquet",
+            schema = new StructType().add("display", StringType).add("ts", TimestampType),
+            options = options
+          )
+
+          // also write out a partitioned table, to make sure we can access that correctly.
+          // add a column we can partition by (value doesn't particularly matter).
+          val partitionedData = adjustedRawData.withColumn("id", monotonicallyIncreasingId)
+          partitionedData.write.partitionBy("id")
+            .parquet(partitionedPath.getCanonicalPath)
+          // unfortunately, catalog.createTable() doesn't let us specify partitioning, so just use
+          // a "CREATE TABLE" stmt.
+          val tblOpts = explicitTz.map { tz => s"""TBLPROPERTIES ($key="$tz")""" }.getOrElse("")
+          spark.sql(s"""CREATE EXTERNAL TABLE partitioned_$baseTable (
+                         |  display string,
+                         |  ts timestamp
+                         |)
+                         |PARTITIONED BY (id bigint)
+                         |STORED AS parquet
+                         |LOCATION 'file:${partitionedPath.getCanonicalPath}'
+                         |$tblOpts
+                          """.stripMargin)
+          spark.sql(s"msck repair table partitioned_$baseTable")
+
+          for {
+            vectorized <- Seq(false, true)
+            partitioned <- Seq(false, true)
+          } {
+            withClue(s"vectorized = $vectorized; partitioned = $partitioned") {
+              spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized)
+              val sessionTz = sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
+              val table = if (partitioned) s"partitioned_$baseTable" else s"external_$baseTable"
+              val query = s"select display, cast(ts as string) as ts_as_string, ts " +
+                s"from $table"
+              val collectedFromExternal = spark.sql(query).collect()
+              assert( collectedFromExternal.size === 4)
+              collectedFromExternal.foreach { row =>
+                val displayTime = row.getAs[String](0)
+                // the timestamp should still display the same, despite the changes in timezones
+                assert(displayTime === row.getAs[String](1).toString())
+                // we'll also check that the millis behind the timestamp has the appropriate
+                // adjustments.
+                val millis = row.getAs[Timestamp](2).getTime()
+                val expectedMillis = timestampTimezoneToMillis((displayTime, sessionTz))
+                val delta = millis - expectedMillis
+                val deltaHours = delta / (1000L * 60 * 60)
+                assert(millis === expectedMillis, s"Display time '$displayTime' did not have " +
+                  s"correct millis: was $millis, expected $expectedMillis; delta = $delta " +
+                  s"($deltaHours hours)")
+              }
+
+              // Now test that the behavior is still correct even with a filter which could get
+              // pushed down into parquet.  We don't need extra handling for pushed down
+              // predicates because (a) in ParquetFilters, we ignore TimestampType and (b) parquet
+              // does not read statistics from int96 fields, as they are unsigned.  See
+              // scalastyle:off line.size.limit
+              // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419
+              // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348
+              // scalastyle:on line.size.limit
+              //
+              // Just to be defensive in case anything ever changes in parquet, this test checks
+              // the assumption on column stats, and also the end-to-end behavior.
+
+              val hadoopConf = sparkContext.hadoopConfiguration
+              val fs = FileSystem.get(hadoopConf)
+              val parts = if (partitioned) {
+                val subdirs = fs.listStatus(new Path(partitionedPath.getCanonicalPath))
+                  .filter(_.getPath().getName().startsWith("id="))
+                fs.listStatus(subdirs.head.getPath())
+                  .filter(_.getPath().getName().endsWith(".parquet"))
+              } else {
+                fs.listStatus(new Path(unpartitionedPath.getCanonicalPath))
+                  .filter(_.getPath().getName().endsWith(".parquet"))
+              }
+              // grab the meta data from the parquet file.  The next section of asserts just make
+              // sure the test is configured correctly.
+              assert(parts.size == 1)
+              val oneFooter = ParquetFileReader.readFooter(hadoopConf, parts.head.getPath)
+              assert(oneFooter.getFileMetaData.getSchema.getColumns.size === 2)
+              assert(oneFooter.getFileMetaData.getSchema.getColumns.get(1).getType() ===
+                PrimitiveTypeName.INT96)
+              val oneBlockMeta = oneFooter.getBlocks().get(0)
+              val oneBlockColumnMeta = oneBlockMeta.getColumns().get(1)
+              val columnStats = oneBlockColumnMeta.getStatistics
+              // This is the important assert.  Column stats are written, but they are ignored
+              // when the data is read back as mentioned above, b/c int96 is unsigned.  This
+              // assert makes sure this holds even if we change parquet versions (if eg. there
+              // were ever statistics even on unsigned columns).
+              assert(columnStats.isEmpty)
+
+              // These queries should return the entire dataset, but if the predicates were
+              // applied to the raw values in parquet, they would incorrectly filter data out.
+              Seq(
+                ">" -> "2015-12-31 22:00:00",
+                "<" -> "2016-01-01 02:00:00"
+              ).foreach { case (comparison, value) =>
+                val query =
+                  s"select ts from $table where ts $comparison '$value'"
+                val countWithFilter = spark.sql(query).count()
+                assert(countWithFilter === 4, query)
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  test("SPARK-12297: exception on bad timezone") {
+    val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    val badTzException = intercept[AnalysisException] {
+      spark.sql(
+        s"""CREATE TABLE bad_tz_table (
+              |  x int
+              | )
+              | STORED AS PARQUET
+              | TBLPROPERTIES ($key="Blart Versenwald III")
+            """.stripMargin)
+    }
+    assert(badTzException.getMessage.contains("Blart Versenwald III"))
+  }
 }