SKIPME merged Apache branch-1.6 #126

Merged: 31 commits, Dec 9, 2015
Commits
c54b698
[SPARK-12106][STREAMING][FLAKY-TEST] BatchedWAL test transiently flak…
brkyvz Dec 7, 2015
22781c2
Merge branch 'branch-1.6' of github.com:apache/spark into csd-1.6
markhamstra Dec 7, 2015
3f230f7
[SPARK-12034][SPARKR] Eliminate warnings in SparkR test cases.
Dec 7, 2015
fed4538
[SPARK-12132] [PYSPARK] raise KeyboardInterrupt inside SIGINT handler
Dec 7, 2015
539914f
[SPARK-11932][STREAMING] Partition previous TrackStateRDD if partitio…
tdas Dec 7, 2015
c8aa5f2
[SPARK-11963][DOC] Add docs for QuantileDiscretizer
yinxusen Dec 7, 2015
cdeb89b
[SPARK-12184][PYTHON] Make python api doc for pivot consistant with s…
aray Dec 7, 2015
115bfbd
[SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib
jkbradley Dec 8, 2015
3c683ed
[SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md us…
somideshmukh Dec 8, 2015
8652fc0
[SPARK-10259][ML] Add @since annotation to ml.classification
Dec 8, 2015
5c82169
[SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and exa…
yanboliang Dec 8, 2015
c9e5274
[SPARK-12103][STREAMING][KAFKA][DOC] document that K means Key and V …
koeninger Dec 8, 2015
870f435
[SPARK-12166][TEST] Unset hadoop related environment in testing
zjffdu Dec 8, 2015
8a791a3
[SPARK-11551][DOC][EXAMPLE] Revert PR #10002
liancheng Dec 8, 2015
c8f9eb7
[SPARK-11652][CORE] Remote code execution with InvokerTransformer
srowen Dec 8, 2015
e3938c3
Merge branch 'branch-1.6' of github.com:apache/spark into csd-1.6
markhamstra Dec 8, 2015
8ef33aa
[SPARK-12201][SQL] add type coercion rule for greatest/least
cloud-fan Dec 8, 2015
9eeb0f2
[SPARK-12195][SQL] Adding BigDecimal, Date and Timestamp into Encoder
gatorsmile Dec 8, 2015
be0fe9b
[SPARK-12188][SQL] Code refactoring and comment correction in Dataset…
gatorsmile Dec 8, 2015
1c8451b
[SPARK-10393] use ML pipeline in LDA example
hhbyyh Dec 8, 2015
9145bfb
[SPARK-12205][SQL] Pivot fails Analysis when aggregate is UnresolvedF…
aray Dec 8, 2015
7e45feb
[SPARK-11605][MLLIB] ML 1.6 QA: API: Java compatibility, docs
hhbyyh Dec 8, 2015
3e31e7e
[SPARK-12159][ML] Add user guide section for IndexToString transformer
BenFradet Dec 8, 2015
25249d1
[SPARK-12187] *MemoryPool classes should not be fully public
Dec 8, 2015
2a5e4d1
[SPARK-12069][SQL] Update documentation with Datasets
marmbrus Dec 8, 2015
b1d5a78
[SPARK-8517][ML][DOC] Reorganizes the spark.ml user guide
thunterdb Dec 9, 2015
9e82273
[SPARK-11343][ML] Documentation of float and double prediction/label …
dahlem Dec 9, 2015
0be792a
[SPARK-12222] [CORE] Deserialize RoaringBitmap using Kryo serializer …
scwf Dec 9, 2015
b5a76b4
[SPARK-12031][CORE][BUG] Integer overflow when do sampling
uncleGen Dec 9, 2015
acd4624
[SPARK-10299][ML] word2vec should allow users to specify the window size
holdenk Dec 9, 2015
67998a2
Merge branch 'branch-1.6' of github.com:apache/spark into csd-1.6
markhamstra Dec 9, 2015
Diff view
File renamed without changes.
File renamed without changes.
File renamed without changes.

@@ -20,7 +20,7 @@ runScript <- function() {
sparkHome <- Sys.getenv("SPARK_HOME")
sparkTestJarPath <- "R/lib/SparkR/test_support/sparktestjar_2.10-1.0.jar"
jarPath <- paste("--jars", shQuote(file.path(sparkHome, sparkTestJarPath)))
scriptPath <- file.path(sparkHome, "R/lib/SparkR/tests/jarTest.R")
scriptPath <- file.path(sparkHome, "R/lib/SparkR/tests/testthat/jarTest.R")
submitPath <- file.path(sparkHome, "bin/spark-submit")
res <- system2(command = submitPath,
args = c(jarPath, scriptPath),

@@ -26,7 +26,7 @@ sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)

test_that("glm and predict", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
test <- select(training, "Sepal_Length")
model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
prediction <- predict(model, test)
@@ -39,7 +39,7 @@ test_that("glm and predict", {
})

test_that("glm should work with long formula", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
training$LongLongLongLongLongName <- training$Sepal_Width
training$VeryLongLongLongLonLongName <- training$Sepal_Length
training$AnotherLongLongLongLongName <- training$Species
@@ -51,31 +51,31 @@ test_that("glm should work with long formula", {
})

test_that("predictions match with native glm", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("dot minus and intercept vs native glm", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ . - Species + 0, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("feature interaction vs native glm", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("summary coefficients match with native glm", {
training <- createDataFrame(sqlContext, iris)
training <- suppressWarnings(createDataFrame(sqlContext, iris))
stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "normal"))
coefs <- unlist(stats$coefficients)
devianceResiduals <- unlist(stats$devianceResiduals)
@@ -92,7 +92,7 @@ test_that("summary coefficients match with native glm", {
})

test_that("summary coefficients match with native glm of family 'binomial'", {
df <- createDataFrame(sqlContext, iris)
df <- suppressWarnings(createDataFrame(sqlContext, iris))
training <- filter(df, df$Species != "setosa")
stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
family = "binomial"))

File renamed without changes.

@@ -133,38 +133,45 @@ test_that("create DataFrame from RDD", {
expect_equal(columns(df), c("a", "b"))
expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))

df <- jsonFile(sqlContext, jsonPathNa)
hiveCtx <- tryCatch({
newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
},
error = function(err) {
skip("Hive is not build with SparkSQL, skipped")
})
sql(hiveCtx, "CREATE TABLE people (name string, age double, height float)")
insertInto(df, "people")
expect_equal(sql(hiveCtx, "SELECT age from people WHERE name = 'Bob'"), c(16))
expect_equal(sql(hiveCtx, "SELECT height from people WHERE name ='Bob'"), c(176.5))

schema <- structType(structField("name", "string"), structField("age", "integer"),
structField("height", "float"))
df2 <- createDataFrame(sqlContext, df.toRDD, schema)
df2AsDF <- as.DataFrame(sqlContext, df.toRDD, schema)
df <- read.df(sqlContext, jsonPathNa, "json", schema)
df2 <- createDataFrame(sqlContext, toRDD(df), schema)
df2AsDF <- as.DataFrame(sqlContext, toRDD(df), schema)
expect_equal(columns(df2), c("name", "age", "height"))
expect_equal(columns(df2AsDF), c("name", "age", "height"))
expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float")))
expect_equal(collect(where(df2, df2$name == "Bob")), c("Bob", 16, 176.5))
expect_equal(collect(where(df2AsDF, df2$name == "Bob")), c("Bob", 16, 176.5))
expect_equal(as.list(collect(where(df2, df2$name == "Bob"))),
list(name = "Bob", age = 16, height = 176.5))
expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))),
list(name = "Bob", age = 16, height = 176.5))

localDF <- data.frame(name=c("John", "Smith", "Sarah"),
age=c(19, 23, 18),
height=c(164.10, 181.4, 173.7))
age=c(19L, 23L, 18L),
height=c(176.5, 181.4, 173.7))
df <- createDataFrame(sqlContext, localDF, schema)
expect_is(df, "DataFrame")
expect_equal(count(df), 3)
expect_equal(columns(df), c("name", "age", "height"))
expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
expect_equal(collect(where(df, df$name == "John")), c("John", 19, 164.10))
expect_equal(as.list(collect(where(df, df$name == "John"))),
list(name = "John", age = 19L, height = 176.5))

ssc <- callJMethod(sc, "sc")
hiveCtx <- tryCatch({
newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
},
error = function(err) {
skip("Hive is not build with SparkSQL, skipped")
})
sql(hiveCtx, "CREATE TABLE people (name string, age double, height float)")
df <- read.df(hiveCtx, jsonPathNa, "json", schema)
invisible(insertInto(df, "people"))
expect_equal(collect(sql(hiveCtx, "SELECT age from people WHERE name = 'Bob'"))$age,
c(16))
expect_equal(collect(sql(hiveCtx, "SELECT height from people WHERE name ='Bob'"))$height,
c(176.5))
})

test_that("convert NAs to null type in DataFrames", {
@@ -250,7 +257,7 @@ test_that("create DataFrame from list or data.frame", {
ldf2 <- collect(df)
expect_equal(ldf$a, ldf2$a)

irisdf <- createDataFrame(sqlContext, iris)
irisdf <- suppressWarnings(createDataFrame(sqlContext, iris))
iris_collected <- collect(irisdf)
expect_equivalent(iris_collected[,-5], iris[,-5])
expect_equal(iris_collected$Species, as.character(iris$Species))
@@ -463,7 +470,7 @@ test_that("union on two RDDs created from DataFrames returns an RRDD", {
RDD2 <- toRDD(df)
unioned <- unionRDD(RDD1, RDD2)
expect_is(unioned, "RDD")
expect_equal(SparkR:::getSerializedMode(unioned), "byte")
expect_equal(getSerializedMode(unioned), "byte")
expect_equal(collect(unioned)[[2]]$name, "Andy")
})

@@ -485,13 +492,13 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", {

unionByte <- unionRDD(rdd, dfRDD)
expect_is(unionByte, "RDD")
expect_equal(SparkR:::getSerializedMode(unionByte), "byte")
expect_equal(getSerializedMode(unionByte), "byte")
expect_equal(collect(unionByte)[[1]], 1)
expect_equal(collect(unionByte)[[12]]$name, "Andy")

unionString <- unionRDD(textRDD, dfRDD)
expect_is(unionString, "RDD")
expect_equal(SparkR:::getSerializedMode(unionString), "byte")
expect_equal(getSerializedMode(unionString), "byte")
expect_equal(collect(unionString)[[1]], "Michael")
expect_equal(collect(unionString)[[5]]$name, "Andy")
})
@@ -504,7 +511,7 @@ test_that("objectFile() works with row serialization", {
objectIn <- objectFile(sc, objectPath)

expect_is(objectIn, "RDD")
expect_equal(SparkR:::getSerializedMode(objectIn), "byte")
expect_equal(getSerializedMode(objectIn), "byte")
expect_equal(collect(objectIn)[[2]]$age, 30)
})

@@ -849,6 +856,7 @@ test_that("write.df() as parquet file", {
})

test_that("test HiveContext", {
ssc <- callJMethod(sc, "sc")
hiveCtx <- tryCatch({
newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
},
@@ -863,10 +871,10 @@ test_that("test HiveContext", {
expect_equal(count(df2), 3)

jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
saveAsTable(df, "json", "json", "append", path = jsonPath2)
df3 <- sql(hiveCtx, "select * from json")
invisible(saveAsTable(df, "json2", "json", "append", path = jsonPath2))
df3 <- sql(hiveCtx, "select * from json2")
expect_is(df3, "DataFrame")
expect_equal(count(df3), 6)
expect_equal(count(df3), 3)
})

test_that("column operators", {
@@ -1311,7 +1319,7 @@ test_that("toJSON() returns an RDD of the correct values", {
df <- jsonFile(sqlContext, jsonPath)
testRDD <- toJSON(df)
expect_is(testRDD, "RDD")
expect_equal(SparkR:::getSerializedMode(testRDD), "string")
expect_equal(getSerializedMode(testRDD), "string")
expect_equal(collect(testRDD)[[1]], mockLines[1])
})

@@ -1641,7 +1649,7 @@ test_that("SQL error message is returned from JVM", {
expect_equal(grepl("Table not found: blah", retError), TRUE)
})

irisDF <- createDataFrame(sqlContext, iris)
irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))

test_that("Method as.data.frame as a synonym for collect()", {
expect_equal(as.data.frame(irisDF), collect(irisDF))
@@ -1670,7 +1678,7 @@ test_that("attach() on a DataFrame", {
})

test_that("with() on a DataFrame", {
df <- createDataFrame(sqlContext, iris)
df <- suppressWarnings(createDataFrame(sqlContext, iris))
expect_error(Sepal_Length)
sum1 <- with(df, list(summary(Sepal_Length), summary(Sepal_Width)))
expect_equal(collect(sum1[[1]])[1, "Sepal_Length"], "150")
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions R/pkg/tests/run-all.R
@@ -18,4 +18,7 @@
library(testthat)
library(SparkR)

# Turn all warnings into errors
options("warn" = 2)

test_package("SparkR")
2 changes: 1 addition & 1 deletion R/run-tests.sh
@@ -23,7 +23,7 @@ FAILED=0
LOGFILE=$FWDIR/unit-tests.out
rm -f $LOGFILE

SPARK_TESTING=1 $FWDIR/../bin/sparkR --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
SPARK_TESTING=1 $FWDIR/../bin/sparkR --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
FAILED=$((PIPESTATUS[0]||$FAILED))

if [[ $FAILED != 0 ]]; then
6 changes: 6 additions & 0 deletions bin/spark-class
@@ -71,6 +71,12 @@ fi

export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR"

# For tests
if [[ -n "$SPARK_TESTING" ]]; then
unset YARN_CONF_DIR
unset HADOOP_CONF_DIR
fi

# The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -253,7 +253,7 @@ private[spark] object RangePartitioner {
*/
def sketch[K : ClassTag](
rdd: RDD[K],
sampleSizePerPartition: Int): (Long, Array[(Int, Int, Array[K])]) = {
sampleSizePerPartition: Int): (Long, Array[(Int, Long, Array[K])]) = {
val shift = rdd.id
// val classTagK = classTag[K] // to avoid serializing the entire partitioner object
val sketched = rdd.mapPartitionsWithIndex { (idx, iter) =>
@@ -262,7 +262,7 @@
iter, sampleSizePerPartition, seed)
Iterator((idx, n, sample))
}.collect()
val numItems = sketched.map(_._2.toLong).sum
val numItems = sketched.map(_._2).sum
(numItems, sketched)
}
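
The sketch() change above — and the matching SamplingUtils change further down — widens the per-partition element count from Int to Long, so a single partition holding more than Int.MaxValue rows no longer overflows the counter. As a rough standalone illustration (not Spark code; the object name is invented), the failure mode is plain Int wrap-around:

```scala
// Toy demonstration of the overflow that the Int -> Long count change guards against.
object IntCounterOverflow {
  def main(args: Array[String]): Unit = {
    var intCount: Int = Int.MaxValue
    intCount += 1                          // silently wraps to Int.MinValue (negative)

    var longCount: Long = Int.MaxValue.toLong
    longCount += 1                         // 2147483648, as intended
    println(s"Int counter: $intCount, Long counter: $longCount")
  }
}
```

Once the count goes negative, any index derived from it (for example rand.nextInt(i)) throws or mis-samples, which is what the Long-based arithmetic avoids.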


@@ -39,7 +39,7 @@ import org.apache.spark.Logging
* @param lock a [[MemoryManager]] instance to synchronize on
* @param poolName a human-readable name for this pool, for use in log messages
*/
class ExecutionMemoryPool(
private[memory] class ExecutionMemoryPool(
lock: Object,
poolName: String
) extends MemoryPool(lock) with Logging {

@@ -27,7 +27,7 @@ import javax.annotation.concurrent.GuardedBy
* to `Object` to avoid programming errors, since this object should only be used for
* synchronization purposes.
*/
abstract class MemoryPool(lock: Object) {
private[memory] abstract class MemoryPool(lock: Object) {

@GuardedBy("lock")
private[this] var _poolSize: Long = 0

@@ -31,7 +31,7 @@ import org.apache.spark.storage.{MemoryStore, BlockStatus, BlockId}
*
* @param lock a [[MemoryManager]] instance to synchronize on
*/
class StorageMemoryPool(lock: Object) extends MemoryPool(lock) with Logging {
private[memory] class StorageMemoryPool(lock: Object) extends MemoryPool(lock) with Logging {

@GuardedBy("lock")
private[this] var _memoryUsed: Long = 0L

@@ -49,7 +49,7 @@ import org.apache.spark.storage.{BlockStatus, BlockId}
private[spark] class UnifiedMemoryManager private[memory] (
conf: SparkConf,
val maxMemory: Long,
private val storageRegionSize: Long,
storageRegionSize: Long,
numCores: Int)
extends MemoryManager(
conf,
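
The memory hunks above make ExecutionMemoryPool, MemoryPool and StorageMemoryPool private[memory] so they drop out of Spark's public API. A minimal sketch of the Scala mechanism being used — a package-qualified access modifier — with invented package and class names:

```scala
// Illustrative only; these packages and classes are not Spark's.
package org.example.memory {
  // Visible only inside org.example.memory, mirroring private[memory] above.
  private[memory] class Pool(val size: Long)

  class Manager {
    private[memory] val pool = new Pool(1024L)   // fine: same package
    def poolSize: Long = pool.size               // public accessor for outside callers
  }
}

package org.example.app {
  object Main {
    def main(args: Array[String]): Unit = {
      val m = new org.example.memory.Manager()
      // new org.example.memory.Pool(1L)          // would not compile: Pool is private[memory]
      println(s"pool size seen from outside: ${m.poolSize}")
    }
  }
}
```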

@@ -398,7 +398,15 @@ private[serializer] class KryoInputDataInputBridge(input: KryoInput) extends Dat
override def readUTF(): String = input.readString() // readString in kryo does utf8
override def readInt(): Int = input.readInt()
override def readUnsignedShort(): Int = input.readShortUnsigned()
override def skipBytes(n: Int): Int = input.skip(n.toLong).toInt
override def skipBytes(n: Int): Int = {
var remaining: Long = n
while (remaining > 0) {
val skip = Math.min(Integer.MAX_VALUE, remaining).asInstanceOf[Int]
input.skip(skip)
remaining -= skip
}
n
}
override def readFully(b: Array[Byte]): Unit = input.read(b)
override def readFully(b: Array[Byte], off: Int, len: Int): Unit = input.read(b, off, len)
override def readLine(): String = throw new UnsupportedOperationException("readLine")
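
The skipBytes override above avoids asking the underlying input to skip more than Integer.MAX_VALUE bytes in one call by looping over Int-sized chunks. A hedged, non-Spark sketch of the same pattern against a plain java.io.InputStream (the helper name is invented):

```scala
import java.io.{ByteArrayInputStream, InputStream}

// Skip a potentially large byte count in Int-sized chunks rather than one huge request.
object ChunkedSkip {
  def skipFully(in: InputStream, count: Long): Unit = {
    var remaining = count
    while (remaining > 0) {
      val skipped = in.skip(math.min(Int.MaxValue.toLong, remaining))
      if (skipped <= 0) return             // stream could not skip any further
      remaining -= skipped
    }
  }

  def main(args: Array[String]): Unit = {
    val in = new ByteArrayInputStream(Array.fill[Byte](16)(1))
    skipFully(in, 10L)
    println(s"bytes consumed after skip: ${16 - in.available()}")   // prints 10
  }
}
```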

@@ -34,7 +34,7 @@ private[spark] object SamplingUtils {
input: Iterator[T],
k: Int,
seed: Long = Random.nextLong())
: (Array[T], Int) = {
: (Array[T], Long) = {
val reservoir = new Array[T](k)
// Put the first k elements in the reservoir.
var i = 0
@@ -52,16 +52,17 @@
(trimReservoir, i)
} else {
// If input size > k, continue the sampling process.
var l = i.toLong
val rand = new XORShiftRandom(seed)
while (input.hasNext) {
val item = input.next()
val replacementIndex = rand.nextInt(i)
val replacementIndex = (rand.nextDouble() * l).toLong
if (replacementIndex < k) {
reservoir(replacementIndex) = item
reservoir(replacementIndex.toInt) = item
}
i += 1
l += 1
}
(reservoir, i)
(reservoir, l)
}
}
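
The reservoirSampleAndCount changes above keep the running element count in a Long and draw the replacement index from rand.nextDouble() instead of nextInt, so inputs longer than Int.MaxValue are sampled correctly. A self-contained sketch of the same idea under stated assumptions (plain scala.util.Random in place of Spark's XORShiftRandom, invented object name):

```scala
import scala.util.Random

// Reservoir sampling with a Long element counter, so the replacement index
// stays valid even past Int.MaxValue items.
object ReservoirSketch {
  def sample[T: scala.reflect.ClassTag](input: Iterator[T], k: Int, seed: Long = 42L): (Array[T], Long) = {
    val reservoir = new Array[T](k)
    val rand = new Random(seed)
    var seen = 0L                                        // Long, not Int
    while (input.hasNext) {
      val item = input.next()
      if (seen < k) {
        reservoir(seen.toInt) = item                     // fill phase
      } else {
        val j = (rand.nextDouble() * (seen + 1)).toLong  // uniform in [0, seen]
        if (j < k) reservoir(j.toInt) = item             // keep with probability k / (seen + 1)
      }
      seen += 1
    }
    (if (seen < k) reservoir.take(seen.toInt) else reservoir, seen)
  }

  def main(args: Array[String]): Unit = {
    val (s, n) = sample(Iterator.range(0, 1000), 5)
    println(s"sampled ${s.mkString(", ")} out of $n items")
  }
}
```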
