peter-toth · peter-toth · Oct 25, 2018 · Oct 24, 2018 · Oct 24, 2018 · Oct 25, 2018
diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt
@@ -2,44 +2,44 @@
 Hive UDAF vs Spark AF
 ================================================================================================
 
-Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
-Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 hive udaf vs spark af:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-hive udaf w/o group by                        4895 / 5020          0.0       74685.5       1.0X
-spark af w/o group by                           38 /   46          1.7         580.3     128.7X
-hive udaf w/ group by                         3309 / 3331          0.0       50491.1       1.5X
-spark af w/ group by w/o fallback               41 /   46          1.6         626.4     119.2X
-spark af w/ group by w/ fallback               111 /  115          0.6        1690.9      44.2X
+hive udaf w/o group by                        6370 / 6400          0.0       97193.6       1.0X
+spark af w/o group by                           54 /   63          1.2         820.8     118.4X
+hive udaf w/ group by                         4492 / 4507          0.0       68539.5       1.4X
+spark af w/ group by w/o fallback               58 /   64          1.1         881.7     110.2X
+spark af w/ group by w/ fallback               136 /  142          0.5        2075.0      46.8X
 
 
 ================================================================================================
 ObjectHashAggregateExec vs SortAggregateExec - typed_count
 ================================================================================================
 
-Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
-Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 object agg v.s. sort agg:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-sort agg w/ group by                        33760 / 34049          3.1         322.0       1.0X
-object agg w/ group by w/o fallback           7625 / 7877         13.8          72.7       4.4X
-object agg w/ group by w/ fallback          22695 / 22749          4.6         216.4       1.5X
-sort agg w/o group by                         4861 / 6473         21.6          46.4       6.9X
-object agg w/o group by w/o fallback          4143 / 4272         25.3          39.5       8.1X
+sort agg w/ group by                        41500 / 41630          2.5         395.8       1.0X
+object agg w/ group by w/o fallback         10075 / 10122         10.4          96.1       4.1X
+object agg w/ group by w/ fallback          28131 / 28205          3.7         268.3       1.5X
+sort agg w/o group by                         6182 / 6221         17.0          59.0       6.7X
+object agg w/o group by w/o fallback          5435 / 5468         19.3          51.8       7.6X
 
 
 ================================================================================================
 ObjectHashAggregateExec vs SortAggregateExec - percentile_approx
 ================================================================================================
 
-Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
-Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 object agg v.s. sort agg:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-sort agg w/ group by                           745 /  786          2.8         355.2       1.0X
-object agg w/ group by w/o fallback            596 /  626          3.5         284.3       1.2X
-object agg w/ group by w/ fallback             769 /  810          2.7         366.6       1.0X
-sort agg w/o group by                          564 /  589          3.7         268.7       1.3X
-object agg w/o group by w/o fallback           573 /  598          3.7         273.3       1.3X
+sort agg w/ group by                           970 / 1025          2.2         462.5       1.0X
+object agg w/ group by w/o fallback            772 /  798          2.7         368.1       1.3X
+object agg w/ group by w/ fallback            1013 / 1044          2.1         483.1       1.0X
+sort agg w/o group by                          751 /  781          2.8         358.0       1.3X
+object agg w/o group by w/o fallback           772 /  814          2.7         368.0       1.3X
 
 
diff --git a/...est/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala b/...est/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala
@@ -46,11 +46,11 @@ import org.apache.spark.sql.types.LongType
  */
 object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
 
-  val spark: SparkSession = TestHive.sparkSession
-
-  private def hiveUDAFvsSparkAF(): Unit = {
-    val N = 2 << 15
+  private val spark: SparkSession = TestHive.sparkSession
+  private val sql = spark.sql _
+  import spark.implicits._
 
+  private def hiveUDAFvsSparkAF(N: Int): Unit = {
     val benchmark = new Benchmark(
       name = "hive udaf vs spark af",
       valuesPerIteration = N,
@@ -61,7 +61,7 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
       output = output
     )
 
-    spark.sql(
+    sql(
       s"CREATE TEMPORARY FUNCTION hive_percentile_approx AS '" +
         s"${classOf[GenericUDAFPercentileApprox].getName}'"
     )
@@ -70,56 +70,44 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
 
     benchmark.addCase("hive udaf w/o group by") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
-        spark.sql("SELECT hive_percentile_approx(id, 0.5) FROM t").collect()
+        sql("SELECT hive_percentile_approx(id, 0.5) FROM t").collect()
       }
     }
 
     benchmark.addCase("spark af w/o group by") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") {
-        spark.sql("SELECT percentile_approx(id, 0.5) FROM t").collect()
+        sql("SELECT percentile_approx(id, 0.5) FROM t").collect()
       }
     }
 
     benchmark.addCase("hive udaf w/ group by") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
-        spark
-          .sql(
-            s"SELECT hive_percentile_approx(id, 0.5) " +
-              s"FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)"
-          )
-          .collect()
+        sql(
+          s"SELECT hive_percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)"
+        ).collect()
       }
     }
 
     benchmark.addCase("spark af w/ group by w/o fallback") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") {
-        spark
-          .sql(
-            s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)"
-          )
+        sql(s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)")
           .collect()
       }
     }
 
     benchmark.addCase("spark af w/ group by w/ fallback") { _ =>
       withSQLConf(
         SQLConf.USE_OBJECT_HASH_AGG.key -> "true",
-        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2"
-      ) {
-        spark
-          .sql(
-            s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)"
-          )
+        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2") {
+        sql(s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)")
           .collect()
       }
     }
 
     benchmark.run()
   }
 
-  private def objectHashAggregateExecVsSortAggregateExecUsingTypedCount(): Unit = {
-    val N: Long = 1024 * 1024 * 100
-
+  private def objectHashAggregateExecVsSortAggregateExecUsingTypedCount(N: Int): Unit = {
     val benchmark = new Benchmark(
       name = "object agg v.s. sort agg",
       valuesPerIteration = N,
@@ -130,8 +118,6 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
       output = output
     )
 
-    import spark.implicits._
-
     def typed_count(column: Column): Column =
       Column(TestingTypedCount(column.expr).toAggregateExpression())
 
@@ -152,8 +138,7 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
     benchmark.addCase("object agg w/ group by w/ fallback") { _ =>
       withSQLConf(
         SQLConf.USE_OBJECT_HASH_AGG.key -> "true",
-        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2"
-      ) {
+        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2") {
         df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).collect()
       }
     }
@@ -173,9 +158,7 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
     benchmark.run()
   }
 
-  private def objectHashAggregateExecVsSortAggregateExecUsingPercentileApprox(): Unit = {
-    val N = 2 << 20
-
+  private def objectHashAggregateExecVsSortAggregateExecUsingPercentileApprox(N: Int): Unit = {
     val benchmark = new Benchmark(
       name = "object agg v.s. sort agg",
       valuesPerIteration = N,
@@ -186,34 +169,25 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
       output = output
     )
 
-    import spark.implicits._
-
     val df = spark.range(N).coalesce(1)
 
     benchmark.addCase("sort agg w/ group by") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
-        df.groupBy($"id" / (N / 4) cast LongType)
-          .agg(percentile_approx($"id", 0.5))
-          .collect()
+        df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect()
       }
     }
 
     benchmark.addCase("object agg w/ group by w/o fallback") { _ =>
       withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") {
-        df.groupBy($"id" / (N / 4) cast LongType)
-          .agg(percentile_approx($"id", 0.5))
-          .collect()
+        df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect()
       }
     }
 
     benchmark.addCase("object agg w/ group by w/ fallback") { _ =>
       withSQLConf(
         SQLConf.USE_OBJECT_HASH_AGG.key -> "true",
-        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2"
-      ) {
-        df.groupBy($"id" / (N / 4) cast LongType)
-          .agg(percentile_approx($"id", 0.5))
-          .collect()
+        SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2") {
+        df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect()
       }
     }
 
@@ -240,15 +214,15 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper {
 
   override def runBenchmarkSuite(): Unit = {
     runBenchmark("Hive UDAF vs Spark AF") {
-      hiveUDAFvsSparkAF()
+      hiveUDAFvsSparkAF(2 << 15)
     }
 
     runBenchmark("ObjectHashAggregateExec vs SortAggregateExec - typed_count") {
-      objectHashAggregateExecVsSortAggregateExecUsingTypedCount
+      objectHashAggregateExecVsSortAggregateExecUsingTypedCount(1024 * 1024 * 100)
     }
 
     runBenchmark("ObjectHashAggregateExec vs SortAggregateExec - percentile_approx") {
-      objectHashAggregateExecVsSortAggregateExecUsingPercentileApprox
+      objectHashAggregateExecVsSortAggregateExecUsingPercentileApprox(2 << 20)
     }
   }
 }