Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support batching for RANGE running window aggregations. Including on [databricks] #9544

Merged
merged 23 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4ccf617
Batching support for ROW-based FIRST() window function
mythrocks Oct 18, 2023
7d5adc9
Support for ignoreNulls.
mythrocks Oct 18, 2023
28743c5
Fixed backing up previous from current batch.
mythrocks Oct 18, 2023
b39a4f3
Reworded GpuNthValue Literal check with match.
mythrocks Oct 18, 2023
6987164
Merge remote-tracking branch 'origin/branch-23.12' into batched-windo…
mythrocks Oct 19, 2023
5cc1985
Fix package path.
mythrocks Oct 19, 2023
5c6e85d
Code-style fix: Fixed line length.
mythrocks Oct 19, 2023
9373b7a
Moved First-specific logic out of GpuWindowExec.
mythrocks Oct 23, 2023
af49389
Added test for nth_value.
mythrocks Oct 24, 2023
42ec892
Explicit canFix() check.
mythrocks Oct 24, 2023
11ec8db
Merge remote-tracking branch 'origin/branch-23.12' into batched-windo…
mythrocks Oct 25, 2023
0c27e15
First swipe.
mythrocks Oct 24, 2023
a14bb8e
Removed unnecessary TODO.
mythrocks Oct 25, 2023
b98560d
Merge remote-tracking branch 'origin/branch-23.12' into batched-range…
mythrocks Oct 27, 2023
d2915d6
Fixed rebase error.
mythrocks Oct 27, 2023
917fa07
Range [UNB, CURRENT] tests. Round#1.
mythrocks Oct 27, 2023
b8bec1b
Fixed verify errors.
mythrocks Oct 27, 2023
3ba6480
Merge remote-tracking branch 'origin/branch-23.12' into batched-range…
mythrocks Oct 31, 2023
22aaf7c
More comprehensive RANGE window tests.
mythrocks Oct 31, 2023
f6f6c62
Revert "More comprehensive RANGE window tests."
mythrocks Oct 31, 2023
2d8eb65
Separate tests for ROWS and RANGE, with repeated rows in order-by.
mythrocks Nov 1, 2023
d9540f9
Partitioned test for batched RANGE query.
mythrocks Nov 1, 2023
9b1ac51
Partitioned tests for batched range window functions.
mythrocks Nov 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions integration_tests/src/main/python/window_function_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ def test_window_running_no_part(b_gen, batch_size):
'FIRST(b) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS first_keep_nulls',
'FIRST(b, TRUE) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS first_ignore_nulls',
'NTH_VALUE(b, 1) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nth_1_keep_nulls']

if isinstance(b_gen.data_type, NumericType) and not isinstance(b_gen, FloatGen) and not isinstance(b_gen, DoubleGen):
query_parts.append('sum(b) over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as sum_col')

Expand All @@ -514,6 +515,37 @@ def test_window_running_no_part(b_gen, batch_size):
validate_execs_in_gpu_plan = ['GpuRunningWindowExec'],
conf = conf)


# TODO: ROW vs RANGE parametrization?
@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn)  # set the batch size so we can test multiple stream batches
@pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:'))
def test_range_running_window_no_part(b_gen, batch_size):
    """
    Tests that unpartitioned RANGE-based running window aggregations
    (UNBOUNDED PRECEDING to CURRENT ROW) produce the same results on GPU and CPU,
    across multiple input batch sizes.
    """
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.castFloatToDecimal.enabled': True}
    query_parts = ['COUNT(b) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS count_col',
                   'MIN(b) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS min_col',
                   'MAX(b) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS max_col',
                   'FIRST(b) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS first_keep_nulls',
                   'FIRST(b, TRUE) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS first_ignore_nulls',
                   'NTH_VALUE(b, 1) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nth_1_keep_nulls']

    # SUM is only exercised for exact numeric types; float/double sums are order-dependent.
    if isinstance(b_gen.data_type, NumericType) and not isinstance(b_gen, FloatGen) and not isinstance(b_gen, DoubleGen):
        query_parts.append('SUM(b) OVER (ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sum_col')

    # NTH_VALUE ... IGNORE NULLS syntax is only available after Spark 3.1.1.
    if spark_version() > "3.1.1":
        query_parts.append('NTH_VALUE(b, 1) IGNORE NULLS OVER '
                           '(ORDER BY a RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nth_1_ignore_nulls')

    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: two_col_df(spark, UniqueLongGen(), b_gen, length=1024 * 14),
        "window_agg_table",
        'select ' +
        ', '.join(query_parts) +
        ' from window_agg_table ',
        validate_execs_in_gpu_plan = ['GpuRunningWindowExec'],
        conf = conf)


# Test that we can do a running window sum on floats and doubles. This becomes problematic because we do the agg in parallel
# which means that the result can switch back and forth from Inf to not Inf depending on the order of aggregations.
# We test this by limiting the range of the values in the sum to never hit Inf, and by using abs so we don't have
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -465,11 +465,23 @@ object GpuWindowExec {
}

def isRunningWindow(spec: GpuWindowSpecDefinition): Boolean = spec match {
case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(RowFrame,
GpuSpecialFrameBoundary(UnboundedPreceding), GpuSpecialFrameBoundary(CurrentRow))) => true
case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(RowFrame,
GpuSpecialFrameBoundary(UnboundedPreceding), GpuLiteral(value, _))) if value == 0 => true
case _ => false
case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
RowFrame,
GpuSpecialFrameBoundary(UnboundedPreceding),
GpuSpecialFrameBoundary(CurrentRow))) => true
case GpuWindowSpecDefinition(_, _,
GpuSpecifiedWindowFrame(RowFrame,
GpuSpecialFrameBoundary(UnboundedPreceding), GpuLiteral(value, _)))
if value == 0 => true
case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
RangeFrame,
GpuSpecialFrameBoundary(UnboundedPreceding),
GpuSpecialFrameBoundary(CurrentRow))) => true
case GpuWindowSpecDefinition(_, _,
GpuSpecifiedWindowFrame(RangeFrame,
GpuSpecialFrameBoundary(UnboundedPreceding), GpuLiteral(value, _)))
if value == 0 => true
case _ => false
}

def isUnboundedToUnboundedWindow(spec: GpuWindowSpecDefinition): Boolean = spec match {
Expand Down Expand Up @@ -1674,6 +1686,28 @@ case class GpuRunningWindowExec(

// Extra constructor args (the CPU-side specs) that must survive a copy of this node.
override def otherCopyArgs: Seq[AnyRef] = cpuPartitionSpec :: cpuOrderSpec :: Nil

// Children must be batched the same way this exec batches its own output.
override def childrenCoalesceGoal: Seq[CoalesceGoal] = Seq(outputBatching)

override def outputBatching: CoalesceGoal = {
val isRangeFrame = windowOps.exists {
case GpuAlias(
revans2 marked this conversation as resolved.
Show resolved Hide resolved
GpuWindowExpression(
_, GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(RangeFrame, _, _))),
_) => true
case _ => false
}
if (!isRangeFrame) {
return null // NO batching restrictions on ROW frames.
}
if (gpuPartitionSpec.isEmpty) {
// If unpartitioned, batch on the order-by column.
BatchedByKey(gpuOrderSpec)(cpuOrderSpec)
} else {
// If partitioned, batch on partition-columns + order-by columns.
BatchedByKey(gpuPartitionOrdering ++ gpuOrderSpec)(cpuPartitionOrdering ++ cpuOrderSpec)
}
}

override protected def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputBatches = gpuLongMetric(GpuMetric.NUM_OUTPUT_BATCHES)
val numOutputRows = gpuLongMetric(GpuMetric.NUM_OUTPUT_ROWS)
Expand Down