switch to directly using exclusive scan

apache · Dec 18, 2020 · 1360f30 · 1360f30
1 parent 95c0f61
commit 1360f30
Showing 1 changed file with 25 additions and 17 deletions.
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
@@ -51,6 +51,10 @@ def atomic_add(x, y):
     return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
 
 
+def ceil_div(a, b):
+    return tvm.tir.indexdiv(a + b - 1, b)
+
+
 def rearrange_indices_out_ir(data, output, valid_box_count):
     """Hybrid routine to rearrange nms output to
     move all valid entries to top.
@@ -155,7 +159,7 @@ def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     with ib.new_scope():
         nthread_tx = max_threads
-        nthread_bx = num_anchors // max_threads + 1
+        nthread_bx = ceil_div(num_anchors, max_threads)
         nthread_by = batch_size
         tx = te.thread_axis("threadIdx.x")
         bx = te.thread_axis("blockIdx.x")
@@ -212,9 +216,6 @@ def get_valid_indices_ir(valid_boxes, valid_count, valid_indices):
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
 
-    def ceil_div(a, b):
-        return tvm.tir.indexdiv(a + b - 1, b)
-
     # Copy boxes to valid_indices
     with ib.new_scope():
         nthread_tx = max_threads
@@ -227,19 +228,16 @@ def ceil_div(a, b):
         ib.scope_attr(bx, "thread_extent", nthread_bx)
         ib.scope_attr(by, "thread_extent", nthread_by)
         tid = bx * nthread_tx + tx
-        with ib.if_scope(tid == 0):
-            valid_indices[by, 0] = 0
-        with ib.else_scope():
-            with ib.if_scope(tid < num_anchors):
-                valid_indices[by, tid] = valid_boxes[by, tid - 1]
+        with ib.if_scope(tid < num_anchors):
+            valid_indices[by, tid] = valid_boxes[by, tid]
 
     nthread_tx = max_threads
     nthread_bx = ceil_div(num_anchors, max_threads)
     nthread_by = batch_size
 
-    ## The following algorithm performs parallel prefix sum to get
+    ## The following algorithm performs parallel exclusive scan to get
     ## a tensor that can later be used to select valid indices
-    # Up Sweep of prefix sum
+    # Up Sweep of exclusive scan
     lim = tvm.tir.generic.cast(
         tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(num_anchors, "float64"))), "int64"
     )
@@ -271,9 +269,15 @@ def ceil_div(a, b):
                         by * num_anchors + middle[0] - 1
                     ]
 
-    # Down Sweep of prefix sum
-    with ib.for_range(0, lim - 1, dtype="int64") as l2_width:
-        width = 2 << (lim - l2_width - 2)
+    # Down Sweep of exclusive scan
+    with ib.new_scope():
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(bx, "thread_extent", batch_size)
+        with ib.if_scope(bx < batch_size):
+            valid_indices[(bx + 1) * num_anchors - 1] = 0
+
+    with ib.for_range(0, lim, dtype="int64") as l2_width:
+        width = 2 << (lim - l2_width - 1)
 
         with ib.new_scope():
             tx = te.thread_axis("threadIdx.x")
@@ -291,13 +295,17 @@ def ceil_div(a, b):
             start = ib.allocate("int64", (1,), name="start", scope="local")
             middle = ib.allocate("int64", (1,), name="middle", scope="local")
             end = ib.allocate("int64", (1,), name="end", scope="local")
+            tmp = ib.allocate("int32", (1,), name="end", scope="local")
             start[0] = width * tid
-            with ib.if_scope(tvm.tir.all(start[0] > 0, start[0] < num_anchors)):
+            with ib.if_scope(tvm.tir.all(start[0] < num_anchors)):
                 middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                end[0] = tvm.tir.min(start[0] + width, num_anchors)
                 with ib.if_scope(middle[0] < num_anchors):
-                    valid_indices[by * num_anchors + middle[0] - 1] += valid_indices[
-                        by * num_anchors + start[0] - 1
+                    tmp[0] = valid_indices[by * num_anchors + middle[0] - 1]
+                    valid_indices[by * num_anchors + middle[0] - 1] = valid_indices[
+                        by * num_anchors + end[0] - 1
                     ]
+                    valid_indices[by * num_anchors + end[0] - 1] += tmp[0]
 
     ## Write Sum to valid_count
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)