[Large Tensor] Implemented LT flag for OpPerf testing (apache#17449)

* Passing large_tensor parameter down * Adding large tensor testing functionality for convolutional operators * Added large tensor test functionality for conv ops * Fixing sizing for conv ops * Added gemm large tensor, print on conv * Updated input for gemm ops and print statements * Fixed deconv large tensor test * Added bias for deconv * Added test functionality for nn_activation and nn_basic ops * Fixed deconv bias, implemented large tensor test logic for general ops, added default data for large tensor test * Dropped unnecessary print statements * Fixed lint errors * Added large_tensor parameter to existing function descriptions, added descriptions for functions missing descriptions * Adding docs, changed large_tensor to int64_tensor for clarity * Added warmup/runs to gemm ops, debugging process failure * Resolved merge conflicts, added default params and input switching functionality * Dynamic input handling for default inputs, additional custom data for int64 * Fixed RPD issue * Everything through reduction ops working * Passing large_tensor parameter down * Adding large tensor testing functionality for convolutional operators * Added large tensor test functionality for conv ops * Fixing sizing for conv ops * Added gemm large tensor, print on conv * Updated input for gemm ops and print statements * Fixed deconv large tensor test * Added bias for deconv * Added test functionality for nn_activation and nn_basic ops * Fixed deconv bias, implemented large tensor test logic for general ops, added default data for large tensor test * Dropped unnecessary print statements * Fixed lint errors * Added large_tensor parameter to existing function descriptions, added descriptions for functions missing descriptions * Adding docs, changed large_tensor to int64_tensor for clarity * Added warmup/runs to gemm ops, debugging process failure * Resolved merge conflicts, added default params and input switching functionality * Dynamic input handling for default inputs, additional 
custom data for int64 * Fixed RPD issue * Everything through reduction ops working * Random sampling & loss ops working * Added indices, depth, ravel_data in default_params * Added indexing ops - waiting for merge on ravel * Added optimizer ops * All misc ops working * All NN Basic ops working * Fixed LT input for ROIPooling * Refactored NN Conv tests * Added test for inline optimizer ops * Dropping extra tests to decrease execution time * Switching to inline tests for RNN to support additional modes * Added state_cell as NDArray param, removed linalg testing for int64 tensor * Cleaned up styling * Fixed conv and deconv tests * Retrigger CI for continuous build * Cleaned up GEMM op inputs * Dropped unused param from default_params
anirudh2290 · May 29, 2020 · 8a2e6aa · 8a2e6aa
1 parent 38c9877
commit 8a2e6aa
Show file tree

Hide file tree

Showing 19 changed files with 941 additions and 253 deletions.
diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py
@@ -29,8 +29,8 @@
 """
 
 
-def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the
+def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the
     rearrange operators in MXNet.
 
     Parameters
@@ -41,6 +41,8 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -55,5 +57,5 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='
     mx_rearrange_ops = get_all_rearrange_operators()
 
     # Run benchmarks
-    mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, warmup, runs)
+    mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return mx_rearrange_op_results
diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py
@@ -38,8 +38,8 @@
     get_all_elemen_wise_binary_operators, get_all_misc_binary_operators
 
 
-def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous
+def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the miscellaneous
     binary operators in MXNet.
 
     Parameters
@@ -48,6 +48,10 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi
         Context to run benchmarks
     dtype: str, default 'float32'
         Precision to use for benchmarks
+    profiler: str, default 'native'
+        Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -61,12 +65,12 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi
     # Fetch all Miscellaneous Binary Operators
     mx_binary_misc_ops = get_all_misc_binary_operators()
     # Run benchmarks
-    mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, warmup, runs)
+    mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return mx_binary_op_results
 
 
-def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the binary
+def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary
     broadcast operators in MXNet.
 
     Parameters
@@ -77,6 +81,8 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -90,12 +96,12 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
     # Fetch all Binary Broadcast Operators
     mx_binary_broadcast_ops = get_all_broadcast_binary_operators()
     # Run benchmarks
-    mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, warmup, runs)
+    mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return mx_binary_op_results
 
 
-def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the binary
+def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary
     element_wise operators in MXNet.
 
     Parameters
@@ -106,6 +112,8 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -119,5 +127,5 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32
     # Fetch all Binary Element_wise Operators
     mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators()
     # Run benchmarks
-    mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, warmup, runs)
+    mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return mx_binary_op_results
diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py
@@ -35,8 +35,8 @@
 """
 
 
-def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype)for all the GEMM
+def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the GEMM
     operators (dot, batch_dot, khatri_rao) in MXNet.
 
     Parameters
@@ -47,6 +47,8 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -57,43 +59,75 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ
     Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
 
     """
-    # Benchmark tests for dot operator
+    standard_inputs_dot = [{"lhs": (1024, 1024),
+                            "rhs": (1024, 1024)},
+                           {"lhs": (1000, 10),
+                            "rhs": (1000, 10),
+                            "transpose_b": True},
+                           {"lhs": (1000, 1),
+                            "rhs": (100, 1000),
+                            "transpose_a": True,
+                            "transpose_b": True}]
+    int64_tensor_inputs_dot = [{"lhs": (2**16, 2**16),
+                                "rhs": (2**16, 2**16)},
+                               {"lhs": (4, 2**30),
+                                "rhs": (4, 2**30),
+                                "transpose_b": True},
+                               {"lhs": (2**28, 16),
+                                "rhs": (16, 2**28),
+                                "transpose_a": True,
+                                "transpose_b": True}]
+    standard_inputs_batch_dot = [{"lhs": (32, 1024, 1024),
+                                  "rhs": (32, 1024, 1024)},
+                                 {"lhs": (32, 1000, 10),
+                                  "rhs": (32, 1000, 10),
+                                  "transpose_b": True},
+                                 {"lhs": (32, 1000, 1),
+                                  "rhs": (32, 100, 1000),
+                                  "transpose_a": True,
+                                  "transpose_b": True}]
+    int64_tensor_inputs_batch_dot = [{"lhs": (1, 2**16, 2**16),
+                                      "rhs": (1, 2**16, 2**16)},
+                                     {"lhs": (1, 4, 2**30),
+                                      "rhs": (1, 4, 2**30),
+                                      "transpose_b": True},
+                                     {"lhs": (1, 2**28, 16),
+                                      "rhs": (1, 16, 2**28),
+                                      "transpose_a": True,
+                                      "transpose_b": True}]
+    standard_inputs_khatri_rao = [{"args": [(32, 32), (32, 32)]},
+                                  {"args": [(64, 64), (64, 64)]}]
+    int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}]
+
+    if int64_tensor == 'on':
+        inputs_dot = int64_tensor_inputs_dot
+        inputs_batch_dot = int64_tensor_inputs_batch_dot
+        inputs_khatri_rao = int64_tensor_inputs_khatri_rao
+    else:
+        inputs_dot = standard_inputs_dot
+        inputs_batch_dot = standard_inputs_batch_dot
+        inputs_khatri_rao = standard_inputs_khatri_rao
+
+    # Benchmark tests for dot and batch_dot operators
     dot_benchmark_res = run_performance_test(
         [getattr(MX_OP_MODULE, "dot")], run_backward=True,
         dtype=dtype, ctx=ctx,
-        inputs=[{"lhs": (1024, 1024),
-                 "rhs": (1024, 1024)},
-                {"lhs": (1000, 10),
-                 "rhs": (1000, 10),
-                 "transpose_b": True},
-                {"lhs": (1000, 1),
-                 "rhs": (100, 1000),
-                 "transpose_a": True,
-                 "transpose_b": True}],
+        inputs=inputs_dot,
         warmup=warmup, runs=runs, profiler=profiler)
-    # Benchmark tests for batch_dot operator
+
     batch_dot_benchmark_res = run_performance_test(
         [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True,
         dtype=dtype, ctx=ctx,
-        inputs=[{"lhs": (32, 1024, 1024),
-                 "rhs": (32, 1024, 1024)},
-                {"lhs": (32, 1000, 10),
-                 "rhs": (32, 1000, 10),
-                 "transpose_b": True},
-                {"lhs": (32, 1000, 1),
-                 "rhs": (32, 100, 1000),
-                 "transpose_a": True,
-                 "transpose_b": True}],
+        inputs=inputs_batch_dot,
         warmup=warmup, runs=runs, profiler=profiler)
-    # Operator khatri_rao is not yet implemented for GPU
+    # Operator khatri_rao is not yet implemented for GPU
     khatri_rao_benchmark_res = []
     if ctx != mx.gpu():
         # Benchmark tests for khatri_rao operator
         khatri_rao_benchmark_res = run_performance_test(
             [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False,
             dtype=dtype, ctx=ctx,
-            inputs=[{"args": [(32, 32), (32, 32)]},
-                    {"args": [(64, 64), (64, 64)]}],
+            inputs=inputs_khatri_rao,
             warmup=warmup, runs=runs, profiler=profiler)
 
     # Prepare combined results for GEMM operators

diff --git a/benchmark/opperf/nd_operations/indexing_routines.py b/benchmark/opperf/nd_operations/indexing_routines.py
@@ -35,8 +35,8 @@
 """
 
 
-def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the indexing routines
+def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the indexing routines
     in MXNet.
 
     Parameters
@@ -47,6 +47,8 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -61,5 +63,5 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na
     mx_indexing_ops = get_all_indexing_routines()
 
     # Run benchmarks
-    mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, warmup, runs)
+    mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return mx_indexing_op_results
diff --git a/benchmark/opperf/nd_operations/linalg_operators.py b/benchmark/opperf/nd_operations/linalg_operators.py
@@ -34,8 +34,8 @@
 from benchmark.opperf.utils.common_utils import merge_map_list
 from benchmark.opperf.rules.default_params import MX_OP_MODULE
 
-def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
-    """Runs benchmarks with the given context and precision (dtype) for all the linear algebra
+def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100):
+    """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the linear algebra
     operators in MXNet.
 
     Parameters
@@ -46,6 +46,8 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat
         Precision to use for benchmarks
     profiler: str, default 'native'
         Type of Profiler to use (native/python)
+    int64_tensor: str, default 'off'
+        Input tensor size to use for tests (if on, dimensions >= 2**32)
     warmup: int, default 25
         Number of times to run for warmup
     runs: int, default 100
@@ -74,5 +76,5 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat
     # Fetch all Linear Algebra Operators
     mx_linalg_ops = get_all_linalg_operators()
     # Run benchmarks
-    mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, warmup, runs)
+    mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, int64_tensor, warmup, runs)
     return merge_map_list(linalg_potrf_benchmark + [mx_linalg_op_results])