From 8a2e6aafb85ebee26c341a689d5ce8cfb6398479 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Sat, 29 Feb 2020 00:43:08 -0800
Subject: [PATCH] [Large Tensor] Implemented LT flag for OpPerf testing (#17449)

* Passing large_tensor parameter down
* Adding large tensor testing functionality for convolutional operators
* Added large tensor test functionality for conv ops
* Fixing sizing for conv ops
* Added gemm large tensor, print on conv
* Updated input for gemm ops and print statements
* Fixed deconv large tensor test
* Added bias for deconv
* Added test functionality for nn_activation and nn_basic ops
* Fixed deconv bias, implemented large tensor test logic for general ops, added default data for large tensor test
* Dropped unnecessary print statements
* Fixed lint errors
* Added large_tensor parameter to existing function descriptions, added descriptions for functions missing descriptions
* Adding docs, changed large_tensor to int64_tensor for clarity
* Added warmup/runs to gemm ops, debugging process failure
* Resolved merge conflicts, added default params and input switching functionality
* Dynamic input handling for default inputs, additional custom data for int64
* Fixed RPD issue
* Everything through reduction ops working
* Random sampling & loss ops working
* Added indices, depth, ravel_data in default_params
* Added indexing ops - waiting for merge on ravel
* Added optimizer ops
* All misc ops working
* All NN Basic ops working
* Fixed LT input for ROIPooling
* Refactored NN Conv tests
* Added test for inline optimizer ops
* Dropping extra tests to decrease execution time
* Switching to inline tests for RNN to support additional modes
* Added state_cell as NDArray param, removed linalg testing for int64 tensor
* Cleaned up styling
* Fixed conv and deconv tests
* Retrigger CI for continuous build
* Cleaned up GEMM op inputs
* Dropped unused param from default_params
---
 .../opperf/nd_operations/array_rearrange.py | 8 +-
 .../opperf/nd_operations/binary_operators.py | 26 +-
 .../opperf/nd_operations/gemm_operators.py | 84 ++--
 .../opperf/nd_operations/indexing_routines.py | 8 +-
 .../opperf/nd_operations/linalg_operators.py | 8 +-
 .../opperf/nd_operations/misc_operators.py | 73 ++--
 .../nd_operations/nn_activation_operators.py | 10 +-
 .../nd_operations/nn_basic_operators.py | 78 +++-
 .../opperf/nd_operations/nn_conv_operators.py | 287 ++++++++++----
 .../opperf/nd_operations/nn_loss_operators.py | 8 +-
 .../nd_operations/nn_optimizer_operators.py | 66 ++--
.../random_sampling_operators.py | 8 +- .../nd_operations/reduction_operators.py | 8 +- .../sorting_searching_operators.py | 8 +- .../opperf/nd_operations/unary_operators.py | 26 +- benchmark/opperf/opperf.py | 56 +-- benchmark/opperf/rules/default_params.py | 371 +++++++++++++++++- benchmark/opperf/utils/benchmark_utils.py | 4 +- benchmark/opperf/utils/op_registry_utils.py | 57 +-- 19 files changed, 941 insertions(+), 253 deletions(-) diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py index 12af8345543e..631d0bb997bc 100644 --- a/benchmark/opperf/nd_operations/array_rearrange.py +++ b/benchmark/opperf/nd_operations/array_rearrange.py @@ -29,8 +29,8 @@ """ -def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the rearrange operators in MXNet. Parameters @@ -41,6 +41,8 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -55,5 +57,5 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' mx_rearrange_ops = get_all_rearrange_operators() # Run benchmarks - mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, warmup, runs) + mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_rearrange_op_results diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py index 5d95360a73db..4444219e6054 100644 --- a/benchmark/opperf/nd_operations/binary_operators.py +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -38,8 +38,8 @@ get_all_elemen_wise_binary_operators, get_all_misc_binary_operators -def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous +def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the miscellaneous binary operators in MXNet. 
Parameters @@ -48,6 +48,10 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi Context to run benchmarks dtype: str, default 'float32' Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,12 +65,12 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi # Fetch all Miscellaneous Binary Operators mx_binary_misc_ops = get_all_misc_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary broadcast operators in MXNet. Parameters @@ -77,6 +81,8 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -90,12 +96,12 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', # Fetch all Binary Broadcast Operators mx_binary_broadcast_ops = get_all_broadcast_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary element_wise operators in MXNet. 
Parameters @@ -106,6 +112,8 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 10 Number of times to run for warmup runs: int, default 50 @@ -119,5 +127,5 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 # Fetch all Binary Element_wise Operators mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py index f1028123b421..55b3435a8f24 100644 --- a/benchmark/opperf/nd_operations/gemm_operators.py +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -35,8 +35,8 @@ """ -def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the GEMM +def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the GEMM operators (dot, batch_dot, khatri_rao) in MXNet. Parameters @@ -47,6 +47,8 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -57,43 +59,75 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" - # Benchmark tests for dot operator + standard_inputs_dot = [{"lhs": (1024, 1024), + "rhs": (1024, 1024)}, + {"lhs": (1000, 10), + "rhs": (1000, 10), + "transpose_b": True}, + {"lhs": (1000, 1), + "rhs": (100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_dot = [{"lhs": (2**16, 2**16), + "rhs": (2**16, 2**16)}, + {"lhs": (4, 2**30), + "rhs": (4, 2**30), + "transpose_b": True}, + {"lhs": (2**28, 16), + "rhs": (16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_batch_dot = [{"lhs": (32, 1024, 1024), + "rhs": (32, 1024, 1024)}, + {"lhs": (32, 1000, 10), + "rhs": (32, 1000, 10), + "transpose_b": True}, + {"lhs": (32, 1000, 1), + "rhs": (32, 100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_batch_dot = [{"lhs": (1, 2**16, 2**16), + "rhs": (1, 2**16, 2**16)}, + {"lhs": (1, 4, 2**30), + "rhs": (1, 4, 2**30), + "transpose_b": True}, + {"lhs": (1, 2**28, 16), + "rhs": (1, 16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_khatri_rao = [{"args": [(32, 32), (32, 32)]}, + {"args": [(64, 64), (64, 64)]}] + int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}] + + if int64_tensor == 'on': + inputs_dot = int64_tensor_inputs_dot + inputs_batch_dot = int64_tensor_inputs_batch_dot + inputs_khatri_rao = int64_tensor_inputs_khatri_rao + else: + inputs_dot = standard_inputs_dot + inputs_batch_dot = standard_inputs_batch_dot + inputs_khatri_rao = standard_inputs_khatri_rao + + # Benchmark tests for dot and batch_dot operators dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (1024, 1024), - "rhs": (1024, 1024)}, - {"lhs": (1000, 10), - "rhs": (1000, 10), - "transpose_b": True}, - {"lhs": (1000, 1), - "rhs": (100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_dot, warmup=warmup, runs=runs, profiler=profiler) - # Benchmark tests for batch_dot operator + batch_dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (32, 1024, 1024), - "rhs": (32, 1024, 1024)}, - {"lhs": (32, 1000, 10), - "rhs": (32, 1000, 10), - "transpose_b": True}, - {"lhs": (32, 1000, 1), - "rhs": (32, 100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_batch_dot, warmup=warmup, runs=runs, profiler=profiler) - # Operator khatri_rao is not yet implemented for GPU + # Operator khatri_rao is not yet implemented for GPU khatri_rao_benchmark_res = [] if ctx != mx.gpu(): # Benchmark tests for khatri_rao operator khatri_rao_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False, dtype=dtype, ctx=ctx, - inputs=[{"args": [(32, 32), (32, 32)]}, - {"args": [(64, 64), (64, 64)]}], + inputs=inputs_khatri_rao, warmup=warmup, runs=runs, profiler=profiler) # Prepare combined results for GEMM operators diff --git a/benchmark/opperf/nd_operations/indexing_routines.py b/benchmark/opperf/nd_operations/indexing_routines.py index a957785940a5..ee99de2b57bf 100644 --- a/benchmark/opperf/nd_operations/indexing_routines.py +++ b/benchmark/opperf/nd_operations/indexing_routines.py @@ -35,8 +35,8 @@ """ -def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the indexing routines +def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', 
profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the indexing routines in MXNet. Parameters @@ -47,6 +47,8 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,5 +63,5 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na mx_indexing_ops = get_all_indexing_routines() # Run benchmarks - mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, warmup, runs) + mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_indexing_op_results diff --git a/benchmark/opperf/nd_operations/linalg_operators.py b/benchmark/opperf/nd_operations/linalg_operators.py index d2c1cee0a307..1d35ef1fc951 100644 --- a/benchmark/opperf/nd_operations/linalg_operators.py +++ b/benchmark/opperf/nd_operations/linalg_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the linear algebra +def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the linear algebra operators in MXNet. Parameters @@ -46,6 +46,8 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -74,5 +76,5 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat # Fetch all Linear Algebra Operators mx_linalg_ops = get_all_linalg_operators() # Run benchmarks - mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, warmup, runs) + mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(linalg_potrf_benchmark + [mx_linalg_op_results]) diff --git a/benchmark/opperf/nd_operations/misc_operators.py b/benchmark/opperf/nd_operations/misc_operators.py index 5a0efc57de0d..fb8535a959a0 100644 --- a/benchmark/opperf/nd_operations/misc_operators.py +++ b/benchmark/opperf/nd_operations/misc_operators.py @@ -37,7 +37,7 @@ from benchmark.opperf.custom_operations.custom_operations import CustomAddOneProp -def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous operators in MXNet. 
@@ -49,6 +49,8 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,6 +61,48 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. """ + + standard_inputs_array_ops = [{"args": [(1024, 1024)], + "num_arrays": 1}, + {"args": [(10000, 1)], + "num_arrays": 1}, + {"args": [(10000, 10)], + "num_arrays": 1}] + int64_tensor_inputs_array_ops = [{"args": [(2**32, 1)], + "num_arrays":1}] + standard_inputs_add_n = [{"args": [(1024, 1024)]}, + {"args": [(10000, 1)]}, + {"args": [(10000, 10)]}] + int64_tensor_inputs_add_n = [{"args": [(2**16, 2**16)]}] + standard_inputs_upsampling = [{"args": (32, 3, 256, 256), + "scale": 2, + "sample_type": "nearest"}, + {"args": (32, 3, 10000, 1), + "scale": 4, + "sample_type": "nearest"}] + int64_tensor_inputs_upsampling = [{"args": (2**32 + 1, 1, 1, 1), + "scale": 2, + "sample_type": "nearest"}] + standard_inputs_custom = [{"args": [(1024, 1024)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 1)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 10)], + "op_type": "CustomAddOne"}] + int64_tensor_inputs_custom = [{"args": [(2**32 + 1, 1)], + "op_type": "CustomAddOne"}] + + if int64_tensor == 'on': + inputs_array_ops = int64_tensor_inputs_array_ops + inputs_add_n = int64_tensor_inputs_add_n + inputs_upsampling = int64_tensor_inputs_upsampling + inputs_custom = int64_tensor_inputs_custom + else: + inputs_array_ops = standard_inputs_array_ops + inputs_add_n = standard_inputs_add_n + inputs_upsampling = standard_inputs_upsampling + inputs_custom = standard_inputs_custom + # Individual tests for ops with positional args array_ops_benchmark = run_performance_test([getattr(MX_OP_MODULE, "reset_arrays"), getattr(MX_OP_MODULE, "multi_all_finite"), @@ -67,12 +111,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_arrays": 1}, - {"args": [(10000, 1)], - "num_arrays": 1}, - {"args": [(10000, 10)], - "num_arrays": 1}], + inputs=inputs_array_ops, warmup=warmup, runs=runs) add_n_benchmark = run_performance_test([getattr(MX_OP_MODULE, "add_n")], @@ -80,9 +119,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)]}, - {"args": [(10000, 1)]}, - {"args": [(10000, 10)]}], + inputs=inputs_add_n, warmup=warmup, runs=runs) # There are currently issus with UpSampling with bilinear interpolation. 
@@ -92,12 +129,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": (32, 3, 256, 256), - "scale": 2, - "sample_type": "nearest"}, - {"args": (32, 3, 10000, 1), - "scale": 4, - "sample_type": "nearest"}], + inputs=inputs_upsampling, warmup=warmup, runs=runs) # Create and register CustomAddOne operator for use in Custom op testing @@ -108,17 +140,12 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 1)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 10)], - "op_type": "CustomAddOne"}], + inputs=inputs_custom, warmup=warmup, runs=runs) # Fetch remaining Miscellaneous Operators mx_misc_ops = get_remaining_miscellaneous_operators() # Run benchmarks - mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(array_ops_benchmark + add_n_benchmark + upsampling_benchmark + custom_benchmark + [mx_misc_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py index b77777cc04dd..161dfe72123e 100644 --- a/benchmark/opperf/nd_operations/nn_activation_operators.py +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -43,9 +43,9 @@ """ -def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the activation - operators in MXNet. +def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the activation + operators (relu, sigmoid, softmax) in MXNet. 
Parameters ---------- @@ -55,6 +55,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= Precision to use for benchmarks profiler: str, default 'native' Module to use for tracking benchmark excecution time + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -70,6 +72,6 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= mx_activation_ops = get_all_nn_activation_operators() # Run benchmarks - mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, warmup, runs) + mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_activation_op_results \ No newline at end of file diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py index a8273d4105dc..f3007bac188c 100644 --- a/benchmark/opperf/nd_operations/nn_basic_operators.py +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -20,6 +20,10 @@ from benchmark.opperf.utils.op_registry_utils import get_all_nn_basic_operators from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list +from benchmark.opperf.rules.default_params import MX_OP_MODULE + """Performance benchmark tests for MXNet NDArray basic NN Operators. 1. FullyConnected @@ -45,8 +49,8 @@ """ -def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the NN basic +def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the basic neural network operators in MXNet. 
Parameters @@ -56,7 +60,9 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n dtype: str, default 'float32' Precision to use for benchmarks profiler: str, default 'native' - Module to use for tracking benchmark excecution time + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -68,9 +74,71 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n """ + standard_data_list = [(1024, 4, 4)] + int64_tensor_data_list = [(2**28, 4, 4)] + + if int64_tensor == 'on': + data_list = int64_tensor_data_list + else: + data_list = standard_data_list + + for data in data_list: + rnn_relu_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_relu", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_tanh_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_tanh", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_lstm_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (28,), + "state": (1, 4, 1), + "state_cell": (1, 4, 1), + "mode": "lstm", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_gru_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (21,), + "state": (1, 4, 1), + "mode": "gru", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) # Fetch all NN Basic Operators mx_nn_basic_ops = get_all_nn_basic_operators() # Run benchmarks - mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, warmup, runs) - return mx_nn_basic_op_results + mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) + return merge_map_list(rnn_relu_benchmark + rnn_tanh_benchmark + rnn_lstm_benchmark + rnn_gru_benchmark + [mx_nn_basic_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py index 9c80f00c354b..d44b89117511 100644 --- a/benchmark/opperf/nd_operations/nn_conv_operators.py +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -52,16 +52,55 @@ """ -def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the pooling + operators in MXNet. 
+ + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ pool_types = ['avg', 'max', 'sum'] global_pool_types = [0, 1] + standard_data_list_pool1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_pool1d = [(1, 1, 2**32)] + standard_data_list_pool2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_pool2d = [(2**28, 1, 4, 4)] + standard_data_list_roipool = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_roipool = [(32, 3, 2**13, 2**13)] + + if int64_tensor == 'on': + data_list_pool1d = int64_tensor_data_list_pool1d + data_list_pool2d = int64_tensor_data_list_pool2d + data_list_roipool = int64_tensor_data_list_roipool + else: + data_list_pool1d = standard_data_list_pool1d + data_list_pool2d = standard_data_list_pool2d + data_list_roipool = standard_data_list_roipool + # Run 1D and 2D Pooling performance runs pool1d_benchmark_res = [] pool2d_benchmark_res = [] for pool_type in pool_types: for global_pool in global_pool_types: - for pool1d_data in [(32, 3, 256), (32, 3, 64)]: + for pool1d_data in data_list_pool1d: pool1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -73,10 +112,10 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": 1, "pad": 1} - ], + ], warmup=warmup, runs=runs) - for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for pool2d_data in data_list_pool2d: pool2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -88,68 +127,118 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0)} - ], + ], warmup=warmup, runs=runs) - # Run ROI Pooling performance runs - roipool_benchmark_res = [] - for roipool_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: - roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": roipool_data, - "rois": (32, 5), - "pooled_size": (2, 2), - "spatial_scale": .5} - ], - warmup=warmup, - runs=runs) + # Run ROI Pooling performance runs + roipool_benchmark_res = [] + for roipool_data in data_list_roipool: + roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": roipool_data, + "rois": (32, 5), + "pooled_size": (2, 2), + "spatial_scale": .5} + ], + warmup=warmup, + runs=runs) # Prepare combined results mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res) return mx_pooling_op_results -def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - # Conv1D Benchmarks +def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the 
given context, precision (dtype), and input data size (int64_tensor) for all the convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d = [(2**30, 1, 4)] + standard_weight_conv1d = (1, 3, 3) + int64_tensor_weight_conv1d = (1, 1, 1) + standard_kernel_conv1d = (3,) + int64_tensor_kernel_conv1d = (1,) + standard_data_list_conv2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d = [(2**28, 1, 4, 4)] + standard_weight_conv2d = (1, 3, 3, 3) + int64_tensor_weight_conv2d = (1, 1, 1, 1) + standard_kernel_conv2d = (3, 3) + int64_tensor_kernel_conv2d = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d = int64_tensor_data_list_conv1d + weight_conv1d = int64_tensor_weight_conv1d + kernel_conv1d = int64_tensor_kernel_conv1d + data_list_conv2d = int64_tensor_data_list_conv2d + weight_conv2d = int64_tensor_weight_conv2d + kernel_conv2d = int64_tensor_kernel_conv2d + else: + data_list_conv1d = standard_data_list_conv1d + weight_conv1d = standard_weight_conv1d + kernel_conv1d = standard_kernel_conv1d + data_list_conv2d = standard_data_list_conv2d + weight_conv2d = standard_weight_conv2d + kernel_conv2d = standard_kernel_conv2d + conv1d_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + conv2d_benchmark_res = [] + # Conv1D Benchmarks + for conv_data in data_list_conv1d: conv1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3), - "bias": (64,), - "kernel": (3,), + "weight": weight_conv1d, + "bias": (1,), + "kernel": kernel_conv1d, "stride": (1,), "dilate": (1,), "pad": (0,), - "num_filter": 64, - "layout": 'NCW'} - ], + "num_filter": 1, + "layout": 'NCW'}], warmup=warmup, runs=runs) # Conv2D Benchmarks - conv2d_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d: conv2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3, 3), - "bias": (64,), - "kernel": (3, 3), + "weight": weight_conv2d, + "bias": (1,), + "kernel": kernel_conv2d, "stride": (1, 1), "dilate": (1, 1), "pad": (0, 0), - "num_filter": 64, - "layout": 'NCHW'} - ], + "num_filter": 1, + "layout": 'NCHW'}], warmup=warmup, runs=runs) # Prepare combined results @@ -157,50 +246,98 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler return mx_conv_op_results -def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', dtype='float32', warmup=10, runs=50): +def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', int64_tensor='off', dtype='float32', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the 
transpose convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d_transpose = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d_transpose = [(2**30, 1, 4)] + standard_weight_conv1d_transpose = (3, 1, 3) + int64_tensor_weight_conv1d_transpose = (1, 1, 1) + standard_kernel_conv1d_transpose = (3,) + int64_tensor_kernel_conv1d_transpose = (1,) + standard_data_list_conv2d_transpose = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d_transpose = [(2**28, 1, 4, 4)] + standard_weight_conv2d_transpose = (3, 1, 3, 3) + int64_tensor_weight_conv2d_transpose = (1, 1, 1, 1) + standard_kernel_conv2d_transpose = (3, 3) + int64_tensor_kernel_conv2d_transpose = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d_transpose = int64_tensor_data_list_conv1d_transpose + weight_conv1d_transpose = int64_tensor_weight_conv1d_transpose + kernel_conv1d_transpose = int64_tensor_kernel_conv1d_transpose + data_list_conv2d_transpose = int64_tensor_data_list_conv2d_transpose + weight_conv2d_transpose = int64_tensor_weight_conv2d_transpose + kernel_conv2d_transpose = int64_tensor_kernel_conv2d_transpose + else: + data_list_conv1d_transpose = standard_data_list_conv1d_transpose + weight_conv1d_transpose = standard_weight_conv1d_transpose + kernel_conv1d_transpose = standard_kernel_conv1d_transpose + data_list_conv2d_transpose = standard_data_list_conv2d_transpose + weight_conv2d_transpose = standard_weight_conv2d_transpose + kernel_conv2d_transpose = standard_kernel_conv2d_transpose + # Conv1DTranspose Benchmarks conv1d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + for conv_data in data_list_conv1d_transpose: conv1d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3), - "bias": (64,), - "kernel": (3,), - "stride": (1,), - "dilate": (1,), - "pad": (0,), - "adj": (0,), - "num_filter": 64, - "no_bias": False, - "layout": 'NCW'} - ], - warmup=warmup, - runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv1d_transpose, + "bias": (1,), + "kernel": kernel_conv1d_transpose, + "stride": (1,), + "dilate": (1,), + "pad": (0,), + "num_filter": 1, + "no_bias": False, + "layout": 'NCW'}], + warmup=warmup, + runs=runs) # Conv2DTranspose Benchmarks conv2d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d_transpose: conv2d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3, 3), - "bias": (64,), - "kernel": (3, 3), - "stride": (1, 1), - "dilate": (1, 1), - "pad": (0, 0), - "num_filter": 64, - "no_bias": False, - "layout": 'NCHW'} - ], - warmup=warmup, - 
runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv2d_transpose, + "bias": (1,), + "kernel": kernel_conv2d_transpose, + "stride": (1, 1), + "pad": (0, 0), + "num_filter": 1, + "no_bias": False, + "layout": 'NCHW'}], + warmup=warmup, + runs=runs) # Prepare combined results mx_transpose_conv_op_results = merge_map_list(conv1d_transpose_benchmark_res + conv2d_transpose_benchmark_res) return mx_transpose_conv_op_results diff --git a/benchmark/opperf/nd_operations/nn_loss_operators.py b/benchmark/opperf/nd_operations/nn_loss_operators.py index 9d894087343b..dea19f14f1af 100644 --- a/benchmark/opperf/nd_operations/nn_loss_operators.py +++ b/benchmark/opperf/nd_operations/nn_loss_operators.py @@ -28,8 +28,8 @@ """ -def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the Neural Network loss operators in MXNet. Parameters @@ -40,6 +40,8 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ mx_loss_ops = get_all_loss_operators() # Run benchmarks - mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, warmup, runs) + mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_loss_op_results diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py index ac380655d136..db18b30081d4 100644 --- a/benchmark/opperf/nd_operations/nn_optimizer_operators.py +++ b/benchmark/opperf/nd_operations/nn_optimizer_operators.py @@ -54,8 +54,8 @@ """ -def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the neural network +def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the neural network optimizer update operators in MXNet. Parameters @@ -66,6 +66,8 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -76,60 +78,68 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + standard_shape = (5, 5) + int64_tensor_shape = (2**16, 2**16) + + if int64_tensor == 'on': + arg_shape = int64_tensor_shape + else: + arg_shape = standard_shape + # Run independent tests for ops that need specific input data multi_mp_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), - "args3": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}],run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), + "args3": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}],run_backward=False) multi_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_mp_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": 
nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), - "args2": nd.random_normal(shape=(5,5)), "args3": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), + "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) # Fetch remaining optimizer operators mx_optimizer_ops = get_all_optimizer_operators() # Run benchmarks - mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs) + mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(multi_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res + preloaded_multi_mp_sgd_res +\ preloaded_multi_sgd_mom_res + preloaded_multi_mp_sgd_res + preloaded_multi_mp_sgd_mom_res +\ - [mx_optimizer_op_results]) + multi_mp_sgd_mom_res + preloaded_multi_sgd_res + [mx_optimizer_op_results]) diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py index b6a1f44dba25..777f26af317c 100644 --- a/benchmark/opperf/nd_operations/random_sampling_operators.py +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators -def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the random sampling +def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the random sampling operators in MXNet. 
Parameters @@ -46,6 +46,8 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,5 +61,5 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p # Fetch all Random Sampling Operators mx_random_sample_ops = get_all_random_sampling_operators() # Run benchmarks - mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, warmup, runs) + mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_random_sample_op_results diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py index 6cc0d49c899b..d6e4b6dd6c2d 100644 --- a/benchmark/opperf/nd_operations/reduction_operators.py +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -31,8 +31,8 @@ from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks -def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the reduction +def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the reduction operators in MXNet. Parameters @@ -43,6 +43,8 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -56,5 +58,5 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile # Fetch all Reduction Operators mx_reduction_broadcast_ops = get_all_reduction_operators() # Run benchmarks - mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_reduction_op_results diff --git a/benchmark/opperf/nd_operations/sorting_searching_operators.py b/benchmark/opperf/nd_operations/sorting_searching_operators.py index 2d936cdc48ca..d0d9fc064888 100644 --- a/benchmark/opperf/nd_operations/sorting_searching_operators.py +++ b/benchmark/opperf/nd_operations/sorting_searching_operators.py @@ -29,8 +29,8 @@ """ -def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the sorting and searching +def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the sorting and searching operators in MXNet. 
Parameters @@ -41,6 +41,8 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr # Fetch all Random Sampling Operators mx_sort_search_ops = get_all_sorting_searching_operators() # Run benchmarks - mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, warmup, runs) + mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_sort_search_op_results diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py index 08075906fae5..53cab57cfe15 100644 --- a/benchmark/opperf/nd_operations/unary_operators.py +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -38,8 +38,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the unary +def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the unary operators in MXNet. Parameters @@ -50,6 +50,8 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -60,16 +62,26 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + + standard_inputs = [{"args": [(1024, 1024)], + "num_outputs":1}, + {"args": [(10000, 1)], + "num_outputs":1}] + int64_tensor_inputs = [{"args": [(2**32, 1)], + "num_outputs":1}] + + if int64_tensor == 'on': + inputs = int64_tensor_inputs + else: + inputs = standard_inputs + # Run amp_multicast as it needs data as positional argument amp_multicast_benchmark = run_performance_test([getattr(MX_OP_MODULE, "amp_multicast")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_outputs":1}, - {"args": [(10000, 1)], - "num_outputs":1}], + inputs=inputs, warmup=warmup, runs=runs) @@ -77,5 +89,5 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n mx_unary_broadcast_ops = get_all_unary_operators() # Run benchmarks - mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results]) diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py index 5b8c43f417da..c0ac7b7dcd98 100755 --- a/benchmark/opperf/opperf.py +++ b/benchmark/opperf/opperf.py @@ -51,7 +51,7 @@ get_current_runtime_features -def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Run all the MXNet operators (NDArray) benchmarks. Returns @@ -63,64 +63,66 @@ def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n # *************************MXNET TENSOR OPERATOR BENCHMARKS***************************** # Run all Unary operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_misc_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all GEMM operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Random sampling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + 
mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Reduction operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Sorting and Searching operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Array Rearrange operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Indexing routines benchmarks with default input values - mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ************************ MXNET NN OPERATOR BENCHMARKS **************************** # Run all basic NN operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Activation operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Pooling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Optimizer operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, 
profiler=profiler, warmup=warmup, runs=runs)) - + mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) + # Run all Transpose Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all NN loss operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Miscellaneous operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) - # Run all Linear Algebra operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + # Linear Algebra operators do not work with int64 tensor data. Issue tracked here: https://github.com/apache/incubator-mxnet/issues/17716 + if int64_tensor == 'off': + # Run all Linear Algebra operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ****************************** PREPARE FINAL RESULTS ******************************** final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results) @@ -162,6 +164,11 @@ def main(): help='Use built-in CPP profiler (native) or Python' 'time module.' 'Valid Inputs - native, python') + + parser.add_argument('--int64-tensor', type=str, default='off', + help='Run performance tests with large tensor input' + 'data (dimension >= 2**32) or standard input data.' + 'Valid Inputs - on, off') parser.add_argument('-w', '--warmup', type=int, default=25, help='Number of times to run for warmup.' @@ -169,7 +176,7 @@ def main(): parser.add_argument('-r', '--runs', type=int, default=100, help='Number of runs to capture benchmark results.' 
- 'Valid Inputs - positive integers') + 'Valid Inputs - positive integers') args = parser.parse_args() logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args)) @@ -180,9 +187,10 @@ def main(): ctx = _parse_mxnet_context(args.ctx) dtype = args.dtype profiler = args.profiler + int64_tensor = args.int64_tensor warmup = args.warmup runs = args.runs - benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs) + benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs) # Sort benchmark results alphabetically by op name final_benchmark_results = dict() diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py index 15bcd72b0553..a4362fa63e11 100644 --- a/benchmark/opperf/rules/default_params.py +++ b/benchmark/opperf/rules/default_params.py @@ -35,15 +35,22 @@ DEFAULT_DTYPE_INT = ['int32', 'int64', 'int32'] # randint works for int* types only DEFAULT_DTYPE_FLOAT = ['float16', 'float32', 'float64'] # random_exp works for float* types only +DEFAULT_DATA_LARGE_TENSOR = [(2**16, 2**16)] + # For Binary miscellaneous operators like choose_element0_index # argument data must be indexed via an NDArray. # NOTE: Data used is DEFAULT_DATA DEFAULT_INDEX = [(1, 1024), (1, 1), (1, 100)] +DEFAULT_INDEX_LARGE_TENSOR = [(1, 2**16)] + # For Binary broadcast operators like - broadcast_add/sub/mod/logical_and etc.. DEFAULT_LHS = [(1024, 1024), (10000, 10), (10000, 1)] DEFAULT_RHS = [(1024, 1024), (10000, 10), (10000, 1)] +DEFAULT_LHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] +DEFAULT_RHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] + # For operators like - random_uniform, random_normal etc.. DEFAULT_SHAPE = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_SAMPLE = [(2,)] @@ -52,6 +59,15 @@ DEFAULT_K = [1] DEFAULT_P = [1] +DEFAULT_SHAPE_LARGE_TENSOR = [(2**16, 2**16)]#, (2**32, 1), (2**25, 2**7)] +DEFAULT_SAMPLE_LARGE_TENSOR = [(2**32,)] +DEFAULT_DATA_RPD_LARGE_TENSOR = [(2**32 + 1, 5)] +DEFAULT_ALPHA_RPD_LARGE_TENSOR = [(2**32,)] +DEFAULT_SAMPLE_RPE_LARGE_TENSOR = [(1, 2**32)] +DEFAULT_LAM_RPE_LARGE_TENSOR = [(1,)] +DEFAULT_SAMPLE_RPG_LARGE_TENSOR = [(1, 2**32 + 1)] +DEFAULT_ALPHA_RPG_LARGE_TENSOR = [(1,)] + # For operators like - sample_uniform, sample_normal etc.. # NOTE: There are many overlapping operators in random_* and sample_*, # Ex: random_uniform, sample_uniform. 
Parameter names are same, but, for @@ -73,6 +89,24 @@ DEFAULT_TARGET_SHAPE = [(256, 6)] DEFAULT_DATA_SM = [(32, 32), (64, 64)] +DEFAULT_LOW_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_HIGH_ND_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_MU_ND_LARGE_TENSOR = [[2.0] * 2**16 + [2.5] * 2**16] +DEFAULT_SIGMA_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_ALPHA_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_BETA_ND_LARGE_TENSOR = [[1.0] * 2**16 + [0.7] * 2**16] +DEFAULT_LAM_ND_LARGE_TENSOR = [[1.0] * 2**16 + [8.5] * 2**16] +DEFAULT_K_ND_LARGE_TENSOR = [[20] * 2**16 + [49] * 2**16] +DEFAULT_P_ND_LARGE_TENSOR = [[0.4] * 2**16 + [0.77] * 2**16] +DEFAULT_DATA_BILINEAR_LARGE_TENSOR = [(2**32, 1, 1, 1)] +DEFAULT_GRID_LARGE_TENSOR = [(2**32, 2, 1, 1)] +DEFAULT_DATA_GRIDGEN_LARGE_TENSOR = [(2**31, 2, 1, 1), (1, 6)] +DEFAULT_TARGET_SHAPE_LARGE_TENSOR = [(1, 6)] +DEFAULT_DATA_SM_LARGE_TENSOR = [(2**32,)] +DEFAULT_SHAPE_SE_LARGE_TENSOR = [(1,)] +DEFAULT_LAM_SE_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_SHAPE_SU_LARGE_TENSOR = [(2**32,)] + # For reduction operators # NOTE: Data used is DEFAULT_DATA DEFAULT_AXIS_SHAPE = [(), 0, (0, 1)] @@ -107,7 +141,6 @@ DEFAULT_NSIZE = [3] DEFAULT_PARAMETERS = [(7,), (104,)] DEFAULT_STATE = [(1, 4, 1), (2, 10000, 4)] -DEFAULT_MODE = ["rnn_relu", "rnn_tanh"] DEFAULT_STATE_SIZE = [1, 4] DEFAULT_NUM_LAYERS = [1, 2] DEFAULT_NUM_GROUPS = [1, 10] @@ -119,6 +152,30 @@ DEFAULT_KERNEL = [(1, 1, 1), (1, 1, 1)] DEFAULT_STRIDE = [(2, 2, 2), (1, 1, 1)] +DEFAULT_DATA_NN_BASIC_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_NUM_HIDDEN_LARGE_TENSOR = [(1,)] +DEFAULT_BIAS_LARGE_TENSOR = [(1,)] +DEFAULT_FLATTEN_LARGE_TENSOR = [False] +DEFAULT_GAMMA_LARGE_TENSOR = [(1,)] +DEFAULT_BETA_LARGE_TENSOR = [(1,)] +DEFAULT_MOVING_MEAN_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_MOVING_VAR_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_INPUT_DIM_LARGE_TENSOR = [2**32] +DEFAULT_OUTPUT_DIM_LARGE_TENSOR = [1] +DEFAULT_KERNEL_SIZE_LARGE_TENSOR = [1] +DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR = [1] +DEFAULT_STRIDE_1_LARGE_TENSOR = [1] +DEFAULT_STRIDE_2_LARGE_TENSOR = [1] +DEFAULT_DILATE_LARGE_TENSOR = [[]] +DEFAULT_PAD_LARGE_TENSOR = [[]] +DEFAULT_OUTPUT_SIZE_LARGE_TENSOR = [(2, 2, 1)] +DEFAULT_KERNEL_LARGE_TENSOR = [(1, 1, 1)] +DEFAULT_STRIDE_LARGE_TENSOR = [[]] +DEFAULT_PARAMETERS_LARGE_TENSOR = [(7,)] +DEFAULT_STATE_LARGE_TENSOR = [(1, 4, 1)] +DEFAULT_STATE_SIZE_LARGE_TENSOR = [1] +DEFAULT_NUM_LAYERS_LARGE_TENSOR = [1] + # BatchNorm DEFAULT_AXIS_BN = [1] @@ -132,41 +189,81 @@ # SVMOutput DEFAULT_LABEL_SVM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SVM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SVM_LARGE_TENSOR = [(2**29, 2, 2)] + # SoftmaxOutput DEFAULT_LABEL_SM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SO_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SO_LARGE_TENSOR = [(2**29, 2, 2)] + # FullyConnected DEFAULT_WEIGHT_FC = [(64, 3 * 256 * 256), (64, 10)] +DEFAULT_DATA_FC_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WEIGHT_FC_LARGE_TENSOR = [(1, 1)] +DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR = [1] + # Embedding DEFAULT_WEIGHT_EMBEDDING = [(3, 4), (16, 9)] +DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR = [(2**32, 1)] + # GroupNorm DEFAULT_DATA_GN = [(32, 3, 256, 256), (32, 10, 10000, 10)] DEFAULT_BETA_GAMMA_GN = [(1,), (10,)] +DEFAULT_DATA_GN_LARGE_TENSOR = [(2**27, 4, 4, 2)] +DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR = [(1,)] + # Dropout DEFAULT_DATA_DROPOUT = [(32, 3, 256, 256), (10000, 10)] DEFAULT_MODE_DROPOUT = ["always"] +DEFAULT_DATA_DROPOUT_LARGE_TENSOR = [(2**32 + 1,)] 
+DEFAULT_P_DROPOUT_LARGE_TENSOR = [.5] +DEFAULT_AXES_DROPOUT_LARGE_TENSOR = [[]] + # SpatialTransformer DEFAULT_DATA_ST = [(32, 3, 256, 6), (256, 3, 10000, 6)] DEFAULT_LOC_TAR_ST = [(32, 6), (256, 6)] +DEFAULT_DATA_ST_LARGE_TENSOR = [(2, 2**29, 1, 6)] +DEFAULT_LOC_TAR_ST_LARGE_TENSOR = [(2, 6)] + # im2col DEFAULT_KERNEL_I2C = [(3,), (3, 3)] DEFAULT_STRIDE_I2C = [(1,), (1, 1)] +DEFAULT_DATA_I2C_LARGE_TENSOR = [(2**29, 2, 2, 6)] +DEFAULT_KERNEL_I2C_LARGE_TENSOR = [(1,)] +DEFAULT_STRIDE_I2C_LARGE_TENSOR = [[]] + # col2im DEFAULT_DATA_C2I = [(32, 64, 256), (32, 64, 256)] -# RNN -DEFAULT_DATA_RNN = [(32, 4, 4), (512, 10000, 10)] -DEFAULT_P_RNN = [.5] +DEFAULT_DATA_C2I_LARGE_TENSOR = [(1, 2**30, 4)] # LRN DEFAULT_BETA_LRN = [.2] +DEFAULT_DATA_LRN_LARGE_TENSOR = [(2**27, 4, 4, 2)] + +# Correlation +DEFAULT_DATA1_LARGE_TENSOR = [(2**23, 8, 8, 8)] +DEFAULT_DATA2_LARGE_TENSOR = [(2**23, 8, 8, 8)] + +# For regression operators +DEFAULT_DATA_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] + +# For normalization operators +DEFAULT_DATA_NORM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_GAMMA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_BETA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_AXIS_LARGE_TENSOR = [-1] + # For optimizer operators DEFAULT_WEIGHT = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_GRAD = [(1024, 1024), (10000, 1), (10000, 100)] @@ -194,6 +291,20 @@ DEFAULT_CLIP_WEIGHTS = [-1.0, 0.8] DEFAULT_LAZY_UPDATE = [0, 1] +DEFAULT_WEIGHT_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_GRAD_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MOM_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MEAN_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_VAR_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_N_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_D_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_V_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_Z_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_G_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_R1_LARGE_TENSOR = [(1,)] +DEFAULT_R2_LARGE_TENSOR = [(1,)] +DEFAULT_DELTA_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] + # For rearrange operators # NOTE: Data needs to be a 4D tensor for operators like space_to_depth and depth_to_space # Hence below we append 4d to mark the difference. 
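Reviewer note (not part of the patch): the data and weight shapes added above are generally chosen so that the total element count reaches at least 2**32, which is what forces int64 indexing. A quick sanity check over a few representative shapes, assuming NumPy is available, might look like:

    import numpy as np

    # Shapes taken from the DEFAULT_*_LARGE_TENSOR values above
    for shape in [(2**16, 2**16), (2**32, 1), (2**25, 2**7), (2**32 + 1, 1)]:
        n_elements = int(np.prod(shape, dtype=np.int64))
        assert n_elements >= 2**32, shape  # beyond the signed 32-bit index range
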
@@ -201,6 +312,9 @@ DEFAULT_DATA_4d = [(1, 4, 2, 4), (10, 25, 10, 100)] DEFAULT_BLOCK_SIZE = [2, 5] +DEFAULT_DATA_4d_LARGE_TENSOR = [(1, 4, 2, 2**29), (1,2**4,2**4,2**24)] +DEFAULT_BLOCK_SIZE_LARGE_TENSOR = [2, 4] + # For miscellaneous operators DEFAULT_DATA_SQUEEZE = [(1, 1024, 1024), (32, 1, 256, 256)] DEFAULT_AXIS_SQUEEZE = [0, 1] @@ -217,6 +331,15 @@ DEFAULT_MHS = [(1024,), (10000,), (10000,)] DEFAULT_RHS_FEI = [(1024,), (10000,), (10000,)] +DEFAULT_DATA_SQUEEZE_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR = [1] +DEFAULT_WSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_GSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WDS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_LHS_FEI_LARGE_TENSOR = [(2, 2**32 + 1)] +DEFAULT_RHS_FEI_LARGE_TENSOR = [(2,)] +DEFAULT_MHS_LARGE_TENSOR = [(2,)] + # For swapaxis operator DEFAULT_DIM_1 = [0] DEFAULT_DIM_2 = [1] @@ -231,21 +354,33 @@ DEFAULT_Y = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_COND = [(1024,), (10000,), (10000,)] DEFAULT_DEPTH = [0] + # For ravel_multi_index op, ndim(shape) = 2; hence data NDArray's first dim = 2 # First dimension of input of ravel operator should match shape parameter dimension # DEFAULT_SHAPE is reused for ravel_multi_index op RAVEL_DATA = [(2, 1024)] +RAVEL_DATA_LARGE_TENSOR = [(2, 2**32)] +DEFAULT_X_LARGE_TENSOR = [(2**32, 1)] + # For loss operators DEFAULT_DATA_3d = [(1024, 100, 100)] DEFAULT_LABEL = [(100,100)] DEFAULT_DATA_SMCE = [(1024, 1024)] DEFAULT_LABEL_SMCE = [(1024,)] + +DEFAULT_LABEL_LARGE_TENSOR = [(1, 1)] +DEFAULT_DATA_CTCLOSS = [(2**32, 1, 1)] +DEFAULT_DATA_SMCE_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_LABEL_SMCE_LARGE_TENSOR = [(2**32 + 1,)] + # For NN operators DEFAULT_ACT_TYPE_LR = ['leaky', 'elu', 'selu', 'gelu'] DEFAULT_ACT_TYPE_ACTIVATION = ['relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'] DEFAULT_LABEL_SOFTMAX = [(1024, 1024), (10000, 1), (10000, 100)] +DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR = [(2**32, 1)] + # For linalg operators DEFAULT_A = [(1024, 1024)] DEFAULT_B = [(1024, 1024)] @@ -253,6 +388,11 @@ DEFAULT_A_MT = [(1024, 1035)] DEFAULT_AXES = [[0, 1]] +DEFAULT_A_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_B_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_C_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_A_MT_LARGE_TENSOR = [(2**32 + 1, 1)] + # Default Inputs. MXNet Op Param Name to Default Input mapping DEFAULTS_INPUTS = {"data": DEFAULT_DATA, "dtype": DEFAULT_DTYPE, @@ -363,13 +503,10 @@ "output_size": DEFAULT_OUTPUT_SIZE, "kernel_col2im": DEFAULT_KERNEL, "stride_col2im": DEFAULT_STRIDE, - "data_rnn": DEFAULT_DATA_RNN, - "p_rnn": DEFAULT_P_RNN, "parameters": DEFAULT_PARAMETERS, "state": DEFAULT_STATE, "state_size": DEFAULT_STATE_SIZE, "num_layers": DEFAULT_NUM_LAYERS, - "mode_rnn": DEFAULT_MODE, "data_groupnorm": DEFAULT_DATA_GN, "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN, "beta_groupnorm": DEFAULT_BETA_GAMMA_GN, @@ -433,6 +570,222 @@ "data_layernorm": DEFAULT_DATA_NN_BASIC, "axis_layernorm": DEFAULT_AXIS} +# Default Inputs for Large Tensor. 
MXNet Op Param Name to Default Input mapping +DEFAULTS_INPUTS_LARGE_TENSOR = {"data": DEFAULT_DATA_LARGE_TENSOR, + "dtype": DEFAULT_DTYPE, + "dtype_int": DEFAULT_DTYPE_INT, + "dtype_float": DEFAULT_DTYPE_FLOAT, + "sample": DEFAULT_SAMPLE_LARGE_TENSOR, + "lhs": DEFAULT_LHS_LARGE_TENSOR, + "rhs": DEFAULT_RHS_LARGE_TENSOR, + "shape": DEFAULT_SHAPE_LARGE_TENSOR, + "low": DEFAULT_LOW, + "high": DEFAULT_HIGH, + "low_nd": DEFAULT_LOW_ND_LARGE_TENSOR, + "high_nd": DEFAULT_HIGH_ND_LARGE_TENSOR, + "mu_nd": DEFAULT_MU_ND_LARGE_TENSOR, + "sigma": DEFAULT_SIGMA_LARGE_TENSOR, + "alpha_nd": DEFAULT_ALPHA_ND_LARGE_TENSOR, + "beta_nd": DEFAULT_BETA_ND_LARGE_TENSOR, + "lam_nd": DEFAULT_LAM_ND_LARGE_TENSOR, + "lam_random_pdf_exponential": DEFAULT_LAM_RPE_LARGE_TENSOR, + "sample_random_pdf_exponential": DEFAULT_SAMPLE_RPE_LARGE_TENSOR, + "k": DEFAULT_K, + "p": DEFAULT_P, + "k_nd": DEFAULT_K_ND_LARGE_TENSOR, + "p_nd": DEFAULT_P_ND_LARGE_TENSOR, + "axis_shape": DEFAULT_AXIS_SHAPE, + "axis": DEFAULT_AXIS, + "weight" : DEFAULT_WEIGHT_LARGE_TENSOR, + "weight32" : DEFAULT_WEIGHT_LARGE_TENSOR, + "grad" : DEFAULT_GRAD_LARGE_TENSOR, + "mean" : DEFAULT_MEAN_LARGE_TENSOR, + "var" : DEFAULT_VAR_LARGE_TENSOR, + "mom" : DEFAULT_MOM_LARGE_TENSOR, + "r1": DEFAULT_R1_LARGE_TENSOR, + "r2": DEFAULT_R2_LARGE_TENSOR, + "n" : DEFAULT_N_LARGE_TENSOR, + "d" : DEFAULT_D_LARGE_TENSOR, + "v" : DEFAULT_V_LARGE_TENSOR, + "z" : DEFAULT_Z_LARGE_TENSOR, + "g" : DEFAULT_G_LARGE_TENSOR, + "delta" : DEFAULT_DELTA_LARGE_TENSOR, + "lr" : DEFAULT_LR, + "lrs" : DEFAULT_LRS, + "wds" : DEFAULT_LRS, + "wd": DEFAULT_LR, + "gamma1" : DEFAULT_GAMMA_1, + "gamma2" : DEFAULT_GAMMA_2, + "epsilon" : DEFAULT_EPSILON, + "beta1" : DEFAULT_BETA_1, + "beta2" : DEFAULT_BETA_2, + "t" : DEFAULT_T, + "rescale_grad" : DEFAULT_RESCALE_GRAD, + "clip_grad" : DEFAULT_CLIP_GRADIENT, + "lazy_update" : DEFAULT_LAZY_UPDATE, + "data_4d": DEFAULT_DATA_4d_LARGE_TENSOR, + "dim1": DEFAULT_DIM_1, + "dim2": DEFAULT_DIM_2, + "block_size": DEFAULT_BLOCK_SIZE_LARGE_TENSOR, + "args": DEFAULT_ARGS, + "index": DEFAULT_INDEX_LARGE_TENSOR, + "data_smce": DEFAULT_DATA_SMCE_LARGE_TENSOR, + "label_smce": DEFAULT_LABEL_SMCE_LARGE_TENSOR, + "grid": DEFAULT_GRID_LARGE_TENSOR, + "data_bilinearsampler": DEFAULT_DATA_BILINEAR_LARGE_TENSOR, + "transform_type": DEFAULT_TRANSFORM_TYPE, + "data_gridgenerator": DEFAULT_DATA_GRIDGEN_LARGE_TENSOR, + "target_shape_gridgenerator": DEFAULT_TARGET_SHAPE_LARGE_TENSOR, + "data_sample_multinomial": DEFAULT_DATA_SM_LARGE_TENSOR, + "data_random_pdf_dirichlet": DEFAULT_DATA_RPD_LARGE_TENSOR, + "alpha_random_pdf_dirichlet": DEFAULT_ALPHA_RPD_LARGE_TENSOR, + "sample_random_pdf_gamma": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "alpha_random_pdf_gamma": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "beta_random_pdf_gamma": DEFAULT_BETA_LARGE_TENSOR, + "sample_random_pdf_generalized_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "alpha_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "k_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "p_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_normal": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sigma_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_poisson": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "lam_random_pdf_poisson": 
DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_uniform": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "low_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "high_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "shape_sample_exponential": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "lam_sample_exponential": DEFAULT_LAM_SE_LARGE_TENSOR, + "mu_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "sigma_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "shape_sample_poisson": DEFAULT_LAM_SE_LARGE_TENSOR, + "lam_sample_poisson": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "shape_sample_uniform": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "low_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "high_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "alpha_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "beta_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "mu_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "alpha_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "k_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "p_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "A": DEFAULT_A_LARGE_TENSOR, + "B": DEFAULT_B_LARGE_TENSOR, + "C": DEFAULT_C_LARGE_TENSOR, + "A_linalg_maketrian": DEFAULT_A_MT_LARGE_TENSOR, + "axes": DEFAULT_AXES, + "act_type_leakyrelu": DEFAULT_ACT_TYPE_LR, + "label_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "act_type_activation": DEFAULT_ACT_TYPE_ACTIVATION, + "data_squeeze": DEFAULT_DATA_SQUEEZE_LARGE_TENSOR, + "axis_squeeze": DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR, + "a_min": DEFAULT_A_MIN, + "a_max": DEFAULT_A_MAX, + "weights_sum_sq": DEFAULT_WSS_LARGE_TENSOR, + "grads_sum_sq": DEFAULT_GSS_LARGE_TENSOR, + "wds": DEFAULT_WDS_LARGE_TENSOR, + "eta": DEFAULT_ETA, + "eps": DEFAULT_EPSILON, + "stype": DEFAULT_STYPE, + "indices": DEFAULT_INDICES, + "begin": DEFAULT_BEGIN, + "end": DEFAULT_END, + "shape_like": DEFAULT_DATA_LARGE_TENSOR, + "depth": DEFAULT_DEPTH, + "condition": DEFAULT_X_LARGE_TENSOR, + "x": DEFAULT_X_LARGE_TENSOR, + "y": DEFAULT_X_LARGE_TENSOR, + "ravel_data": RAVEL_DATA_LARGE_TENSOR, + "a": DEFAULT_A_LARGE_TENSOR, + "lhs_fill_element_0index": DEFAULT_LHS_FEI_LARGE_TENSOR, + "rhs_fill_element_0index": DEFAULT_RHS_FEI_LARGE_TENSOR, + "mhs": DEFAULT_MHS_LARGE_TENSOR, + "lrs_multi_lars": DEFAULT_WSS_LARGE_TENSOR, + "data_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "data_spatialtransformer": DEFAULT_DATA_ST_LARGE_TENSOR, + "loc_spatialtransformer": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "target_shape": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "transform_type_spatialtransformer": DEFAULT_TRANSFORM, + "sampler_type": DEFAULT_SAMPLER, + "data_col2im": DEFAULT_DATA_C2I_LARGE_TENSOR, + "output_size": DEFAULT_OUTPUT_SIZE_LARGE_TENSOR, + "kernel_col2im": DEFAULT_KERNEL_LARGE_TENSOR, + "stride_col2im": DEFAULT_STRIDE_LARGE_TENSOR, + "data_ctcloss": DEFAULT_DATA_CTCLOSS, + "label_ctcloss": DEFAULT_LABEL_LARGE_TENSOR, + "data_ctc_loss": DEFAULT_DATA_CTCLOSS, + "label_ctc_loss": DEFAULT_LABEL_LARGE_TENSOR, + "parameters": DEFAULT_PARAMETERS_LARGE_TENSOR, + "state": DEFAULT_STATE_LARGE_TENSOR, + "state_size": DEFAULT_STATE_SIZE_LARGE_TENSOR, + "num_layers": DEFAULT_NUM_LAYERS_LARGE_TENSOR, + "data_groupnorm": DEFAULT_DATA_GN_LARGE_TENSOR, + "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "beta_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "eps": DEFAULT_EPSILON, + "data_dropout": 
DEFAULT_DATA_DROPOUT_LARGE_TENSOR, + "mode_dropout": DEFAULT_MODE_DROPOUT, + "p_dropout": DEFAULT_P_DROPOUT_LARGE_TENSOR, + "axes_dropout": DEFAULT_AXES_DROPOUT_LARGE_TENSOR, + "data_nn_basic": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "num_hidden": DEFAULT_NUM_HIDDEN_LARGE_TENSOR, + "data_fullyconnected": DEFAULT_DATA_FC_LARGE_TENSOR, + "weight_fullyconnected": DEFAULT_WEIGHT_FC_LARGE_TENSOR, + "num_hidden_fullyconnected": DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR, + "weight_embedding": DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR, + "bias": DEFAULT_BIAS_LARGE_TENSOR, + "flatten": DEFAULT_FLATTEN_LARGE_TENSOR, + "data_batchnorm": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "gamma_batchnorm": DEFAULT_GAMMA_LARGE_TENSOR, + "beta_batchnorm": DEFAULT_BETA_LARGE_TENSOR, + "moving_mean_batchnorm": DEFAULT_MOVING_MEAN_LARGE_TENSOR, + "moving_var_batchnorm": DEFAULT_MOVING_VAR_LARGE_TENSOR, + "axis_batchnorm": DEFAULT_AXIS_BN, + "data_softmaxoutput": DEFAULT_DATA_SO_LARGE_TENSOR, + "label_softmaxoutput": DEFAULT_LABEL_SO_LARGE_TENSOR, + "data_maeregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_maeregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_logisticregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_logisticregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_linearregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_linearregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_svmoutput": DEFAULT_DATA_SVM_LARGE_TENSOR, + "label_svmoutput": DEFAULT_LABEL_SVM_LARGE_TENSOR, + "grad_scale": DEFAULT_GRAD_SCALE, + "normalization": DEFAULT_NORMALIZATION, + "margin": DEFAULT_MARGIN, + "regularization_coefficient": DEFAULT_REG_COEFF, + "data_l2normalization": DEFAULT_DATA_NORM_LARGE_TENSOR, + "mode_l2normalization": DEFAULT_MODE_L2, + "gamma_layernorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_layernorm": DEFAULT_BETA_NORM_LARGE_TENSOR, + "data_instancenorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "gamma_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "input_dim": DEFAULT_INPUT_DIM_LARGE_TENSOR, + "output_dim": DEFAULT_OUTPUT_DIM_LARGE_TENSOR, + "sparse_grad": DEFAULT_SPARSE_GRAD, + "data1": DEFAULT_DATA1_LARGE_TENSOR, + "data2": DEFAULT_DATA2_LARGE_TENSOR, + "kernel_size": DEFAULT_KERNEL_SIZE_LARGE_TENSOR, + "max_displacement": DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR, + "stride1": DEFAULT_STRIDE_1_LARGE_TENSOR, + "stride2": DEFAULT_STRIDE_2_LARGE_TENSOR, + "data_im2col": DEFAULT_DATA_I2C_LARGE_TENSOR, + "kernel_im2col": DEFAULT_KERNEL_I2C_LARGE_TENSOR, + "stride_im2col": DEFAULT_STRIDE_I2C_LARGE_TENSOR, + "dilate_im2col": DEFAULT_DILATE_LARGE_TENSOR, + "pad_im2col": DEFAULT_PAD_LARGE_TENSOR, + "data_lrn": DEFAULT_DATA_LRN_LARGE_TENSOR, + "alpha_lrn": DEFAULT_ALPHA, + "beta_lrn": DEFAULT_BETA_LRN, + "nsize": DEFAULT_NSIZE, + "data_layernorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "axis_layernorm": DEFAULT_AXIS_LARGE_TENSOR} # These are names of MXNet operator parameters that is of type NDArray. 
# We maintain this list to automatically recognize these parameters are to be @@ -446,4 +799,6 @@ "v", "z", "g", "delta", "args", "indices", "shape_like", "y", "x", "condition", "a", "index", "raveL_data", "label", "grid", "A", "B", "C", "r1", "r2", "rois", "lrs", "wds", "weights_sum_sq", - "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state"] + "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state", + "state_cell"] + diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py index f6cdfe004215..f2cce0abec09 100644 --- a/benchmark/opperf/utils/benchmark_utils.py +++ b/benchmark/opperf/utils/benchmark_utils.py @@ -181,7 +181,7 @@ def run_performance_test(ops, inputs, run_backward=True, return op_benchmark_result -def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): +def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs): # Running SoftmaxOutput backwards on GPU results in errors # track issue here: https://github.com/apache/incubator-mxnet/issues/880 gpu_backwards_disabled_ops = ['SoftmaxOutput'] @@ -195,7 +195,7 @@ def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): for op, op_params in ops.items(): if ctx == mx.cpu() or op not in gpu_disabled_ops: # Prepare inputs for the operator - inputs = prepare_op_inputs(op, op_params) + inputs = prepare_op_inputs(op, op_params, int64_tensor) # setting backward false for ops with known issue if (ctx == mx.gpu() and op in gpu_backwards_disabled_ops) or op in no_backward: diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py index 99678b8d31a9..b27b8e4e73b5 100644 --- a/benchmark/opperf/utils/op_registry_utils.py +++ b/benchmark/opperf/utils/op_registry_utils.py @@ -20,7 +20,7 @@ from mxnet import runtime import mxnet as mx -from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, MX_OP_MODULE +from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, DEFAULTS_INPUTS_LARGE_TENSOR, MX_OP_MODULE def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forward_backward=True): @@ -109,7 +109,7 @@ def prepare_op_inputs(arg_params, arg_values): return inputs -def prepare_op_inputs(op, arg_params): +def prepare_op_inputs(op, arg_params, int64_tensor): inputs = [] # 4d tensor is needed only by following two ops @@ -120,14 +120,27 @@ def prepare_op_inputs(op, arg_params): # For ops with args that need to change shape/value for different ops custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian', - 'SpatialTransformer', 'col2im', 'RNN', 'GroupNorm', 'Dropout', 'FullyConnected', + 'SpatialTransformer', 'col2im', 'GroupNorm', 'Dropout', 'FullyConnected', 'SoftmaxOutput', 'LinearRegressionOutput', 'BatchNorm', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'im2col', 'LRN', 'squeeze', 'fill_element_0index'} + custom_data_int64 = {'random_pdf_dirichlet', 'random_pdf_exponential', 'random_pdf_gamma', + 'random_pdf_generalized_negative_binomial', 'random_pdf_negative_binomial', + 'random_pdf_normal', 'random_pdf_poisson', 'random_pdf_uniform', 'sample_exponential', + 'sample_normal', 'sample_poisson', 'sample_uniform', 'sample_gamma', + 'sample_generalized_negative_binomial', 'sample_negative_binomial', 'CTCLoss', + 'ctc_loss', 'multi_lars'} + int_only = {'random_randint'} float_only = {'log_softmax', 'softmax', 
'softmin'} + if int64_tensor == 'on': + default_inputs = DEFAULTS_INPUTS_LARGE_TENSOR + custom_data |= custom_data_int64 + else: + default_inputs = DEFAULTS_INPUTS + # Prepare op to default input mapping arg_values = {} for arg_name, arg_type in zip(arg_params["params"]["arg_names"], @@ -137,29 +150,29 @@ def prepare_op_inputs(op, arg_params): # same for randint (which is the only op that takes only int as input) # rest all operators take int as well as float if op in int_only and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_int"] + arg_values[arg_name] = default_inputs["dtype_int"] elif (op.startswith(('random','sample')) or op in float_only) and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_float"] + arg_values[arg_name] = default_inputs["dtype_float"] elif "NDArray" in arg_type and op == "ravel_multi_index": - arg_values[arg_name] = DEFAULTS_INPUTS["ravel_data"] - elif op in custom_data and arg_name + "_" + op.lower() in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_" + op.lower()] - elif "NDArray" in arg_type and arg_name + "_nd" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_nd"] - elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_4d"] - elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_3d"] + arg_values[arg_name] = default_inputs["ravel_data"] + elif op in custom_data and arg_name + "_" + op.lower() in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_" + op.lower()] + elif "NDArray" in arg_type and arg_name + "_nd" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_nd"] + elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_4d"] + elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_3d"] elif "NDArray" in arg_type and op == 'softmax_cross_entropy': - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_smce"] - elif arg_name in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name] - elif "float" in arg_type and arg_name + "_float" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_float"] - elif "Shape" in arg_type and arg_name + "_shape" in DEFAULTS_INPUTS: + arg_values[arg_name] = default_inputs[arg_name + "_smce"] + elif arg_name in default_inputs: + arg_values[arg_name] = default_inputs[arg_name] + elif "float" in arg_type and arg_name + "_float" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_float"] + elif "Shape" in arg_type and arg_name + "_shape" in default_inputs: # This is for cases where in some ops 'axis' is Int in some ops a shape tuple. # Ex: axis in sum is shape, axis in sort is int. 
- arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_shape"] + arg_values[arg_name] = default_inputs[arg_name + "_shape"] # Number of different inputs we want to use to test # the operator @@ -340,7 +353,7 @@ def get_all_nn_basic_operators(): nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', 'SoftmaxOutput', 'LinearRegressionOutput', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'SpatialTransformer', 'im2col', - 'col2im', 'GroupNorm', 'RNN', 'LRN'] + 'col2im', 'GroupNorm', 'LRN'] # Get all mxnet operators mx_operators = _get_all_mxnet_operators()
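
Reviewer note (not part of the patch): a minimal way to exercise the new flag end to end is sketched below. The paths and the reduced warmup/run counts are illustrative only, and the int64 path assumes an MXNet build with large-tensor support enabled.

    # From the MXNet source root -- full suite with int64 tensor inputs:
    #   python ./benchmark/opperf/opperf.py --int64-tensor on --warmup 1 --runs 5
    #
    # Or drive a single category directly with the updated signature:
    import mxnet as mx
    from benchmark.opperf.nd_operations.unary_operators import run_mx_unary_operators_benchmarks

    # Default behaviour is unchanged: int64_tensor='off' keeps the standard inputs
    std_results = run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                    profiler='native', int64_tensor='off')

    # Large tensor inputs (dimensions >= 2**32); lower warmup/runs keep the pass short
    lt_results = run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                   profiler='native', int64_tensor='on',
                                                   warmup=1, runs=5)
    print(lt_results)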