From 8a2e6aafb85ebee26c341a689d5ce8cfb6398479 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Sat, 29 Feb 2020 00:43:08 -0800
Subject: [PATCH] [Large Tensor] Implemented LT flag for OpPerf testing (#17449)

* Passing large_tensor parameter down
* Adding large tensor testing functionality for convolutional operators
* Added large tensor test functionality for conv ops
* Fixing sizing for conv ops
* Added gemm large tensor, print on conv
* Updated input for gemm ops and print statements
* Fixed deconv large tensor test
* Added bias for deconv
* Added test functionality for nn_activation and nn_basic ops
* Fixed deconv bias, implemented large tensor test logic for general ops, added default data for large tensor test
* Dropped unnecessary print statements
* Fixed lint errors
* Added large_tensor parameter to existing function descriptions, added descriptions for functions missing descriptions
* Adding docs, changed large_tensor to int64_tensor for clarity
* Added warmup/runs to gemm ops, debugging process failure
* Resolved merge conflicts, added default params and input switching functionality
* Dynamic input handling for default inputs, additional custom data for int64
* Fixed RPD issue
* Everything through reduction ops working
* Random sampling & loss ops working
* Added indices, depth, ravel_data in default_params
* Added indexing ops - waiting for merge on ravel
* Added optimizer ops
* All misc ops working
* All NN Basic ops working
* Fixed LT input for ROIPooling
* Refactored NN Conv tests
* Added test for inline optimizer ops
* Dropping extra tests to decrease execution time
* Switching to inline tests for RNN to support additional modes
* Added state_cell as NDArray param, removed linalg testing for int64 tensor
* Cleaned up styling
* Fixed conv and deconv tests
* Retrigger CI for continuous build
* Cleaned up GEMM op inputs
* Dropped unused param from default_params
---
 .../opperf/nd_operations/array_rearrange.py | 8 +-
 .../opperf/nd_operations/binary_operators.py | 26 +-
 .../opperf/nd_operations/gemm_operators.py | 84 ++--
 .../opperf/nd_operations/indexing_routines.py | 8 +-
 .../opperf/nd_operations/linalg_operators.py | 8 +-
 .../opperf/nd_operations/misc_operators.py | 73 ++--
 .../nd_operations/nn_activation_operators.py | 10 +-
 .../nd_operations/nn_basic_operators.py | 78 +++-
 .../opperf/nd_operations/nn_conv_operators.py | 287 ++++++++++----
 .../opperf/nd_operations/nn_loss_operators.py | 8 +-
 .../nd_operations/nn_optimizer_operators.py | 66 ++--
.../random_sampling_operators.py | 8 +- .../nd_operations/reduction_operators.py | 8 +- .../sorting_searching_operators.py | 8 +- .../opperf/nd_operations/unary_operators.py | 26 +- benchmark/opperf/opperf.py | 56 +-- benchmark/opperf/rules/default_params.py | 371 +++++++++++++++++- benchmark/opperf/utils/benchmark_utils.py | 4 +- benchmark/opperf/utils/op_registry_utils.py | 57 +-- 19 files changed, 941 insertions(+), 253 deletions(-) diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py index 12af8345543e..631d0bb997bc 100644 --- a/benchmark/opperf/nd_operations/array_rearrange.py +++ b/benchmark/opperf/nd_operations/array_rearrange.py @@ -29,8 +29,8 @@ """ -def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the rearrange operators in MXNet. Parameters @@ -41,6 +41,8 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -55,5 +57,5 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' mx_rearrange_ops = get_all_rearrange_operators() # Run benchmarks - mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, warmup, runs) + mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_rearrange_op_results diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py index 5d95360a73db..4444219e6054 100644 --- a/benchmark/opperf/nd_operations/binary_operators.py +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -38,8 +38,8 @@ get_all_elemen_wise_binary_operators, get_all_misc_binary_operators -def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous +def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the miscellaneous binary operators in MXNet. 
Parameters @@ -48,6 +48,10 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi Context to run benchmarks dtype: str, default 'float32' Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,12 +65,12 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi # Fetch all Miscellaneous Binary Operators mx_binary_misc_ops = get_all_misc_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary broadcast operators in MXNet. Parameters @@ -77,6 +81,8 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -90,12 +96,12 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', # Fetch all Binary Broadcast Operators mx_binary_broadcast_ops = get_all_broadcast_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary element_wise operators in MXNet. 
Parameters @@ -106,6 +112,8 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 10 Number of times to run for warmup runs: int, default 50 @@ -119,5 +127,5 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 # Fetch all Binary Element_wise Operators mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py index f1028123b421..55b3435a8f24 100644 --- a/benchmark/opperf/nd_operations/gemm_operators.py +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -35,8 +35,8 @@ """ -def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the GEMM +def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the GEMM operators (dot, batch_dot, khatri_rao) in MXNet. Parameters @@ -47,6 +47,8 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -57,43 +59,75 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" - # Benchmark tests for dot operator + standard_inputs_dot = [{"lhs": (1024, 1024), + "rhs": (1024, 1024)}, + {"lhs": (1000, 10), + "rhs": (1000, 10), + "transpose_b": True}, + {"lhs": (1000, 1), + "rhs": (100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_dot = [{"lhs": (2**16, 2**16), + "rhs": (2**16, 2**16)}, + {"lhs": (4, 2**30), + "rhs": (4, 2**30), + "transpose_b": True}, + {"lhs": (2**28, 16), + "rhs": (16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_batch_dot = [{"lhs": (32, 1024, 1024), + "rhs": (32, 1024, 1024)}, + {"lhs": (32, 1000, 10), + "rhs": (32, 1000, 10), + "transpose_b": True}, + {"lhs": (32, 1000, 1), + "rhs": (32, 100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_batch_dot = [{"lhs": (1, 2**16, 2**16), + "rhs": (1, 2**16, 2**16)}, + {"lhs": (1, 4, 2**30), + "rhs": (1, 4, 2**30), + "transpose_b": True}, + {"lhs": (1, 2**28, 16), + "rhs": (1, 16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_khatri_rao = [{"args": [(32, 32), (32, 32)]}, + {"args": [(64, 64), (64, 64)]}] + int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}] + + if int64_tensor == 'on': + inputs_dot = int64_tensor_inputs_dot + inputs_batch_dot = int64_tensor_inputs_batch_dot + inputs_khatri_rao = int64_tensor_inputs_khatri_rao + else: + inputs_dot = standard_inputs_dot + inputs_batch_dot = standard_inputs_batch_dot + inputs_khatri_rao = standard_inputs_khatri_rao + + # Benchmark tests for dot and batch_dot operators dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (1024, 1024), - "rhs": (1024, 1024)}, - {"lhs": (1000, 10), - "rhs": (1000, 10), - "transpose_b": True}, - {"lhs": (1000, 1), - "rhs": (100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_dot, warmup=warmup, runs=runs, profiler=profiler) - # Benchmark tests for batch_dot operator + batch_dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (32, 1024, 1024), - "rhs": (32, 1024, 1024)}, - {"lhs": (32, 1000, 10), - "rhs": (32, 1000, 10), - "transpose_b": True}, - {"lhs": (32, 1000, 1), - "rhs": (32, 100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_batch_dot, warmup=warmup, runs=runs, profiler=profiler) - # Operator khatri_rao is not yet implemented for GPU + # Operator khatri_rao is not yet implemented for GPU khatri_rao_benchmark_res = [] if ctx != mx.gpu(): # Benchmark tests for khatri_rao operator khatri_rao_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False, dtype=dtype, ctx=ctx, - inputs=[{"args": [(32, 32), (32, 32)]}, - {"args": [(64, 64), (64, 64)]}], + inputs=inputs_khatri_rao, warmup=warmup, runs=runs, profiler=profiler) # Prepare combined results for GEMM operators diff --git a/benchmark/opperf/nd_operations/indexing_routines.py b/benchmark/opperf/nd_operations/indexing_routines.py index a957785940a5..ee99de2b57bf 100644 --- a/benchmark/opperf/nd_operations/indexing_routines.py +++ b/benchmark/opperf/nd_operations/indexing_routines.py @@ -35,8 +35,8 @@ """ -def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the indexing routines +def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', 
profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the indexing routines in MXNet. Parameters @@ -47,6 +47,8 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,5 +63,5 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na mx_indexing_ops = get_all_indexing_routines() # Run benchmarks - mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, warmup, runs) + mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_indexing_op_results diff --git a/benchmark/opperf/nd_operations/linalg_operators.py b/benchmark/opperf/nd_operations/linalg_operators.py index d2c1cee0a307..1d35ef1fc951 100644 --- a/benchmark/opperf/nd_operations/linalg_operators.py +++ b/benchmark/opperf/nd_operations/linalg_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the linear algebra +def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the linear algebra operators in MXNet. Parameters @@ -46,6 +46,8 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -74,5 +76,5 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat # Fetch all Linear Algebra Operators mx_linalg_ops = get_all_linalg_operators() # Run benchmarks - mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, warmup, runs) + mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(linalg_potrf_benchmark + [mx_linalg_op_results]) diff --git a/benchmark/opperf/nd_operations/misc_operators.py b/benchmark/opperf/nd_operations/misc_operators.py index 5a0efc57de0d..fb8535a959a0 100644 --- a/benchmark/opperf/nd_operations/misc_operators.py +++ b/benchmark/opperf/nd_operations/misc_operators.py @@ -37,7 +37,7 @@ from benchmark.opperf.custom_operations.custom_operations import CustomAddOneProp -def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous operators in MXNet. 
@@ -49,6 +49,8 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,6 +61,48 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. """ + + standard_inputs_array_ops = [{"args": [(1024, 1024)], + "num_arrays": 1}, + {"args": [(10000, 1)], + "num_arrays": 1}, + {"args": [(10000, 10)], + "num_arrays": 1}] + int64_tensor_inputs_array_ops = [{"args": [(2**32, 1)], + "num_arrays":1}] + standard_inputs_add_n = [{"args": [(1024, 1024)]}, + {"args": [(10000, 1)]}, + {"args": [(10000, 10)]}] + int64_tensor_inputs_add_n = [{"args": [(2**16, 2**16)]}] + standard_inputs_upsampling = [{"args": (32, 3, 256, 256), + "scale": 2, + "sample_type": "nearest"}, + {"args": (32, 3, 10000, 1), + "scale": 4, + "sample_type": "nearest"}] + int64_tensor_inputs_upsampling = [{"args": (2**32 + 1, 1, 1, 1), + "scale": 2, + "sample_type": "nearest"}] + standard_inputs_custom = [{"args": [(1024, 1024)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 1)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 10)], + "op_type": "CustomAddOne"}] + int64_tensor_inputs_custom = [{"args": [(2**32 + 1, 1)], + "op_type": "CustomAddOne"}] + + if int64_tensor == 'on': + inputs_array_ops = int64_tensor_inputs_array_ops + inputs_add_n = int64_tensor_inputs_add_n + inputs_upsampling = int64_tensor_inputs_upsampling + inputs_custom = int64_tensor_inputs_custom + else: + inputs_array_ops = standard_inputs_array_ops + inputs_add_n = standard_inputs_add_n + inputs_upsampling = standard_inputs_upsampling + inputs_custom = standard_inputs_custom + # Individual tests for ops with positional args array_ops_benchmark = run_performance_test([getattr(MX_OP_MODULE, "reset_arrays"), getattr(MX_OP_MODULE, "multi_all_finite"), @@ -67,12 +111,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_arrays": 1}, - {"args": [(10000, 1)], - "num_arrays": 1}, - {"args": [(10000, 10)], - "num_arrays": 1}], + inputs=inputs_array_ops, warmup=warmup, runs=runs) add_n_benchmark = run_performance_test([getattr(MX_OP_MODULE, "add_n")], @@ -80,9 +119,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)]}, - {"args": [(10000, 1)]}, - {"args": [(10000, 10)]}], + inputs=inputs_add_n, warmup=warmup, runs=runs) # There are currently issus with UpSampling with bilinear interpolation. 
@@ -92,12 +129,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": (32, 3, 256, 256), - "scale": 2, - "sample_type": "nearest"}, - {"args": (32, 3, 10000, 1), - "scale": 4, - "sample_type": "nearest"}], + inputs=inputs_upsampling, warmup=warmup, runs=runs) # Create and register CustomAddOne operator for use in Custom op testing @@ -108,17 +140,12 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 1)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 10)], - "op_type": "CustomAddOne"}], + inputs=inputs_custom, warmup=warmup, runs=runs) # Fetch remaining Miscellaneous Operators mx_misc_ops = get_remaining_miscellaneous_operators() # Run benchmarks - mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(array_ops_benchmark + add_n_benchmark + upsampling_benchmark + custom_benchmark + [mx_misc_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py index b77777cc04dd..161dfe72123e 100644 --- a/benchmark/opperf/nd_operations/nn_activation_operators.py +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -43,9 +43,9 @@ """ -def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the activation - operators in MXNet. +def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the activation + operators (relu, sigmoid, softmax) in MXNet. 
Parameters ---------- @@ -55,6 +55,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= Precision to use for benchmarks profiler: str, default 'native' Module to use for tracking benchmark excecution time + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -70,6 +72,6 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= mx_activation_ops = get_all_nn_activation_operators() # Run benchmarks - mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, warmup, runs) + mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_activation_op_results \ No newline at end of file diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py index a8273d4105dc..f3007bac188c 100644 --- a/benchmark/opperf/nd_operations/nn_basic_operators.py +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -20,6 +20,10 @@ from benchmark.opperf.utils.op_registry_utils import get_all_nn_basic_operators from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list +from benchmark.opperf.rules.default_params import MX_OP_MODULE + """Performance benchmark tests for MXNet NDArray basic NN Operators. 1. FullyConnected @@ -45,8 +49,8 @@ """ -def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the NN basic +def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the basic neural network operators in MXNet. 
Parameters @@ -56,7 +60,9 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n dtype: str, default 'float32' Precision to use for benchmarks profiler: str, default 'native' - Module to use for tracking benchmark excecution time + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -68,9 +74,71 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n """ + standard_data_list = [(1024, 4, 4)] + int64_tensor_data_list = [(2**28, 4, 4)] + + if int64_tensor == 'on': + data_list = int64_tensor_data_list + else: + data_list = standard_data_list + + for data in data_list: + rnn_relu_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_relu", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_tanh_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_tanh", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_lstm_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (28,), + "state": (1, 4, 1), + "state_cell": (1, 4, 1), + "mode": "lstm", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_gru_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (21,), + "state": (1, 4, 1), + "mode": "gru", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) # Fetch all NN Basic Operators mx_nn_basic_ops = get_all_nn_basic_operators() # Run benchmarks - mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, warmup, runs) - return mx_nn_basic_op_results + mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) + return merge_map_list(rnn_relu_benchmark + rnn_tanh_benchmark + rnn_lstm_benchmark + rnn_gru_benchmark + [mx_nn_basic_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py index 9c80f00c354b..d44b89117511 100644 --- a/benchmark/opperf/nd_operations/nn_conv_operators.py +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -52,16 +52,55 @@ """ -def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the pooling + operators in MXNet. 
+ + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ pool_types = ['avg', 'max', 'sum'] global_pool_types = [0, 1] + standard_data_list_pool1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_pool1d = [(1, 1, 2**32)] + standard_data_list_pool2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_pool2d = [(2**28, 1, 4, 4)] + standard_data_list_roipool = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_roipool = [(32, 3, 2**13, 2**13)] + + if int64_tensor == 'on': + data_list_pool1d = int64_tensor_data_list_pool1d + data_list_pool2d = int64_tensor_data_list_pool2d + data_list_roipool = int64_tensor_data_list_roipool + else: + data_list_pool1d = standard_data_list_pool1d + data_list_pool2d = standard_data_list_pool2d + data_list_roipool = standard_data_list_roipool + # Run 1D and 2D Pooling performance runs pool1d_benchmark_res = [] pool2d_benchmark_res = [] for pool_type in pool_types: for global_pool in global_pool_types: - for pool1d_data in [(32, 3, 256), (32, 3, 64)]: + for pool1d_data in data_list_pool1d: pool1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -73,10 +112,10 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": 1, "pad": 1} - ], + ], warmup=warmup, runs=runs) - for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for pool2d_data in data_list_pool2d: pool2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -88,68 +127,118 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0)} - ], + ], warmup=warmup, runs=runs) - # Run ROI Pooling performance runs - roipool_benchmark_res = [] - for roipool_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: - roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": roipool_data, - "rois": (32, 5), - "pooled_size": (2, 2), - "spatial_scale": .5} - ], - warmup=warmup, - runs=runs) + # Run ROI Pooling performance runs + roipool_benchmark_res = [] + for roipool_data in data_list_roipool: + roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": roipool_data, + "rois": (32, 5), + "pooled_size": (2, 2), + "spatial_scale": .5} + ], + warmup=warmup, + runs=runs) # Prepare combined results mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res) return mx_pooling_op_results -def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - # Conv1D Benchmarks +def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the 
given context, precision (dtype), and input data size (int64_tensor) for all the convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d = [(2**30, 1, 4)] + standard_weight_conv1d = (1, 3, 3) + int64_tensor_weight_conv1d = (1, 1, 1) + standard_kernel_conv1d = (3,) + int64_tensor_kernel_conv1d = (1,) + standard_data_list_conv2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d = [(2**28, 1, 4, 4)] + standard_weight_conv2d = (1, 3, 3, 3) + int64_tensor_weight_conv2d = (1, 1, 1, 1) + standard_kernel_conv2d = (3, 3) + int64_tensor_kernel_conv2d = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d = int64_tensor_data_list_conv1d + weight_conv1d = int64_tensor_weight_conv1d + kernel_conv1d = int64_tensor_kernel_conv1d + data_list_conv2d = int64_tensor_data_list_conv2d + weight_conv2d = int64_tensor_weight_conv2d + kernel_conv2d = int64_tensor_kernel_conv2d + else: + data_list_conv1d = standard_data_list_conv1d + weight_conv1d = standard_weight_conv1d + kernel_conv1d = standard_kernel_conv1d + data_list_conv2d = standard_data_list_conv2d + weight_conv2d = standard_weight_conv2d + kernel_conv2d = standard_kernel_conv2d + conv1d_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + conv2d_benchmark_res = [] + # Conv1D Benchmarks + for conv_data in data_list_conv1d: conv1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3), - "bias": (64,), - "kernel": (3,), + "weight": weight_conv1d, + "bias": (1,), + "kernel": kernel_conv1d, "stride": (1,), "dilate": (1,), "pad": (0,), - "num_filter": 64, - "layout": 'NCW'} - ], + "num_filter": 1, + "layout": 'NCW'}], warmup=warmup, runs=runs) # Conv2D Benchmarks - conv2d_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d: conv2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3, 3), - "bias": (64,), - "kernel": (3, 3), + "weight": weight_conv2d, + "bias": (1,), + "kernel": kernel_conv2d, "stride": (1, 1), "dilate": (1, 1), "pad": (0, 0), - "num_filter": 64, - "layout": 'NCHW'} - ], + "num_filter": 1, + "layout": 'NCHW'}], warmup=warmup, runs=runs) # Prepare combined results @@ -157,50 +246,98 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler return mx_conv_op_results -def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', dtype='float32', warmup=10, runs=50): +def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', int64_tensor='off', dtype='float32', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the 
transpose convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d_transpose = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d_transpose = [(2**30, 1, 4)] + standard_weight_conv1d_transpose = (3, 1, 3) + int64_tensor_weight_conv1d_transpose = (1, 1, 1) + standard_kernel_conv1d_transpose = (3,) + int64_tensor_kernel_conv1d_transpose = (1,) + standard_data_list_conv2d_transpose = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d_transpose = [(2**28, 1, 4, 4)] + standard_weight_conv2d_transpose = (3, 1, 3, 3) + int64_tensor_weight_conv2d_transpose = (1, 1, 1, 1) + standard_kernel_conv2d_transpose = (3, 3) + int64_tensor_kernel_conv2d_transpose = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d_transpose = int64_tensor_data_list_conv1d_transpose + weight_conv1d_transpose = int64_tensor_weight_conv1d_transpose + kernel_conv1d_transpose = int64_tensor_kernel_conv1d_transpose + data_list_conv2d_transpose = int64_tensor_data_list_conv2d_transpose + weight_conv2d_transpose = int64_tensor_weight_conv2d_transpose + kernel_conv2d_transpose = int64_tensor_kernel_conv2d_transpose + else: + data_list_conv1d_transpose = standard_data_list_conv1d_transpose + weight_conv1d_transpose = standard_weight_conv1d_transpose + kernel_conv1d_transpose = standard_kernel_conv1d_transpose + data_list_conv2d_transpose = standard_data_list_conv2d_transpose + weight_conv2d_transpose = standard_weight_conv2d_transpose + kernel_conv2d_transpose = standard_kernel_conv2d_transpose + # Conv1DTranspose Benchmarks conv1d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + for conv_data in data_list_conv1d_transpose: conv1d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3), - "bias": (64,), - "kernel": (3,), - "stride": (1,), - "dilate": (1,), - "pad": (0,), - "adj": (0,), - "num_filter": 64, - "no_bias": False, - "layout": 'NCW'} - ], - warmup=warmup, - runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv1d_transpose, + "bias": (1,), + "kernel": kernel_conv1d_transpose, + "stride": (1,), + "dilate": (1,), + "pad": (0,), + "num_filter": 1, + "no_bias": False, + "layout": 'NCW'}], + warmup=warmup, + runs=runs) # Conv2DTranspose Benchmarks conv2d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d_transpose: conv2d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3, 3), - "bias": (64,), - "kernel": (3, 3), - "stride": (1, 1), - "dilate": (1, 1), - "pad": (0, 0), - "num_filter": 64, - "no_bias": False, - "layout": 'NCHW'} - ], - warmup=warmup, - 
runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv2d_transpose, + "bias": (1,), + "kernel": kernel_conv2d_transpose, + "stride": (1, 1), + "pad": (0, 0), + "num_filter": 1, + "no_bias": False, + "layout": 'NCHW'}], + warmup=warmup, + runs=runs) # Prepare combined results mx_transpose_conv_op_results = merge_map_list(conv1d_transpose_benchmark_res + conv2d_transpose_benchmark_res) return mx_transpose_conv_op_results diff --git a/benchmark/opperf/nd_operations/nn_loss_operators.py b/benchmark/opperf/nd_operations/nn_loss_operators.py index 9d894087343b..dea19f14f1af 100644 --- a/benchmark/opperf/nd_operations/nn_loss_operators.py +++ b/benchmark/opperf/nd_operations/nn_loss_operators.py @@ -28,8 +28,8 @@ """ -def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the Neural Network loss operators in MXNet. Parameters @@ -40,6 +40,8 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ mx_loss_ops = get_all_loss_operators() # Run benchmarks - mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, warmup, runs) + mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_loss_op_results diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py index ac380655d136..db18b30081d4 100644 --- a/benchmark/opperf/nd_operations/nn_optimizer_operators.py +++ b/benchmark/opperf/nd_operations/nn_optimizer_operators.py @@ -54,8 +54,8 @@ """ -def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the neural network +def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the neural network optimizer update operators in MXNet. Parameters @@ -66,6 +66,8 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -76,60 +78,68 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + standard_shape = (5, 5) + int64_tensor_shape = (2**16, 2**16) + + if int64_tensor == 'on': + arg_shape = int64_tensor_shape + else: + arg_shape = standard_shape + # Run independent tests for ops that need specific input data multi_mp_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), - "args3": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}],run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), + "args3": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}],run_backward=False) multi_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_mp_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": 
nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), - "args2": nd.random_normal(shape=(5,5)), "args3": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), + "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) # Fetch remaining optimizer operators mx_optimizer_ops = get_all_optimizer_operators() # Run benchmarks - mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs) + mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(multi_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res + preloaded_multi_mp_sgd_res +\ preloaded_multi_sgd_mom_res + preloaded_multi_mp_sgd_res + preloaded_multi_mp_sgd_mom_res +\ - [mx_optimizer_op_results]) + multi_mp_sgd_mom_res + preloaded_multi_sgd_res + [mx_optimizer_op_results]) diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py index b6a1f44dba25..777f26af317c 100644 --- a/benchmark/opperf/nd_operations/random_sampling_operators.py +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators -def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the random sampling +def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the random sampling operators in MXNet. 
Parameters @@ -46,6 +46,8 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,5 +61,5 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p # Fetch all Random Sampling Operators mx_random_sample_ops = get_all_random_sampling_operators() # Run benchmarks - mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, warmup, runs) + mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_random_sample_op_results diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py index 6cc0d49c899b..d6e4b6dd6c2d 100644 --- a/benchmark/opperf/nd_operations/reduction_operators.py +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -31,8 +31,8 @@ from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks -def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the reduction +def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the reduction operators in MXNet. Parameters @@ -43,6 +43,8 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -56,5 +58,5 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile # Fetch all Reduction Operators mx_reduction_broadcast_ops = get_all_reduction_operators() # Run benchmarks - mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_reduction_op_results diff --git a/benchmark/opperf/nd_operations/sorting_searching_operators.py b/benchmark/opperf/nd_operations/sorting_searching_operators.py index 2d936cdc48ca..d0d9fc064888 100644 --- a/benchmark/opperf/nd_operations/sorting_searching_operators.py +++ b/benchmark/opperf/nd_operations/sorting_searching_operators.py @@ -29,8 +29,8 @@ """ -def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the sorting and searching +def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the sorting and searching operators in MXNet. 
Parameters @@ -41,6 +41,8 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr # Fetch all Random Sampling Operators mx_sort_search_ops = get_all_sorting_searching_operators() # Run benchmarks - mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, warmup, runs) + mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_sort_search_op_results diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py index 08075906fae5..53cab57cfe15 100644 --- a/benchmark/opperf/nd_operations/unary_operators.py +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -38,8 +38,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the unary +def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the unary operators in MXNet. Parameters @@ -50,6 +50,8 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -60,16 +62,26 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + + standard_inputs = [{"args": [(1024, 1024)], + "num_outputs":1}, + {"args": [(10000, 1)], + "num_outputs":1}] + int64_tensor_inputs = [{"args": [(2**32, 1)], + "num_outputs":1}] + + if int64_tensor == 'on': + inputs = int64_tensor_inputs + else: + inputs = standard_inputs + # Run amp_multicast as it needs data as positional argument amp_multicast_benchmark = run_performance_test([getattr(MX_OP_MODULE, "amp_multicast")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_outputs":1}, - {"args": [(10000, 1)], - "num_outputs":1}], + inputs=inputs, warmup=warmup, runs=runs) @@ -77,5 +89,5 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n mx_unary_broadcast_ops = get_all_unary_operators() # Run benchmarks - mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results]) diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py index 5b8c43f417da..c0ac7b7dcd98 100755 --- a/benchmark/opperf/opperf.py +++ b/benchmark/opperf/opperf.py @@ -51,7 +51,7 @@ get_current_runtime_features -def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Run all the MXNet operators (NDArray) benchmarks. Returns @@ -63,64 +63,66 @@ def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n # *************************MXNET TENSOR OPERATOR BENCHMARKS***************************** # Run all Unary operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_misc_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all GEMM operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Random sampling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + 
mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Reduction operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Sorting and Searching operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Array Rearrange operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Indexing routines benchmarks with default input values - mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ************************ MXNET NN OPERATOR BENCHMARKS **************************** # Run all basic NN operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Activation operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Pooling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Optimizer operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, 
profiler=profiler, warmup=warmup, runs=runs)) - + mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) + # Run all Transpose Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all NN loss operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Miscellaneous operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) - # Run all Linear Algebra operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + # Linear Algebra operators do not work with int64 tensor data. Issue tracked here: https://github.com/apache/incubator-mxnet/issues/17716 + if int64_tensor == 'off': + # Run all Linear Algebra operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ****************************** PREPARE FINAL RESULTS ******************************** final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results) @@ -162,6 +164,11 @@ def main(): help='Use built-in CPP profiler (native) or Python' 'time module.' 'Valid Inputs - native, python') + + parser.add_argument('--int64-tensor', type=str, default='off', + help='Run performance tests with large tensor input' + 'data (dimension >= 2**32) or standard input data.' + 'Valid Inputs - on, off') parser.add_argument('-w', '--warmup', type=int, default=25, help='Number of times to run for warmup.' @@ -169,7 +176,7 @@ def main(): parser.add_argument('-r', '--runs', type=int, default=100, help='Number of runs to capture benchmark results.' 
- 'Valid Inputs - positive integers') + 'Valid Inputs - positive integers') args = parser.parse_args() logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args)) @@ -180,9 +187,10 @@ def main(): ctx = _parse_mxnet_context(args.ctx) dtype = args.dtype profiler = args.profiler + int64_tensor = args.int64_tensor warmup = args.warmup runs = args.runs - benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs) + benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs) # Sort benchmark results alphabetically by op name final_benchmark_results = dict() diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py index 15bcd72b0553..a4362fa63e11 100644 --- a/benchmark/opperf/rules/default_params.py +++ b/benchmark/opperf/rules/default_params.py @@ -35,15 +35,22 @@ DEFAULT_DTYPE_INT = ['int32', 'int64', 'int32'] # randint works for int* types only DEFAULT_DTYPE_FLOAT = ['float16', 'float32', 'float64'] # random_exp works for float* types only +DEFAULT_DATA_LARGE_TENSOR = [(2**16, 2**16)] + # For Binary miscellaneous operators like choose_element0_index # argument data must be indexed via an NDArray. # NOTE: Data used is DEFAULT_DATA DEFAULT_INDEX = [(1, 1024), (1, 1), (1, 100)] +DEFAULT_INDEX_LARGE_TENSOR = [(1, 2**16)] + # For Binary broadcast operators like - broadcast_add/sub/mod/logical_and etc.. DEFAULT_LHS = [(1024, 1024), (10000, 10), (10000, 1)] DEFAULT_RHS = [(1024, 1024), (10000, 10), (10000, 1)] +DEFAULT_LHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] +DEFAULT_RHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] + # For operators like - random_uniform, random_normal etc.. DEFAULT_SHAPE = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_SAMPLE = [(2,)] @@ -52,6 +59,15 @@ DEFAULT_K = [1] DEFAULT_P = [1] +DEFAULT_SHAPE_LARGE_TENSOR = [(2**16, 2**16)]#, (2**32, 1), (2**25, 2**7)] +DEFAULT_SAMPLE_LARGE_TENSOR = [(2**32,)] +DEFAULT_DATA_RPD_LARGE_TENSOR = [(2**32 + 1, 5)] +DEFAULT_ALPHA_RPD_LARGE_TENSOR = [(2**32,)] +DEFAULT_SAMPLE_RPE_LARGE_TENSOR = [(1, 2**32)] +DEFAULT_LAM_RPE_LARGE_TENSOR = [(1,)] +DEFAULT_SAMPLE_RPG_LARGE_TENSOR = [(1, 2**32 + 1)] +DEFAULT_ALPHA_RPG_LARGE_TENSOR = [(1,)] + # For operators like - sample_uniform, sample_normal etc.. # NOTE: There are many overlapping operators in random_* and sample_*, # Ex: random_uniform, sample_uniform. 
Parameter names are same, but, for @@ -73,6 +89,24 @@ DEFAULT_TARGET_SHAPE = [(256, 6)] DEFAULT_DATA_SM = [(32, 32), (64, 64)] +DEFAULT_LOW_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_HIGH_ND_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_MU_ND_LARGE_TENSOR = [[2.0] * 2**16 + [2.5] * 2**16] +DEFAULT_SIGMA_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_ALPHA_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_BETA_ND_LARGE_TENSOR = [[1.0] * 2**16 + [0.7] * 2**16] +DEFAULT_LAM_ND_LARGE_TENSOR = [[1.0] * 2**16 + [8.5] * 2**16] +DEFAULT_K_ND_LARGE_TENSOR = [[20] * 2**16 + [49] * 2**16] +DEFAULT_P_ND_LARGE_TENSOR = [[0.4] * 2**16 + [0.77] * 2**16] +DEFAULT_DATA_BILINEAR_LARGE_TENSOR = [(2**32, 1, 1, 1)] +DEFAULT_GRID_LARGE_TENSOR = [(2**32, 2, 1, 1)] +DEFAULT_DATA_GRIDGEN_LARGE_TENSOR = [(2**31, 2, 1, 1), (1, 6)] +DEFAULT_TARGET_SHAPE_LARGE_TENSOR = [(1, 6)] +DEFAULT_DATA_SM_LARGE_TENSOR = [(2**32,)] +DEFAULT_SHAPE_SE_LARGE_TENSOR = [(1,)] +DEFAULT_LAM_SE_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_SHAPE_SU_LARGE_TENSOR = [(2**32,)] + # For reduction operators # NOTE: Data used is DEFAULT_DATA DEFAULT_AXIS_SHAPE = [(), 0, (0, 1)] @@ -107,7 +141,6 @@ DEFAULT_NSIZE = [3] DEFAULT_PARAMETERS = [(7,), (104,)] DEFAULT_STATE = [(1, 4, 1), (2, 10000, 4)] -DEFAULT_MODE = ["rnn_relu", "rnn_tanh"] DEFAULT_STATE_SIZE = [1, 4] DEFAULT_NUM_LAYERS = [1, 2] DEFAULT_NUM_GROUPS = [1, 10] @@ -119,6 +152,30 @@ DEFAULT_KERNEL = [(1, 1, 1), (1, 1, 1)] DEFAULT_STRIDE = [(2, 2, 2), (1, 1, 1)] +DEFAULT_DATA_NN_BASIC_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_NUM_HIDDEN_LARGE_TENSOR = [(1,)] +DEFAULT_BIAS_LARGE_TENSOR = [(1,)] +DEFAULT_FLATTEN_LARGE_TENSOR = [False] +DEFAULT_GAMMA_LARGE_TENSOR = [(1,)] +DEFAULT_BETA_LARGE_TENSOR = [(1,)] +DEFAULT_MOVING_MEAN_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_MOVING_VAR_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_INPUT_DIM_LARGE_TENSOR = [2**32] +DEFAULT_OUTPUT_DIM_LARGE_TENSOR = [1] +DEFAULT_KERNEL_SIZE_LARGE_TENSOR = [1] +DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR = [1] +DEFAULT_STRIDE_1_LARGE_TENSOR = [1] +DEFAULT_STRIDE_2_LARGE_TENSOR = [1] +DEFAULT_DILATE_LARGE_TENSOR = [[]] +DEFAULT_PAD_LARGE_TENSOR = [[]] +DEFAULT_OUTPUT_SIZE_LARGE_TENSOR = [(2, 2, 1)] +DEFAULT_KERNEL_LARGE_TENSOR = [(1, 1, 1)] +DEFAULT_STRIDE_LARGE_TENSOR = [[]] +DEFAULT_PARAMETERS_LARGE_TENSOR = [(7,)] +DEFAULT_STATE_LARGE_TENSOR = [(1, 4, 1)] +DEFAULT_STATE_SIZE_LARGE_TENSOR = [1] +DEFAULT_NUM_LAYERS_LARGE_TENSOR = [1] + # BatchNorm DEFAULT_AXIS_BN = [1] @@ -132,41 +189,81 @@ # SVMOutput DEFAULT_LABEL_SVM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SVM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SVM_LARGE_TENSOR = [(2**29, 2, 2)] + # SoftmaxOutput DEFAULT_LABEL_SM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SO_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SO_LARGE_TENSOR = [(2**29, 2, 2)] + # FullyConnected DEFAULT_WEIGHT_FC = [(64, 3 * 256 * 256), (64, 10)] +DEFAULT_DATA_FC_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WEIGHT_FC_LARGE_TENSOR = [(1, 1)] +DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR = [1] + # Embedding DEFAULT_WEIGHT_EMBEDDING = [(3, 4), (16, 9)] +DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR = [(2**32, 1)] + # GroupNorm DEFAULT_DATA_GN = [(32, 3, 256, 256), (32, 10, 10000, 10)] DEFAULT_BETA_GAMMA_GN = [(1,), (10,)] +DEFAULT_DATA_GN_LARGE_TENSOR = [(2**27, 4, 4, 2)] +DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR = [(1,)] + # Dropout DEFAULT_DATA_DROPOUT = [(32, 3, 256, 256), (10000, 10)] DEFAULT_MODE_DROPOUT = ["always"] +DEFAULT_DATA_DROPOUT_LARGE_TENSOR = [(2**32 + 1,)] 
+DEFAULT_P_DROPOUT_LARGE_TENSOR = [.5] +DEFAULT_AXES_DROPOUT_LARGE_TENSOR = [[]] + # SpatialTransformer DEFAULT_DATA_ST = [(32, 3, 256, 6), (256, 3, 10000, 6)] DEFAULT_LOC_TAR_ST = [(32, 6), (256, 6)] +DEFAULT_DATA_ST_LARGE_TENSOR = [(2, 2**29, 1, 6)] +DEFAULT_LOC_TAR_ST_LARGE_TENSOR = [(2, 6)] + # im2col DEFAULT_KERNEL_I2C = [(3,), (3, 3)] DEFAULT_STRIDE_I2C = [(1,), (1, 1)] +DEFAULT_DATA_I2C_LARGE_TENSOR = [(2**29, 2, 2, 6)] +DEFAULT_KERNEL_I2C_LARGE_TENSOR = [(1,)] +DEFAULT_STRIDE_I2C_LARGE_TENSOR = [[]] + # col2im DEFAULT_DATA_C2I = [(32, 64, 256), (32, 64, 256)] -# RNN -DEFAULT_DATA_RNN = [(32, 4, 4), (512, 10000, 10)] -DEFAULT_P_RNN = [.5] +DEFAULT_DATA_C2I_LARGE_TENSOR = [(1, 2**30, 4)] # LRN DEFAULT_BETA_LRN = [.2] +DEFAULT_DATA_LRN_LARGE_TENSOR = [(2**27, 4, 4, 2)] + +# Correlation +DEFAULT_DATA1_LARGE_TENSOR = [(2**23, 8, 8, 8)] +DEFAULT_DATA2_LARGE_TENSOR = [(2**23, 8, 8, 8)] + +# For regression operators +DEFAULT_DATA_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] + +# For normalization operators +DEFAULT_DATA_NORM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_GAMMA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_BETA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_AXIS_LARGE_TENSOR = [-1] + # For optimizer operators DEFAULT_WEIGHT = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_GRAD = [(1024, 1024), (10000, 1), (10000, 100)] @@ -194,6 +291,20 @@ DEFAULT_CLIP_WEIGHTS = [-1.0, 0.8] DEFAULT_LAZY_UPDATE = [0, 1] +DEFAULT_WEIGHT_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_GRAD_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MOM_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MEAN_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_VAR_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_N_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_D_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_V_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_Z_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_G_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_R1_LARGE_TENSOR = [(1,)] +DEFAULT_R2_LARGE_TENSOR = [(1,)] +DEFAULT_DELTA_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] + # For rearrange operators # NOTE: Data needs to be a 4D tensor for operators like space_to_depth and depth_to_space # Hence below we append 4d to mark the difference. 
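Reviewer note (not part of the patch): the data and weight shapes added above are generally chosen so that the total element count reaches at least 2**32, which is what forces int64 indexing. A quick sanity check over a few representative shapes, assuming NumPy is available, might look like:

    import numpy as np

    # Shapes taken from the DEFAULT_*_LARGE_TENSOR values above
    for shape in [(2**16, 2**16), (2**32, 1), (2**25, 2**7), (2**32 + 1, 1)]:
        n_elements = int(np.prod(shape, dtype=np.int64))
        assert n_elements >= 2**32, shape  # beyond the signed 32-bit index range
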
@@ -201,6 +312,9 @@ DEFAULT_DATA_4d = [(1, 4, 2, 4), (10, 25, 10, 100)] DEFAULT_BLOCK_SIZE = [2, 5] +DEFAULT_DATA_4d_LARGE_TENSOR = [(1, 4, 2, 2**29), (1,2**4,2**4,2**24)] +DEFAULT_BLOCK_SIZE_LARGE_TENSOR = [2, 4] + # For miscellaneous operators DEFAULT_DATA_SQUEEZE = [(1, 1024, 1024), (32, 1, 256, 256)] DEFAULT_AXIS_SQUEEZE = [0, 1] @@ -217,6 +331,15 @@ DEFAULT_MHS = [(1024,), (10000,), (10000,)] DEFAULT_RHS_FEI = [(1024,), (10000,), (10000,)] +DEFAULT_DATA_SQUEEZE_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR = [1] +DEFAULT_WSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_GSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WDS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_LHS_FEI_LARGE_TENSOR = [(2, 2**32 + 1)] +DEFAULT_RHS_FEI_LARGE_TENSOR = [(2,)] +DEFAULT_MHS_LARGE_TENSOR = [(2,)] + # For swapaxis operator DEFAULT_DIM_1 = [0] DEFAULT_DIM_2 = [1] @@ -231,21 +354,33 @@ DEFAULT_Y = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_COND = [(1024,), (10000,), (10000,)] DEFAULT_DEPTH = [0] + # For ravel_multi_index op, ndim(shape) = 2; hence data NDArray's first dim = 2 # First dimension of input of ravel operator should match shape parameter dimension # DEFAULT_SHAPE is reused for ravel_multi_index op RAVEL_DATA = [(2, 1024)] +RAVEL_DATA_LARGE_TENSOR = [(2, 2**32)] +DEFAULT_X_LARGE_TENSOR = [(2**32, 1)] + # For loss operators DEFAULT_DATA_3d = [(1024, 100, 100)] DEFAULT_LABEL = [(100,100)] DEFAULT_DATA_SMCE = [(1024, 1024)] DEFAULT_LABEL_SMCE = [(1024,)] + +DEFAULT_LABEL_LARGE_TENSOR = [(1, 1)] +DEFAULT_DATA_CTCLOSS = [(2**32, 1, 1)] +DEFAULT_DATA_SMCE_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_LABEL_SMCE_LARGE_TENSOR = [(2**32 + 1,)] + # For NN operators DEFAULT_ACT_TYPE_LR = ['leaky', 'elu', 'selu', 'gelu'] DEFAULT_ACT_TYPE_ACTIVATION = ['relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'] DEFAULT_LABEL_SOFTMAX = [(1024, 1024), (10000, 1), (10000, 100)] +DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR = [(2**32, 1)] + # For linalg operators DEFAULT_A = [(1024, 1024)] DEFAULT_B = [(1024, 1024)] @@ -253,6 +388,11 @@ DEFAULT_A_MT = [(1024, 1035)] DEFAULT_AXES = [[0, 1]] +DEFAULT_A_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_B_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_C_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_A_MT_LARGE_TENSOR = [(2**32 + 1, 1)] + # Default Inputs. MXNet Op Param Name to Default Input mapping DEFAULTS_INPUTS = {"data": DEFAULT_DATA, "dtype": DEFAULT_DTYPE, @@ -363,13 +503,10 @@ "output_size": DEFAULT_OUTPUT_SIZE, "kernel_col2im": DEFAULT_KERNEL, "stride_col2im": DEFAULT_STRIDE, - "data_rnn": DEFAULT_DATA_RNN, - "p_rnn": DEFAULT_P_RNN, "parameters": DEFAULT_PARAMETERS, "state": DEFAULT_STATE, "state_size": DEFAULT_STATE_SIZE, "num_layers": DEFAULT_NUM_LAYERS, - "mode_rnn": DEFAULT_MODE, "data_groupnorm": DEFAULT_DATA_GN, "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN, "beta_groupnorm": DEFAULT_BETA_GAMMA_GN, @@ -433,6 +570,222 @@ "data_layernorm": DEFAULT_DATA_NN_BASIC, "axis_layernorm": DEFAULT_AXIS} +# Default Inputs for Large Tensor. 
MXNet Op Param Name to Default Input mapping +DEFAULTS_INPUTS_LARGE_TENSOR = {"data": DEFAULT_DATA_LARGE_TENSOR, + "dtype": DEFAULT_DTYPE, + "dtype_int": DEFAULT_DTYPE_INT, + "dtype_float": DEFAULT_DTYPE_FLOAT, + "sample": DEFAULT_SAMPLE_LARGE_TENSOR, + "lhs": DEFAULT_LHS_LARGE_TENSOR, + "rhs": DEFAULT_RHS_LARGE_TENSOR, + "shape": DEFAULT_SHAPE_LARGE_TENSOR, + "low": DEFAULT_LOW, + "high": DEFAULT_HIGH, + "low_nd": DEFAULT_LOW_ND_LARGE_TENSOR, + "high_nd": DEFAULT_HIGH_ND_LARGE_TENSOR, + "mu_nd": DEFAULT_MU_ND_LARGE_TENSOR, + "sigma": DEFAULT_SIGMA_LARGE_TENSOR, + "alpha_nd": DEFAULT_ALPHA_ND_LARGE_TENSOR, + "beta_nd": DEFAULT_BETA_ND_LARGE_TENSOR, + "lam_nd": DEFAULT_LAM_ND_LARGE_TENSOR, + "lam_random_pdf_exponential": DEFAULT_LAM_RPE_LARGE_TENSOR, + "sample_random_pdf_exponential": DEFAULT_SAMPLE_RPE_LARGE_TENSOR, + "k": DEFAULT_K, + "p": DEFAULT_P, + "k_nd": DEFAULT_K_ND_LARGE_TENSOR, + "p_nd": DEFAULT_P_ND_LARGE_TENSOR, + "axis_shape": DEFAULT_AXIS_SHAPE, + "axis": DEFAULT_AXIS, + "weight" : DEFAULT_WEIGHT_LARGE_TENSOR, + "weight32" : DEFAULT_WEIGHT_LARGE_TENSOR, + "grad" : DEFAULT_GRAD_LARGE_TENSOR, + "mean" : DEFAULT_MEAN_LARGE_TENSOR, + "var" : DEFAULT_VAR_LARGE_TENSOR, + "mom" : DEFAULT_MOM_LARGE_TENSOR, + "r1": DEFAULT_R1_LARGE_TENSOR, + "r2": DEFAULT_R2_LARGE_TENSOR, + "n" : DEFAULT_N_LARGE_TENSOR, + "d" : DEFAULT_D_LARGE_TENSOR, + "v" : DEFAULT_V_LARGE_TENSOR, + "z" : DEFAULT_Z_LARGE_TENSOR, + "g" : DEFAULT_G_LARGE_TENSOR, + "delta" : DEFAULT_DELTA_LARGE_TENSOR, + "lr" : DEFAULT_LR, + "lrs" : DEFAULT_LRS, + "wds" : DEFAULT_LRS, + "wd": DEFAULT_LR, + "gamma1" : DEFAULT_GAMMA_1, + "gamma2" : DEFAULT_GAMMA_2, + "epsilon" : DEFAULT_EPSILON, + "beta1" : DEFAULT_BETA_1, + "beta2" : DEFAULT_BETA_2, + "t" : DEFAULT_T, + "rescale_grad" : DEFAULT_RESCALE_GRAD, + "clip_grad" : DEFAULT_CLIP_GRADIENT, + "lazy_update" : DEFAULT_LAZY_UPDATE, + "data_4d": DEFAULT_DATA_4d_LARGE_TENSOR, + "dim1": DEFAULT_DIM_1, + "dim2": DEFAULT_DIM_2, + "block_size": DEFAULT_BLOCK_SIZE_LARGE_TENSOR, + "args": DEFAULT_ARGS, + "index": DEFAULT_INDEX_LARGE_TENSOR, + "data_smce": DEFAULT_DATA_SMCE_LARGE_TENSOR, + "label_smce": DEFAULT_LABEL_SMCE_LARGE_TENSOR, + "grid": DEFAULT_GRID_LARGE_TENSOR, + "data_bilinearsampler": DEFAULT_DATA_BILINEAR_LARGE_TENSOR, + "transform_type": DEFAULT_TRANSFORM_TYPE, + "data_gridgenerator": DEFAULT_DATA_GRIDGEN_LARGE_TENSOR, + "target_shape_gridgenerator": DEFAULT_TARGET_SHAPE_LARGE_TENSOR, + "data_sample_multinomial": DEFAULT_DATA_SM_LARGE_TENSOR, + "data_random_pdf_dirichlet": DEFAULT_DATA_RPD_LARGE_TENSOR, + "alpha_random_pdf_dirichlet": DEFAULT_ALPHA_RPD_LARGE_TENSOR, + "sample_random_pdf_gamma": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "alpha_random_pdf_gamma": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "beta_random_pdf_gamma": DEFAULT_BETA_LARGE_TENSOR, + "sample_random_pdf_generalized_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "alpha_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "k_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "p_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_normal": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sigma_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_poisson": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "lam_random_pdf_poisson": 
DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_uniform": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "low_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "high_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "shape_sample_exponential": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "lam_sample_exponential": DEFAULT_LAM_SE_LARGE_TENSOR, + "mu_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "sigma_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "shape_sample_poisson": DEFAULT_LAM_SE_LARGE_TENSOR, + "lam_sample_poisson": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "shape_sample_uniform": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "low_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "high_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "alpha_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "beta_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "mu_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "alpha_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "k_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "p_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "A": DEFAULT_A_LARGE_TENSOR, + "B": DEFAULT_B_LARGE_TENSOR, + "C": DEFAULT_C_LARGE_TENSOR, + "A_linalg_maketrian": DEFAULT_A_MT_LARGE_TENSOR, + "axes": DEFAULT_AXES, + "act_type_leakyrelu": DEFAULT_ACT_TYPE_LR, + "label_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "act_type_activation": DEFAULT_ACT_TYPE_ACTIVATION, + "data_squeeze": DEFAULT_DATA_SQUEEZE_LARGE_TENSOR, + "axis_squeeze": DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR, + "a_min": DEFAULT_A_MIN, + "a_max": DEFAULT_A_MAX, + "weights_sum_sq": DEFAULT_WSS_LARGE_TENSOR, + "grads_sum_sq": DEFAULT_GSS_LARGE_TENSOR, + "wds": DEFAULT_WDS_LARGE_TENSOR, + "eta": DEFAULT_ETA, + "eps": DEFAULT_EPSILON, + "stype": DEFAULT_STYPE, + "indices": DEFAULT_INDICES, + "begin": DEFAULT_BEGIN, + "end": DEFAULT_END, + "shape_like": DEFAULT_DATA_LARGE_TENSOR, + "depth": DEFAULT_DEPTH, + "condition": DEFAULT_X_LARGE_TENSOR, + "x": DEFAULT_X_LARGE_TENSOR, + "y": DEFAULT_X_LARGE_TENSOR, + "ravel_data": RAVEL_DATA_LARGE_TENSOR, + "a": DEFAULT_A_LARGE_TENSOR, + "lhs_fill_element_0index": DEFAULT_LHS_FEI_LARGE_TENSOR, + "rhs_fill_element_0index": DEFAULT_RHS_FEI_LARGE_TENSOR, + "mhs": DEFAULT_MHS_LARGE_TENSOR, + "lrs_multi_lars": DEFAULT_WSS_LARGE_TENSOR, + "data_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "data_spatialtransformer": DEFAULT_DATA_ST_LARGE_TENSOR, + "loc_spatialtransformer": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "target_shape": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "transform_type_spatialtransformer": DEFAULT_TRANSFORM, + "sampler_type": DEFAULT_SAMPLER, + "data_col2im": DEFAULT_DATA_C2I_LARGE_TENSOR, + "output_size": DEFAULT_OUTPUT_SIZE_LARGE_TENSOR, + "kernel_col2im": DEFAULT_KERNEL_LARGE_TENSOR, + "stride_col2im": DEFAULT_STRIDE_LARGE_TENSOR, + "data_ctcloss": DEFAULT_DATA_CTCLOSS, + "label_ctcloss": DEFAULT_LABEL_LARGE_TENSOR, + "data_ctc_loss": DEFAULT_DATA_CTCLOSS, + "label_ctc_loss": DEFAULT_LABEL_LARGE_TENSOR, + "parameters": DEFAULT_PARAMETERS_LARGE_TENSOR, + "state": DEFAULT_STATE_LARGE_TENSOR, + "state_size": DEFAULT_STATE_SIZE_LARGE_TENSOR, + "num_layers": DEFAULT_NUM_LAYERS_LARGE_TENSOR, + "data_groupnorm": DEFAULT_DATA_GN_LARGE_TENSOR, + "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "beta_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "eps": DEFAULT_EPSILON, + "data_dropout": 
DEFAULT_DATA_DROPOUT_LARGE_TENSOR, + "mode_dropout": DEFAULT_MODE_DROPOUT, + "p_dropout": DEFAULT_P_DROPOUT_LARGE_TENSOR, + "axes_dropout": DEFAULT_AXES_DROPOUT_LARGE_TENSOR, + "data_nn_basic": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "num_hidden": DEFAULT_NUM_HIDDEN_LARGE_TENSOR, + "data_fullyconnected": DEFAULT_DATA_FC_LARGE_TENSOR, + "weight_fullyconnected": DEFAULT_WEIGHT_FC_LARGE_TENSOR, + "num_hidden_fullyconnected": DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR, + "weight_embedding": DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR, + "bias": DEFAULT_BIAS_LARGE_TENSOR, + "flatten": DEFAULT_FLATTEN_LARGE_TENSOR, + "data_batchnorm": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "gamma_batchnorm": DEFAULT_GAMMA_LARGE_TENSOR, + "beta_batchnorm": DEFAULT_BETA_LARGE_TENSOR, + "moving_mean_batchnorm": DEFAULT_MOVING_MEAN_LARGE_TENSOR, + "moving_var_batchnorm": DEFAULT_MOVING_VAR_LARGE_TENSOR, + "axis_batchnorm": DEFAULT_AXIS_BN, + "data_softmaxoutput": DEFAULT_DATA_SO_LARGE_TENSOR, + "label_softmaxoutput": DEFAULT_LABEL_SO_LARGE_TENSOR, + "data_maeregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_maeregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_logisticregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_logisticregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_linearregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_linearregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_svmoutput": DEFAULT_DATA_SVM_LARGE_TENSOR, + "label_svmoutput": DEFAULT_LABEL_SVM_LARGE_TENSOR, + "grad_scale": DEFAULT_GRAD_SCALE, + "normalization": DEFAULT_NORMALIZATION, + "margin": DEFAULT_MARGIN, + "regularization_coefficient": DEFAULT_REG_COEFF, + "data_l2normalization": DEFAULT_DATA_NORM_LARGE_TENSOR, + "mode_l2normalization": DEFAULT_MODE_L2, + "gamma_layernorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_layernorm": DEFAULT_BETA_NORM_LARGE_TENSOR, + "data_instancenorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "gamma_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "input_dim": DEFAULT_INPUT_DIM_LARGE_TENSOR, + "output_dim": DEFAULT_OUTPUT_DIM_LARGE_TENSOR, + "sparse_grad": DEFAULT_SPARSE_GRAD, + "data1": DEFAULT_DATA1_LARGE_TENSOR, + "data2": DEFAULT_DATA2_LARGE_TENSOR, + "kernel_size": DEFAULT_KERNEL_SIZE_LARGE_TENSOR, + "max_displacement": DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR, + "stride1": DEFAULT_STRIDE_1_LARGE_TENSOR, + "stride2": DEFAULT_STRIDE_2_LARGE_TENSOR, + "data_im2col": DEFAULT_DATA_I2C_LARGE_TENSOR, + "kernel_im2col": DEFAULT_KERNEL_I2C_LARGE_TENSOR, + "stride_im2col": DEFAULT_STRIDE_I2C_LARGE_TENSOR, + "dilate_im2col": DEFAULT_DILATE_LARGE_TENSOR, + "pad_im2col": DEFAULT_PAD_LARGE_TENSOR, + "data_lrn": DEFAULT_DATA_LRN_LARGE_TENSOR, + "alpha_lrn": DEFAULT_ALPHA, + "beta_lrn": DEFAULT_BETA_LRN, + "nsize": DEFAULT_NSIZE, + "data_layernorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "axis_layernorm": DEFAULT_AXIS_LARGE_TENSOR} # These are names of MXNet operator parameters that is of type NDArray. 
# We maintain this list to automatically recognize these parameters are to be @@ -446,4 +799,6 @@ "v", "z", "g", "delta", "args", "indices", "shape_like", "y", "x", "condition", "a", "index", "raveL_data", "label", "grid", "A", "B", "C", "r1", "r2", "rois", "lrs", "wds", "weights_sum_sq", - "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state"] + "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state", + "state_cell"] + diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py index f6cdfe004215..f2cce0abec09 100644 --- a/benchmark/opperf/utils/benchmark_utils.py +++ b/benchmark/opperf/utils/benchmark_utils.py @@ -181,7 +181,7 @@ def run_performance_test(ops, inputs, run_backward=True, return op_benchmark_result -def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): +def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs): # Running SoftmaxOutput backwards on GPU results in errors # track issue here: https://github.com/apache/incubator-mxnet/issues/880 gpu_backwards_disabled_ops = ['SoftmaxOutput'] @@ -195,7 +195,7 @@ def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): for op, op_params in ops.items(): if ctx == mx.cpu() or op not in gpu_disabled_ops: # Prepare inputs for the operator - inputs = prepare_op_inputs(op, op_params) + inputs = prepare_op_inputs(op, op_params, int64_tensor) # setting backward false for ops with known issue if (ctx == mx.gpu() and op in gpu_backwards_disabled_ops) or op in no_backward: diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py index 99678b8d31a9..b27b8e4e73b5 100644 --- a/benchmark/opperf/utils/op_registry_utils.py +++ b/benchmark/opperf/utils/op_registry_utils.py @@ -20,7 +20,7 @@ from mxnet import runtime import mxnet as mx -from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, MX_OP_MODULE +from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, DEFAULTS_INPUTS_LARGE_TENSOR, MX_OP_MODULE def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forward_backward=True): @@ -109,7 +109,7 @@ def prepare_op_inputs(arg_params, arg_values): return inputs -def prepare_op_inputs(op, arg_params): +def prepare_op_inputs(op, arg_params, int64_tensor): inputs = [] # 4d tensor is needed only by following two ops @@ -120,14 +120,27 @@ def prepare_op_inputs(op, arg_params): # For ops with args that need to change shape/value for different ops custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian', - 'SpatialTransformer', 'col2im', 'RNN', 'GroupNorm', 'Dropout', 'FullyConnected', + 'SpatialTransformer', 'col2im', 'GroupNorm', 'Dropout', 'FullyConnected', 'SoftmaxOutput', 'LinearRegressionOutput', 'BatchNorm', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'im2col', 'LRN', 'squeeze', 'fill_element_0index'} + custom_data_int64 = {'random_pdf_dirichlet', 'random_pdf_exponential', 'random_pdf_gamma', + 'random_pdf_generalized_negative_binomial', 'random_pdf_negative_binomial', + 'random_pdf_normal', 'random_pdf_poisson', 'random_pdf_uniform', 'sample_exponential', + 'sample_normal', 'sample_poisson', 'sample_uniform', 'sample_gamma', + 'sample_generalized_negative_binomial', 'sample_negative_binomial', 'CTCLoss', + 'ctc_loss', 'multi_lars'} + int_only = {'random_randint'} float_only = {'log_softmax', 'softmax', 
'softmin'} + if int64_tensor == 'on': + default_inputs = DEFAULTS_INPUTS_LARGE_TENSOR + custom_data |= custom_data_int64 + else: + default_inputs = DEFAULTS_INPUTS + # Prepare op to default input mapping arg_values = {} for arg_name, arg_type in zip(arg_params["params"]["arg_names"], @@ -137,29 +150,29 @@ def prepare_op_inputs(op, arg_params): # same for randint (which is the only op that takes only int as input) # rest all operators take int as well as float if op in int_only and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_int"] + arg_values[arg_name] = default_inputs["dtype_int"] elif (op.startswith(('random','sample')) or op in float_only) and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_float"] + arg_values[arg_name] = default_inputs["dtype_float"] elif "NDArray" in arg_type and op == "ravel_multi_index": - arg_values[arg_name] = DEFAULTS_INPUTS["ravel_data"] - elif op in custom_data and arg_name + "_" + op.lower() in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_" + op.lower()] - elif "NDArray" in arg_type and arg_name + "_nd" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_nd"] - elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_4d"] - elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_3d"] + arg_values[arg_name] = default_inputs["ravel_data"] + elif op in custom_data and arg_name + "_" + op.lower() in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_" + op.lower()] + elif "NDArray" in arg_type and arg_name + "_nd" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_nd"] + elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_4d"] + elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_3d"] elif "NDArray" in arg_type and op == 'softmax_cross_entropy': - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_smce"] - elif arg_name in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name] - elif "float" in arg_type and arg_name + "_float" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_float"] - elif "Shape" in arg_type and arg_name + "_shape" in DEFAULTS_INPUTS: + arg_values[arg_name] = default_inputs[arg_name + "_smce"] + elif arg_name in default_inputs: + arg_values[arg_name] = default_inputs[arg_name] + elif "float" in arg_type and arg_name + "_float" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_float"] + elif "Shape" in arg_type and arg_name + "_shape" in default_inputs: # This is for cases where in some ops 'axis' is Int in some ops a shape tuple. # Ex: axis in sum is shape, axis in sort is int. 
- arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_shape"] + arg_values[arg_name] = default_inputs[arg_name + "_shape"] # Number of different inputs we want to use to test # the operator @@ -340,7 +353,7 @@ def get_all_nn_basic_operators(): nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', 'SoftmaxOutput', 'LinearRegressionOutput', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'SpatialTransformer', 'im2col', - 'col2im', 'GroupNorm', 'RNN', 'LRN'] + 'col2im', 'GroupNorm', 'LRN'] # Get all mxnet operators mx_operators = _get_all_mxnet_operators()
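
Reviewer note (not part of the patch): a minimal way to exercise the new flag end to end is sketched below. The paths and the reduced warmup/run counts are illustrative only, and the int64 path assumes an MXNet build with large-tensor support enabled.

    # From the MXNet source root -- full suite with int64 tensor inputs:
    #   python ./benchmark/opperf/opperf.py --int64-tensor on --warmup 1 --runs 5
    #
    # Or drive a single category directly with the updated signature:
    import mxnet as mx
    from benchmark.opperf.nd_operations.unary_operators import run_mx_unary_operators_benchmarks

    # Default behaviour is unchanged: int64_tensor='off' keeps the standard inputs
    std_results = run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                    profiler='native', int64_tensor='off')

    # Large tensor inputs (dimensions >= 2**32); lower warmup/runs keep the pass short
    lt_results = run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                   profiler='native', int64_tensor='on',
                                                   warmup=1, runs=5)
    print(lt_results)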