diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py index b8a2d47f6943..ca14868a6278 100644 --- a/benchmark/opperf/nd_operations/array_rearrange.py +++ b/benchmark/opperf/nd_operations/array_rearrange.py @@ -30,7 +30,7 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the rearrange operators in MXNet. Parameters @@ -41,6 +41,8 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py index b85bf1e2d239..4daa38b792c3 100644 --- a/benchmark/opperf/nd_operations/binary_operators.py +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -38,7 +38,7 @@ get_all_elemen_wise_binary_operators, get_all_misc_binary_operators -def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous binary operators in MXNet. 
@@ -61,12 +61,12 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi # Fetch all Miscellaneous Binary Operators mx_binary_misc_ops = get_all_misc_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, large_tensor, warmup, runs) return mx_binary_op_results def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the binary broadcast operators in MXNet. Parameters @@ -77,6 +77,8 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -95,7 +97,7 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the binary element_wise operators in MXNet. 
Parameters @@ -106,6 +108,8 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 10 Number of times to run for warmup runs: int, default 50 diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py index b2ee65da18cf..1adf25f79488 100644 --- a/benchmark/opperf/nd_operations/gemm_operators.py +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -36,7 +36,7 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the GEMM + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the GEMM operators (dot, batch_dot, khatri_rao) in MXNet. 
Parameters @@ -47,6 +47,8 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py index 04307b5fa96c..acdbfac70f6e 100644 --- a/benchmark/opperf/nd_operations/nn_activation_operators.py +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -44,8 +44,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the activation - operators in MXNet. + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the activation + operators (relu, sigmoid, softmax) in MXNet. 
Parameters ---------- @@ -55,6 +55,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= Precision to use for benchmarks profiler: str, default 'native' Module to use for tracking benchmark excecution time + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py index c876fab04f55..8c69b1d97a9d 100644 --- a/benchmark/opperf/nd_operations/nn_basic_operators.py +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -30,6 +30,27 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (large_tensor) for all the basic neural network + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + large_tensor: str, default 'off' + Tensor size to use for tests + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
+ + """ if large_tensor == 'on': # FullyConnnected operator benchmarks fc_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "FullyConnected")], diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py index a8b452ea2e54..6606065a0df3 100644 --- a/benchmark/opperf/nd_operations/nn_conv_operators.py +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -53,6 +53,27 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the pooling + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + large_tensor: str, default 'off' + Tensor size to use for tests + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ pool_types = ['avg', 'max', 'sum'] global_pool_types = [0, 1] @@ -159,6 +180,27 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + large_tensor: str, default 'off' + Tensor size to use for tests + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. 
Key -> Name of the operator, Value -> Benchmark results. + + """ conv1d_benchmark_res = [] conv2d_benchmark_res = [] if large_tensor == 'on': @@ -245,6 +287,27 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', large_tensor='off', dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the transpose convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + large_tensor: str, default 'off' + Tensor size to use for tests + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ # Conv1DTranspose Benchmarks conv1d_transpose_benchmark_res = [] if large_tensor == 'on': diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py index c2554d4cba51..5c972426d21e 100644 --- a/benchmark/opperf/nd_operations/nn_optimizer_operators.py +++ b/benchmark/opperf/nd_operations/nn_optimizer_operators.py @@ -37,7 +37,7 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the neural network + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the neural network optimizer update operators in MXNet. 
Parameters @@ -48,6 +48,8 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py index aeded5dbae66..cd1fbd0363ad 100644 --- a/benchmark/opperf/nd_operations/random_sampling_operators.py +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -35,7 +35,7 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the random sampling + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the random sampling operators in MXNet. 
Parameters @@ -46,6 +46,8 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py index 5cec167be4cd..b4e662b81d30 100644 --- a/benchmark/opperf/nd_operations/reduction_operators.py +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -32,7 +32,7 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the reduction + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the reduction operators in MXNet. 
Parameters @@ -43,6 +43,8 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/sorting_searching_operators.py b/benchmark/opperf/nd_operations/sorting_searching_operators.py index 057d164514be..fbed82b01f43 100644 --- a/benchmark/opperf/nd_operations/sorting_searching_operators.py +++ b/benchmark/opperf/nd_operations/sorting_searching_operators.py @@ -30,7 +30,7 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the sorting and searching + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the sorting and searching operators in MXNet. 
Parameters @@ -41,6 +41,8 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py index 012ea5172ae2..f468bebd414b 100644 --- a/benchmark/opperf/nd_operations/unary_operators.py +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -36,7 +36,7 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', large_tensor='off', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the unary + """Runs benchmarks with the given context, precision (dtype), and input data size (large_tensor) for all the unary operators in MXNet. Parameters @@ -47,6 +47,8 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + large_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100