diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6931b40c667d9..0f26e790bfebd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -394,7 +394,6 @@ endif()
 
 if(USE_PROFILER)
   message(STATUS "Build with profiler...")
-  add_definitions(-DUSE_PROFILER=1)
   tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
   set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 83c26933be45b..be14d6fa2ddc4 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -554,10 +554,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
  *        minimum duration requirement of one `repeat`.
  *        i.e., When the run time of one `repeat` falls below this time,
  *        the `number` parameter will be automatically increased.
- * \param f_preproc The function to be executed before we excetute time evaluator.
+ * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
+ *        defined by `repeats_to_cooldown`.
+ * \param repeats_to_cooldown The number of repeats before the
+ *        cooldown is activated.
+ * \param f_preproc The function to be executed before we execute the time
+ *        evaluator.
  * \return f_timer A timer function.
  */
 PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
+                             int cooldown_interval_ms, int repeats_to_cooldown,
                              PackedFunc f_preproc = nullptr);
 
 }  // namespace profiling
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index 48ed44ef19b78..ac79007539637 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
         graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
         print("|graph_nodes| = ", len(graph_nodes))
         print("|graph_time| = ", len(graph_time))
-        graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
+        graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
         for k, v in graph_nodes_time.items():
             print(f"{k} : {v:.3f}")
 
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index f144b3cb4a824..202dd0e39139d 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -222,13 +222,19 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)
 
-    def _run_debug(self):
+    def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
         """Execute the node specified with index will be executed.
         Each debug output will be copied to the buffer
         Time consumed for each execution will be set as debug output.
         """
         # Get timing.
-        self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
+        self.debug_datum._time_list = self.run_individual(
+            number=number,
+            repeat=repeat,
+            min_repeat_ms=min_repeat_ms,
+            cooldown_interval_ms=cooldown_interval_ms,
+            repeats_to_cooldown=repeats_to_cooldown,
+        )
         # Get outputs.
self._run_per_layer() @@ -259,11 +265,46 @@ def debug_get_output(self, node, out=None): self._debug_get_output(node_index, out) - def run(self, **input_dict): + # pylint: disable=arguments-differ + def run( + self, + number=10, + repeat=1, + min_repeat_ms=1, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + **input_dict, + ): """Run forward execution of the graph with debug Parameters ---------- + number: int, optional + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. + + repeat: int, optional + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. + + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + input_dict : dict of str to NDArray List of input values to be feed to """ @@ -271,7 +312,13 @@ def run(self, **input_dict): self.set_input(**input_dict) # Step 1. Execute the graph - self._run_debug() + self._run_debug( + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, + ) # Step 2. Dump the output tensors to the dump folder self.debug_datum.dump_output_tensor() # Step 3. Dump the Chrome trace to the dump folder @@ -279,11 +326,62 @@ def run(self, **input_dict): # Step 4. Display the collected information self.debug_datum.display_debug_result() - def run_individual(self, number, repeat=1, min_repeat_ms=0): - ret = self._run_individual(number, repeat, min_repeat_ms) - return ret.strip(",").split(",") if ret else [] + def run_individual( + self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1 + ): + """Run each operation in the graph and get the time per op for all ops. + + number: int + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. - def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): + repeat: int, optional + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. 
+ + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + + Returns + ------- + A 2-dimensional array where the dimensions are: the index of the operation and + the repeat of the measurement. + """ + ret = self._run_individual( + number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + ) + results = [] + for node_data in ret.strip(";").split(";"): + results.append([]) + for repeat_data in node_data.strip(",").split(","): + if repeat_data: + results[-1].append(float(repeat_data)) + return results + + def run_individual_node( + self, + index, + number=10, + repeat=1, + min_repeat_ms=0, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + ): """Benchmark a single node in the serialized graph. This does not do any data transfers and uses arrays already on the device. @@ -304,7 +402,7 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): The returned result contains `repeat` costs, each of which is an average of `number` costs. - min_repeat_ms: int, optional + min_repeat_ms : int, optional The minimum duration of one `repeat` in milliseconds. By default, one `repeat` contains `number` runs. If this parameter is set, the parameters `number` will be dynamically adjusted to meet the @@ -312,19 +410,26 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. 
+ Returns ------- A module BenchmarkResult """ # Results are returned as serialized strings which we deserialize - ret = self._run_individual_node(index, number, repeat, min_repeat_ms) - answer = [] - for value in ret.split(","): - if value.strip() == "": - continue - answer.append(float(value)) - - return BenchmarkResult(answer) + ret = self._run_individual_node( + index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + ) + results = [] + for repeat_data in ret.replace(" ", "").strip(",").split(","): + if repeat_data: + results.append(float(repeat_data)) + return BenchmarkResult(results) def profile(self, collectors=None, **input_dict): """Run forward execution of the graph and collect overall and per-op diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 8185391e35519..006edd3458025 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self): def get_output_tensors(self): """Get the output tensors of each operation in numpy format""" eid = 0 - order = 0 output_tensors = {} - for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)): + for i, node in enumerate(self._nodes_list): num_outputs = self.get_graph_node_output_num(node) for j in range(num_outputs): - order += time[0] # the node name is not unique, so we need a consistent # indexing based on the list ordering in the nodes @@ -157,7 +155,7 @@ def s_to_us(t): return t * 10**6 starting_times = np.zeros(len(self._time_list) + 1) - starting_times[1:] = np.cumsum([times[0] for times in self._time_list]) + starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list]) def node_to_events(node, times, starting_time): return [ @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time): ), ChromeTraceEvent( # Use start + duration instead of end to ensure precise timings. 
- ts=s_to_us(times[0] + starting_time), + ts=s_to_us(np.mean(times) + starting_time), tid=1, pid=1, ph="E", @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph): def get_debug_result(self, sort_by_time=True): """Return the debugger result""" - header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"] - lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"] + header = [ + "Node Name", + "Ops", + "Time(us)", + "Time(%)", + "Shape", + "Inputs", + "Outputs", + "Measurements(us)", + ] + lines = [ + "---------", + "---", + "--------", + "-------", + "-----", + "------", + "-------", + "----------------", + ] eid = 0 data = [] - total_time = sum(time[0] for time in self._time_list) + total_time = sum([np.mean(time) for time in self._time_list]) for node, time in zip(self._nodes_list, self._time_list): + time_mean = np.mean(time) num_outputs = self.get_graph_node_output_num(node) for j in range(num_outputs): op = node["op"] @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True): continue name = node["name"] shape = str(self._output_tensor_list[eid].shape) - time_us = round(time[0] * 1e6, 3) - time_percent = round(((time[0] / total_time) * 100), 3) + time_us = round(time_mean * 1e6, 3) + time_percent = round(((time_mean / total_time) * 100), 3) inputs = str(node["attrs"]["num_inputs"]) outputs = str(node["attrs"]["num_outputs"]) - node_data = [name, op, time_us, time_percent, shape, inputs, outputs] + measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time]) + node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements] data.append(node_data) eid += 1 @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True): data = sorted(data, key=lambda x: x[2], reverse=True) # Insert a row for total time at the end. rounded_total_time_us = round(total_time * 1e6, 3) - data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"]) + data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"]) fmt = "" for i, _ in enumerate(header): diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index e8e2798ef7346..a4b90baf1d380 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -356,6 +356,8 @@ def benchmark( number=5, min_repeat_ms=None, end_to_end=False, + cooldown_interval_ms=0, + repeats_to_cooldown=1, **kwargs, ): """Calculate runtime of a function by repeatedly calling it. @@ -395,7 +397,7 @@ def benchmark( `number` should be increased when the runtime of the function is small (less than a 1/10 of a millisecond). - min_repeat_ms : Optional[float] + min_repeat_ms : Optional[int] If set, the inner loop will be run until it takes longer than `min_repeat_ms` milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. @@ -405,6 +407,13 @@ def benchmark( returned tensors in the total runtime. This will give accurate timings for end to end workloads. + cooldown_interval_ms: Optional[int] + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: Optional[int] + The number of repeats before the cooldown is activated. + kwargs : Dict[str, Object] Named arguments to the function. These are cached before running timing code, so that data transfer costs are not counted in the runtime. 
@@ -432,5 +441,11 @@ def benchmark( if kwargs: self.set_input(**kwargs) return self.module.time_evaluator( - func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms + func_name, + device, + repeat=repeat, + number=number, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, )() diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index 8663eb460c4a8..0e47be865b106 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -204,7 +204,7 @@ def f_per_layer(rt_mod, dev, input_data): graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) print("|graph_nodes| = ", len(graph_nodes)) print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} + graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} for k, v in graph_nodes_time.items(): print(f"{k} : {v:.3f}") diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index c614e5d757c3f..6123f33161711 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -270,7 +270,17 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): + def time_evaluator( + self, + func_name, + dev, + number=10, + repeat=1, + min_repeat_ms=0, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + f_preproc="", + ): """Get an evaluator that measures time cost of running function. Parameters @@ -299,6 +309,14 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + f_preproc: str, optional The preprocess function name we want to execute before executing the time evaluator. @@ -322,6 +340,8 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f number, repeat, min_repeat_ms, + cooldown_interval_ms, + repeats_to_cooldown, f_preproc, ) diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 6e59c3455a914..83f1656a0dd80 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -584,6 +584,8 @@ def benchmark( number=5, min_repeat_ms=None, end_to_end=False, + cooldown_interval_ms=0, + repeats_to_cooldown=1, **kwargs, ): """Calculate runtime of a function by repeatedly calling it. @@ -623,7 +625,7 @@ def benchmark( `number` should be increased when the runtime of the function is small (less than a 1/10 of a millisecond). - min_repeat_ms : Optional[float] + min_repeat_ms : Optional[int] If set, the inner loop will be run until it takes longer than `min_repeat_ms` milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. @@ -633,6 +635,13 @@ def benchmark( returned tensors in the total runtime. This will give accurate timings for end to end workloads. 
+ cooldown_interval_ms: Optional[int] + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: Optional[int] + The number of repeats before the cooldown is activated. + args : Sequence[Object] Arguments to the function. These are cached before running timing code, so that data transfer costs are not counted in the runtime. @@ -667,5 +676,11 @@ def benchmark( if args or kwargs: self.set_input(func_name, *args, **kwargs) return self.module.time_evaluator( - "invoke", device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms + "invoke", + device, + repeat=repeat, + number=number, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, )(func_name) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index a8a17041f5eab..1a27b3db7ef8d 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,12 @@ #include #include +#if defined(_WIN32) || defined(WIN32) +#include +#elif __unix__ +#include +#endif + // Handle internal errors static char g_last_error[1024]; @@ -471,6 +478,8 @@ typedef struct { int number; int repeat; int min_repeat_ms; + int cooldown_interval_ms; + int repeats_to_cooldown; } time_evaluator_state_t; static time_evaluator_state_t g_time_evaluator_state; @@ -479,13 +488,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re int* ret_type_code) { ret_val[0].v_handle = NULL; ret_type_code[0] = kTVMNullptr; - if (num_args < 8) { + if (num_args < 10) { TVMAPIErrorf("not enough args"); return kTvmErrorFunctionCallNumArguments; } if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr || type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt || - type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMStr) { + type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt || + type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) { TVMAPIErrorf("one or more invalid arg types"); return kTvmErrorFunctionCallWrongArgType; } @@ -497,6 +507,8 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; + g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64; + g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64; int ret_code = TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time); @@ -528,23 +540,34 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* } result_byte_arr->data = NULL; size_t data_size = sizeof(double) * g_time_evaluator_state.repeat; - err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void*)&result_byte_arr->data); + err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void**)&result_byte_arr->data); if (err != kTvmErrorNoError) { goto release_and_return; } result_byte_arr->size = data_size; + + // skip first time call, to activate lazy compilation components. 
+ err = TVMFuncCall(g_time_evaluator_state.func_to_time, args, type_codes, num_args, ret_val, + ret_type_code); + if (err != kTvmErrorNoError) { + goto release_and_return; + } + double min_repeat_seconds = ((double)g_time_evaluator_state.min_repeat_ms) / 1000; double* iter = (double*)result_byte_arr->data; for (int i = 0; i < g_time_evaluator_state.repeat; i++) { - double repeat_res_seconds = 0.0; - int exec_count = 0; + double curr_res_seconds = 0.0; // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0). do { + if (curr_res_seconds > 0.0) { + g_time_evaluator_state.number = (int64_t)( + fmax((min_repeat_seconds / (curr_res_seconds / g_time_evaluator_state.number) + 1), + g_time_evaluator_state.number * 1.618)); // 1.618 is chosen by random + } err = TVMPlatformBeforeMeasurement(); if (err != kTvmErrorNoError) { goto release_and_return; } - err = TVMPlatformTimerStart(); if (err != kTvmErrorNoError) { goto release_and_return; @@ -557,23 +580,31 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* goto release_and_return; } } - exec_count += g_time_evaluator_state.number; - - double curr_res_seconds; err = TVMPlatformTimerStop(&curr_res_seconds); if (err != kTvmErrorNoError) { goto release_and_return; } - repeat_res_seconds += curr_res_seconds; - err = TVMPlatformAfterMeasurement(); if (err != kTvmErrorNoError) { goto release_and_return; } - } while (repeat_res_seconds < min_repeat_seconds); - double mean_exec_seconds = repeat_res_seconds / exec_count; + } while (curr_res_seconds < min_repeat_seconds); + double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number; *iter = mean_exec_seconds; iter++; + if (g_time_evaluator_state.cooldown_interval_ms > 0 && + (i % g_time_evaluator_state.repeats_to_cooldown) == 0) { +#if defined(_WIN32) || defined(WIN32) + Sleep(g_time_evaluator_state.cooldown_interval_ms); +#elif __unix__ + usleep(g_time_evaluator_state.cooldown_interval_ms * 1000); +#else + TVMAPIErrorf( + "No support for non-zero cooldown_interval_ms for this platform: Use " + "cooldown_interval_ms = 0"); + goto release_and_return; +#endif + } } *ret_type_code = kTVMBytes; @@ -582,9 +613,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* release_and_return : { tvm_crt_error_t release_err = - TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_dev); + TVMPlatformMemoryFree((void*)result_byte_arr->data, result_byte_dev); if (release_err != kTvmErrorNoError) { - release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_dev); + release_err = TVMPlatformMemoryFree((void*)result_byte_arr, result_byte_dev); } if (err == kTvmErrorNoError && release_err != kTvmErrorNoError) { diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 4a950153954ff..0a8e407cd004a 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -28,6 +28,7 @@ #include #include +#include #include #include "../../rpc/rpc_session.h" @@ -55,48 +56,54 @@ class GraphExecutorDebug : public GraphExecutor { * By default, one `repeat` contains `number` runs. If this parameter is set, * the parameters `number` will be dynamically adjusted to meet the * minimum duration requirement of one `repeat`. 
-   * \return Comma seperated string containing the elapsed time per op for the last
-   *         iteration only, because returning a long string over rpc can be expensive.
+   * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
+   *        defined by `repeats_to_cooldown`.
+   * \param repeats_to_cooldown The number of repeats before the
+   *        cooldown is activated.
+   * \return Comma separated string containing the elapsed time per op for
+   *         the last iteration only, because returning a long string over rpc can be expensive.
    */
-  std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
+  std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms,
+                            int repeats_to_cooldown) {
     // warmup run
     GraphExecutor::Run();
     std::string tkey = module_->type_key();
 
-    std::vector<double> time_sec_per_op(op_execs_.size(), 0);
+    std::vector<std::vector<double>> time_sec_per_op(op_execs_.size());
     if (tkey == "rpc") {
       // RPC modules rely on remote timing which implements the logic from the else branch.
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+        time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms,
+                                          cooldown_interval_ms, repeats_to_cooldown);
       }
     } else {
+      int op = 0;
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
-        for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
-          time_sec_per_op[index] = results[cur_repeat];
-
-          LOG(INFO) << "Iteration: " << cur_repeat;
-          int op = 0;
-          if (op_execs_[index]) {
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
-                      << time_sec_per_op[index] * 1e6 << " us/iter";
+        time_sec_per_op[index] = RunIndividualNode(index, number, repeat, min_repeat_ms,
+                                                   cooldown_interval_ms, repeats_to_cooldown);
+        if (op_execs_[index]) {
+          LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
+          for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
+            const auto& data = time_sec_per_op[index][cur_repeat];
+            LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter";
           }
+          ++op;
         }
       }
     }
 
     std::ostringstream os;
     for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-      double time = time_sec_per_op[index];
-      // To have good behavior when calculating total time, etc.
-      if (std::isnan(time)) {
-        time = 0;
+      for (const auto& repeat_data : time_sec_per_op[index]) {
+        // To have good behavior when calculating total time, etc.
+        os << (std::isnan(repeat_data) ? std::to_string(0) : std::to_string(repeat_data)) << ",";
       }
-      os << time << ",";
+      os << ";";
     }
     return os.str();
   }
 
-  std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
+  std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms,
+                                        int cooldown_interval_ms, int repeats_to_cooldown) {
     std::string tkey = module_->type_key();
 
     // results_in_seconds[a][b] is the bth index run of the ath index repeat
@@ -115,17 +122,18 @@ class GraphExecutorDebug : public GraphExecutor {
     Device& d = devices_[0];
     PackedFunc time_evaluator = profiling::WrapTimeEvaluator(
         TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
-        repeat, min_repeat_ms);
-    std::string result = time_evaluator();
-    const double* results_arr = reinterpret_cast<const double*>(result.data());
-    size_t double_bytes = sizeof(double);
-    for (size_t i = 0; i < result.size() / double_bytes; i++) {
-      results_in_seconds[i] = results_arr[i];
+        repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown);
+    std::string result_str = time_evaluator();
+    const double* blob_ptr = reinterpret_cast<const double*>(result_str.data());
+    for (int i = 0; i < repeat; ++i, ++blob_ptr) {
+      results_in_seconds[i] = *blob_ptr;
     }
     return results_in_seconds;
   }
 
-  double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
+  std::vector<double> RunOpRPC(int index, int number, int repeat, int min_repeat_ms,
+                               int cooldown_interval_ms, int repeats_to_cooldown) {
+    std::vector<double> results(repeat, 0);
     // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
     // which represent inputs/parameters to the graph. Other types may be supported in the
     // future, but consideration would be needed as to how to do that over RPC before we support
     // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or
     // "null" and "null" is a placeholder node for a parameter or input.
- return 0; + return results; } if (nodes_[index].param.func_name == "__nop") { LOG_INFO << "Skipping __nop function"; - return 0; + return results; } const Device& dev = data_entry_[entry_id(index, 0)]->device; @@ -151,10 +159,11 @@ class GraphExecutorDebug : public GraphExecutor { uint32_t num_inputs = param.num_inputs; uint32_t num_outputs = param.num_outputs; - PackedFunc time_eval = runtime::Registry::Get("runtime.RPCTimeEvaluator") - -> - operator()(module_, name, static_cast(dev.device_type), - dev.device_id, number, repeat, min_repeat_ms, ""); + PackedFunc time_eval = + runtime::Registry::Get("runtime.RPCTimeEvaluator") + -> + operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -176,10 +185,18 @@ class GraphExecutorDebug : public GraphExecutor { } TVMRetValue rv; time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv); - std::string results = rv.operator std::string(); - const double* results_arr = reinterpret_cast(results.data()); - LOG(INFO) << "Got op timing: " << results_arr[0]; - return results_arr[0]; + std::string results_str = rv.operator std::string(); + const double* blob_ptr = reinterpret_cast(results_str.data()); + for (int i = 0; i < repeat; ++i, ++blob_ptr) { + results[i] = *blob_ptr; + } + + std::ostringstream os; + for (auto& repeat_data : results) { + os << std::to_string(repeat_data) + ", "; + } + LOG(WARNING) << "Got op timing: " << os.str(); + return results; } Timer RunOpHost(int index) { @@ -369,21 +386,29 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; + int cooldown_interval_ms = args[3]; + int repeats_to_cooldown = args[4]; ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - *rv = this->RunIndividual(number, repeat, min_repeat_ms); + ICHECK_GE(cooldown_interval_ms, 0); + ICHECK_GT(repeats_to_cooldown, 0); + *rv = this->RunIndividual(number, repeat, min_repeat_ms, cooldown_interval_ms, + repeats_to_cooldown); }); } else if (name == "run_individual_node") { - return TypedPackedFunc( - [sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) { + return TypedPackedFunc( + [sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms, + int cooldown_interval_ms, int repeats_to_cooldown) { ICHECK_GE(node_index, 0); ICHECK_LT(node_index, nodes_.size()); ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - std::vector results = - this->RunIndividualNode(node_index, number, repeat, min_repeat_ms); + ICHECK_GE(cooldown_interval_ms, 0); + ICHECK_GT(repeats_to_cooldown, 0); + std::vector results = this->RunIndividualNode( + node_index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown); // Have problems returning FloatImm so serialize to string results as hack. 
std::stringstream s; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 478ec181e899f..786b83e6b011d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -485,13 +485,11 @@ TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* *rv = static_cast(ptr); }); -#ifdef USE_PROFILER TVM_REGISTER_OBJECT_TYPE(OpenCLTimerNode); TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object(dev)); }); -#endif } // namespace cl } // namespace runtime diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9f95bf18f74b2..8b9b7cde21ab6 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -36,6 +36,7 @@ #include #include #include +#include namespace tvm { namespace runtime { @@ -835,6 +836,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") }); PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, + int cooldown_interval_ms, int repeats_to_cooldown, PackedFunc f_preproc) { ICHECK(pf != nullptr); @@ -844,8 +846,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, - TVMRetValue* rv) mutable { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, + f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. @@ -865,9 +867,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, number * 1.618)); // 1.618 is chosen by random } - Timer t = Timer::Start(dev); // start timing - for (int i = 0; i < number; ++i) { + Timer t = Timer::Start(dev); + for (int j = 0; j < number; ++j) { pf.CallPacked(args, &temp); } t->Stop(); @@ -877,6 +879,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); + + if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms)); + } } std::string blob = os.str(); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 8e558fb6278e3..ff58895005923 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -26,8 +26,10 @@ #include #include +#include #include #include +#include #if defined(_M_X64) || defined(__x86_64__) #include #endif @@ -189,7 +191,8 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, - int min_repeat_ms, const std::string& f_preproc_name) { + int min_repeat_ms, int cooldown_interval_ms, int repeats_to_cooldown, + const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. 
     ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index())
@@ -197,13 +200,13 @@ class RPCModuleNode final : public ModuleNode {
     dev = RemoveRPCSessionMask(dev);
 
     if (module_handle_ != nullptr) {
-      return remote_get_time_evaluator_(GetRef<Module>(this), name,
-                                        static_cast<int>(dev.device_type), dev.device_id, number,
-                                        repeat, min_repeat_ms, f_preproc_name);
+      return remote_get_time_evaluator_(
+          GetRef<Module>(this), name, static_cast<int>(dev.device_type), dev.device_id, number,
+          repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
     } else {
-      return remote_get_time_evaluator_(Optional<Module>(nullptr), name,
-                                        static_cast<int>(dev.device_type), dev.device_id, number,
-                                        repeat, min_repeat_ms, f_preproc_name);
+      return remote_get_time_evaluator_(
+          Optional<Module>(nullptr), name, static_cast<int>(dev.device_type), dev.device_id, number,
+          repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
     }
   }
 
@@ -241,7 +244,8 @@ class RPCModuleNode final : public ModuleNode {
   // The local channel
   std::shared_ptr<RPCSession> sess_;
   // remote function to get time evaluator
-  TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, std::string)>
+  TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, int, int,
+                             std::string)>
       remote_get_time_evaluator_;
   // remote function getter for modules.
   TypedPackedFunc remote_mod_get_function_;
@@ -359,7 +363,8 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) {
 
 TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
     .set_body_typed([](Optional<Module> opt_mod, std::string name, int device_type, int device_id,
-                       int number, int repeat, int min_repeat_ms, std::string f_preproc_name) {
+                       int number, int repeat, int min_repeat_ms, int cooldown_interval_ms,
+                       int repeats_to_cooldown, std::string f_preproc_name) {
       Device dev;
       dev.device_type = static_cast<DLDeviceType>(device_type);
       dev.device_id = device_id;
@@ -368,7 +373,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
         std::string tkey = m->type_key();
         if (tkey == "rpc") {
           return static_cast<RPCModuleNode*>(m.operator->())
-              ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, f_preproc_name);
+              ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                                 repeats_to_cooldown, f_preproc_name);
         } else {
           PackedFunc f_preproc;
           if (!f_preproc_name.empty()) {
@@ -379,7 +385,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
          }
          PackedFunc pf = m.GetFunction(name, true);
          CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
-         return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, f_preproc);
+         return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms,
+                                             cooldown_interval_ms, repeats_to_cooldown, f_preproc);
        }
      } else {
        auto* pf = runtime::Registry::Get(name);
@@ -391,7 +398,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
              << "Cannot find " << f_preproc_name << " in the global function";
          f_preproc = *pf_preproc;
        }
-       return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, f_preproc);
+       return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms,
+                                           cooldown_interval_ms, repeats_to_cooldown, f_preproc);
      }
    });
 
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
index 904e5820e3ece..64a88a6c59434 100644
--- a/tests/python/unittest/test_runtime_graph_debug.py
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -151,6 +151,7 @@ def split_debug_line(i):
         "Shape",
         "Inputs",
         "Outputs",
+        "Measurements",
     ]
     myadd_lines = split_debug_line(2)
     assert myadd_lines[0] == "add"
@@ -250,6 +251,22 @@ def test_run_single_node(graph, n, A, myadd):
     elapsed_time_in_seconds = end - start
     assert elapsed_time_in_seconds >= 0.5
 
+    # Setting `cooldown_interval_ms` should increase the execution time
+    start = time.time()
+    mod.run_individual_node(1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000)
+    end = time.time()
+    elapsed_time_in_seconds = end - start
+    assert elapsed_time_in_seconds >= 1.5
+
+    # With `repeats_to_cooldown` set, cooldown should not be triggered after every repeat
+    start = time.time()
+    mod.run_individual_node(
+        1, repeat=10, min_repeat_ms=50, cooldown_interval_ms=1000, repeats_to_cooldown=5
+    )
+    end = time.time()
+    elapsed_time_in_seconds = end - start
+    assert elapsed_time_in_seconds >= 1.5 and elapsed_time_in_seconds <= 9.5
+
     # Going out of bounds of node index throws a tvm error
     with pytest.raises(TVMError):
         mod.run_individual_node(2)
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index 5a5c9361fc923..8ea5d954deca7 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -168,8 +168,8 @@ class AsyncLocalSession : public LocalSession {
       // special handle time evaluator.
       try {
         TVMArgs args(arg_values, arg_type_codes, num_args);
-        PackedFunc retfunc =
-            this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4], args[5], args[6]);
+        PackedFunc retfunc = this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4],
+                                                    args[5], args[6], args[7], args[8]);
         TVMRetValue rv;
         rv = retfunc;
         this->EncodeReturn(std::move(rv), [&](TVMArgs encoded_args) {
@@ -251,7 +251,8 @@ class AsyncLocalSession : public LocalSession {
   // time evaluator
   PackedFunc GetTimeEvaluator(Optional<Module> opt_mod, std::string name, int device_type,
-                              int device_id, int number, int repeat, int min_repeat_ms) {
+                              int device_id, int number, int repeat, int min_repeat_ms,
+                              int cooldown_interval_ms, int repeats_to_cooldown) {
     Device dev;
     dev.device_type = static_cast<DLDeviceType>(device_type);
     dev.device_id = device_id;
@@ -259,18 +260,22 @@ class AsyncLocalSession : public LocalSession {
     if (opt_mod.defined()) {
       Module m = opt_mod.value();
       std::string tkey = m->type_key();
-      return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms);
+      return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms,
+                                   cooldown_interval_ms, repeats_to_cooldown);
     } else {
       auto* pf = runtime::Registry::Get(name);
       CHECK(pf != nullptr) << "Cannot find " << name << " in the global function";
-      return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms);
+      return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                                   repeats_to_cooldown);
     }
   }
 
   // time evaluator
   PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
-                                   int min_repeat_ms) {
-    auto ftimer = [pf, dev, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) {
+                                   int min_repeat_ms, int cooldown_interval_ms,
+                                   int repeats_to_cooldown) {
+    auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                   repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) {
       // the function is a async function.
PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -288,7 +293,7 @@ class AsyncLocalSession : public LocalSession { auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, - on_complete); + cooldown_interval_ms, repeats_to_cooldown, on_complete); }; return PackedFunc(ftimer); } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 3d4745367308d..78747f9708730 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1057,7 +1057,9 @@ export class Instance implements Disposable { dev: DLDevice, nstep: number, repeat: number, - minRepeatMs: number + minRepeatMs: number, + cooldownIntervalMs: number, + repeatsToCooldown: number ): Promise => { finvoke(this.scalar(1, "int32")); await dev.sync(); @@ -1081,6 +1083,9 @@ export class Instance implements Disposable { } while (durationMs < minRepeatMs); const speed = durationMs / setupNumber / 1000; result.push(speed); + if (cooldownIntervalMs > 0.0 && (i % repeatsToCooldown) == 0 ) { + await new Promise(r => setTimeout(r, cooldownIntervalMs)); + } } const ret = new Float64Array(result.length); ret.set(result);
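
Reviewer note, not part of the patch: the sketch below shows one way the new cooldown knobs could be exercised from Python once this change lands. It is a hypothetical example; `gmod`, `dev`, `graph_json`, and `lib` are assumed to exist already (a built tvm.contrib.graph_executor.GraphModule, the device it was built for, and the graph JSON plus compiled library for the debug executor), and the chosen numbers are illustrative only.

    # Whole-graph timing through Module.time_evaluator: 20 repeats of 10 runs,
    # sleeping 500 ms after every 5th repeat (parameters added in this diff).
    ftimer = gmod.module.time_evaluator(
        "run", dev, number=10, repeat=20, min_repeat_ms=0,
        cooldown_interval_ms=500, repeats_to_cooldown=5,
    )
    print(ftimer().mean)

    # GraphModule.benchmark forwards the same two keyword arguments.
    print(gmod.benchmark(dev, repeat=20, number=10,
                         cooldown_interval_ms=500, repeats_to_cooldown=5))

    # Debug executor: run_individual now returns a 2-D list (node x repeat),
    # so aggregate per node explicitly, e.g. with a mean.
    # from tvm.contrib.debugger import debug_executor
    # dbg = debug_executor.create(graph_json, lib, dev)
    # times = dbg.run_individual(number=10, repeat=2, min_repeat_ms=0,
    #                            cooldown_interval_ms=1000, repeats_to_cooldown=1)
    # per_node_mean = [sum(t) / len(t) for t in times]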