Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cooldown interval logic for the profiling functional #11465

Merged
merged 3 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ endif()
if(USE_PROFILER)
message(STATUS "Build with profiler...")

add_definitions(-DUSE_PROFILER=1)
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
Expand Down
27 changes: 26 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,25 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i

/*!
* \brief Wrap a timer function to measure the time cost of a given packed function.
*
* Approximate implementation:
* \code{.py}
* f() // warmup
* for i in range(repeat)
* f_preproc()
* while True:
* start = time()
* for j in range(number):
* f()
* duration_ms = time() - start
* if duration_ms >= min_repeat_ms:
* break
* else:
 * number = (min_repeat_ms / (duration_ms / number)) + 1
* if cooldown_interval_ms and i % repeats_to_cooldown == 0:
* sleep(cooldown_interval_ms)
* \endcode
*
* \param f The function argument.
* \param dev The device.
* \param number The number of times to run this function for taking average.
Expand All @@ -554,10 +573,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
 * \param f_preproc The function to be executed before we execute time evaluator.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
Icemist marked this conversation as resolved.
Show resolved Hide resolved
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);

} // namespace profiling
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/auto_scheduler/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
139 changes: 122 additions & 17 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,19 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self):
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
self.debug_datum._time_list = self.run_individual(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)

# Get outputs.
self._run_per_layer()
Expand Down Expand Up @@ -259,31 +265,123 @@ def debug_get_output(self, node, out=None):

self._debug_get_output(node_index, out)

def run(self, **input_dict):
# pylint: disable=arguments-differ
def run(
self,
number=10,
repeat=1,
min_repeat_ms=1,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**input_dict,
):
"""Run forward execution of the graph with debug
Parameters
----------
number: int, optional
The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
repeat: int, optional
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
input_dict : dict of str to NDArray
List of input values to be feed to
"""
if input_dict:
self.set_input(**input_dict)

# Step 1. Execute the graph
self._run_debug()
self._run_debug(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
# Step 2. Dump the output tensors to the dump folder
self.debug_datum.dump_output_tensor()
# Step 3. Dump the Chrome trace to the dump folder
self.debug_datum.dump_chrome_trace()
# Step 4. Display the collected information
self.debug_datum.display_debug_result()

def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
def run_individual(
self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
):
"""Run each operation in the graph and get the time per op for all ops.
number: int
The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
repeat: int, optional
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
Returns
-------
A 2-dimensional array where the dimensions are: the index of the operation and
the repeat of the measurement.
"""
ret = self._run_individual(
number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
results = []
for node_data in ret.strip(";").split(";"):
results.append([])
for repeat_data in node_data.strip(",").split(","):
if repeat_data:
results[-1].append(float(repeat_data))
return results

def run_individual_node(
self,
index,
number=10,
repeat=1,
min_repeat_ms=0,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Benchmark a single node in the serialized graph.
This does not do any data transfers and uses arrays already on the device.
Expand All @@ -304,27 +402,34 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
min_repeat_ms : int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
Returns
-------
A module BenchmarkResult
"""
# Results are returned as serialized strings which we deserialize
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
answer = []
for value in ret.split(","):
if value.strip() == "":
continue
answer.append(float(value))

return BenchmarkResult(answer)
ret = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
results = []
for repeat_data in ret.replace(" ", "").strip(",").split(","):
if repeat_data:
results.append(float(repeat_data))
return BenchmarkResult(results)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
Expand Down
42 changes: 30 additions & 12 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
def get_output_tensors(self):
"""Get the output tensors of each operation in numpy format"""
eid = 0
order = 0
output_tensors = {}
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
for i, node in enumerate(self._nodes_list):
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
order += time[0]

# the node name is not unique, so we need a consistent
# indexing based on the list ordering in the nodes
Expand Down Expand Up @@ -157,7 +155,7 @@ def s_to_us(t):
return t * 10**6

starting_times = np.zeros(len(self._time_list) + 1)
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list])

def node_to_events(node, times, starting_time):
return [
Expand All @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time):
),
ChromeTraceEvent(
# Use start + duration instead of end to ensure precise timings.
ts=s_to_us(times[0] + starting_time),
ts=s_to_us(np.mean(times) + starting_time),
tid=1,
pid=1,
ph="E",
Expand Down Expand Up @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph):

def get_debug_result(self, sort_by_time=True):
"""Return the debugger result"""
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
header = [
"Node Name",
"Ops",
"Time(us)",
"Time(%)",
"Shape",
"Inputs",
"Outputs",
"Measurements(us)",
]
lines = [
"---------",
"---",
"--------",
"-------",
"-----",
"------",
"-------",
"----------------",
]
eid = 0
data = []
total_time = sum(time[0] for time in self._time_list)
total_time = sum([np.mean(time) for time in self._time_list])
for node, time in zip(self._nodes_list, self._time_list):
time_mean = np.mean(time)
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
op = node["op"]
Expand All @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True):
continue
name = node["name"]
shape = str(self._output_tensor_list[eid].shape)
time_us = round(time[0] * 1e6, 3)
time_percent = round(((time[0] / total_time) * 100), 3)
time_us = round(time_mean * 1e6, 3)
time_percent = round(((time_mean / total_time) * 100), 3)
inputs = str(node["attrs"]["num_inputs"])
outputs = str(node["attrs"]["num_outputs"])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements]
data.append(node_data)
eid += 1

Expand All @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True):
data = sorted(data, key=lambda x: x[2], reverse=True)
# Insert a row for total time at the end.
rounded_total_time_us = round(total_time * 1e6, 3)
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"])

fmt = ""
for i, _ in enumerate(header):
Expand Down
19 changes: 17 additions & 2 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def benchmark(
number=5,
min_repeat_ms=None,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Expand Down Expand Up @@ -395,7 +397,7 @@ def benchmark(
`number` should be increased when the runtime of the function is small (less than a 1/10
of a millisecond).
min_repeat_ms : Optional[float]
min_repeat_ms : Optional[int]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
Expand All @@ -405,6 +407,13 @@ def benchmark(
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
cooldown_interval_ms: Optional[int]
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: Optional[int]
The number of repeats before the cooldown is activated.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
Expand Down Expand Up @@ -432,5 +441,11 @@ def benchmark(
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
func_name,
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)()
Loading