diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6931b40c667d9..0f26e790bfebd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -394,7 +394,6 @@ endif()
 
 if(USE_PROFILER)
   message(STATUS "Build with profiler...")
-  add_definitions(-DUSE_PROFILER=1)
   tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
   set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 83c26933be45b..be14d6fa2ddc4 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -554,10 +554,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
  *        minimum duration requirement of one `repeat`.
  *        i.e., When the run time of one `repeat` falls below this time,
  *        the `number` parameter will be automatically increased.
- * \param f_preproc The function to be executed before we excetute time evaluator.
+ * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
+ *        defined by `repeats_to_cooldown`.
+ * \param repeats_to_cooldown The number of repeats before the
+ *        cooldown is activated.
+ * \param f_preproc The function to be executed before we execute the time
+ *        evaluator.
  * \return f_timer A timer function.
  */
 PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
+                             int cooldown_interval_ms, int repeats_to_cooldown,
                              PackedFunc f_preproc = nullptr);
 
 }  // namespace profiling
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index 48ed44ef19b78..ac79007539637 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
         graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
         print("|graph_nodes| = ", len(graph_nodes))
         print("|graph_time| = ", len(graph_time))
-        graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
+        graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
         for k, v in graph_nodes_time.items():
             print(f"{k} : {v:.3f}")
 
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index f144b3cb4a824..202dd0e39139d 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -222,13 +222,19 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)
 
-    def _run_debug(self):
+    def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
         """Execute the node specified with index will be executed.
         Each debug output will be copied to the buffer
         Time consumed for each execution will be set as debug output.
         """
         # Get timing.
-        self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
+        self.debug_datum._time_list = self.run_individual(
+            number=number,
+            repeat=repeat,
+            min_repeat_ms=min_repeat_ms,
+            cooldown_interval_ms=cooldown_interval_ms,
+            repeats_to_cooldown=repeats_to_cooldown,
+        )
         # Get outputs.
self._run_per_layer() @@ -259,11 +265,46 @@ def debug_get_output(self, node, out=None): self._debug_get_output(node_index, out) - def run(self, **input_dict): + # pylint: disable=arguments-differ + def run( + self, + number=10, + repeat=1, + min_repeat_ms=1, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + **input_dict, + ): """Run forward execution of the graph with debug Parameters ---------- + number: int, optional + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. + + repeat: int, optional + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. + + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + input_dict : dict of str to NDArray List of input values to be feed to """ @@ -271,7 +312,13 @@ def run(self, **input_dict): self.set_input(**input_dict) # Step 1. Execute the graph - self._run_debug() + self._run_debug( + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, + ) # Step 2. Dump the output tensors to the dump folder self.debug_datum.dump_output_tensor() # Step 3. Dump the Chrome trace to the dump folder @@ -279,11 +326,62 @@ def run(self, **input_dict): # Step 4. Display the collected information self.debug_datum.display_debug_result() - def run_individual(self, number, repeat=1, min_repeat_ms=0): - ret = self._run_individual(number, repeat, min_repeat_ms) - return ret.strip(",").split(",") if ret else [] + def run_individual( + self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1 + ): + """Run each operation in the graph and get the time per op for all ops. + + number: int + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. - def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): + repeat: int, optional + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. 
+ + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + + Returns + ------- + A 2-dimensional array where the dimensions are: the index of the operation and + the repeat of the measurement. + """ + ret = self._run_individual( + number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + ) + results = [] + for node_data in ret.strip(";").split(";"): + results.append([]) + for repeat_data in node_data.strip(",").split(","): + if repeat_data: + results[-1].append(float(repeat_data)) + return results + + def run_individual_node( + self, + index, + number=10, + repeat=1, + min_repeat_ms=0, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + ): """Benchmark a single node in the serialized graph. This does not do any data transfers and uses arrays already on the device. @@ -304,7 +402,7 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): The returned result contains `repeat` costs, each of which is an average of `number` costs. - min_repeat_ms: int, optional + min_repeat_ms : int, optional The minimum duration of one `repeat` in milliseconds. By default, one `repeat` contains `number` runs. If this parameter is set, the parameters `number` will be dynamically adjusted to meet the @@ -312,19 +410,26 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0): i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. 
+ Returns ------- A module BenchmarkResult """ # Results are returned as serialized strings which we deserialize - ret = self._run_individual_node(index, number, repeat, min_repeat_ms) - answer = [] - for value in ret.split(","): - if value.strip() == "": - continue - answer.append(float(value)) - - return BenchmarkResult(answer) + ret = self._run_individual_node( + index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + ) + results = [] + for repeat_data in ret.replace(" ", "").strip(",").split(","): + if repeat_data: + results.append(float(repeat_data)) + return BenchmarkResult(results) def profile(self, collectors=None, **input_dict): """Run forward execution of the graph and collect overall and per-op diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 8185391e35519..006edd3458025 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self): def get_output_tensors(self): """Get the output tensors of each operation in numpy format""" eid = 0 - order = 0 output_tensors = {} - for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)): + for i, node in enumerate(self._nodes_list): num_outputs = self.get_graph_node_output_num(node) for j in range(num_outputs): - order += time[0] # the node name is not unique, so we need a consistent # indexing based on the list ordering in the nodes @@ -157,7 +155,7 @@ def s_to_us(t): return t * 10**6 starting_times = np.zeros(len(self._time_list) + 1) - starting_times[1:] = np.cumsum([times[0] for times in self._time_list]) + starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list]) def node_to_events(node, times, starting_time): return [ @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time): ), ChromeTraceEvent( # Use start + duration instead of end to ensure precise timings. 
- ts=s_to_us(times[0] + starting_time), + ts=s_to_us(np.mean(times) + starting_time), tid=1, pid=1, ph="E", @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph): def get_debug_result(self, sort_by_time=True): """Return the debugger result""" - header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"] - lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"] + header = [ + "Node Name", + "Ops", + "Time(us)", + "Time(%)", + "Shape", + "Inputs", + "Outputs", + "Measurements(us)", + ] + lines = [ + "---------", + "---", + "--------", + "-------", + "-----", + "------", + "-------", + "----------------", + ] eid = 0 data = [] - total_time = sum(time[0] for time in self._time_list) + total_time = sum([np.mean(time) for time in self._time_list]) for node, time in zip(self._nodes_list, self._time_list): + time_mean = np.mean(time) num_outputs = self.get_graph_node_output_num(node) for j in range(num_outputs): op = node["op"] @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True): continue name = node["name"] shape = str(self._output_tensor_list[eid].shape) - time_us = round(time[0] * 1e6, 3) - time_percent = round(((time[0] / total_time) * 100), 3) + time_us = round(time_mean * 1e6, 3) + time_percent = round(((time_mean / total_time) * 100), 3) inputs = str(node["attrs"]["num_inputs"]) outputs = str(node["attrs"]["num_outputs"]) - node_data = [name, op, time_us, time_percent, shape, inputs, outputs] + measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time]) + node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements] data.append(node_data) eid += 1 @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True): data = sorted(data, key=lambda x: x[2], reverse=True) # Insert a row for total time at the end. rounded_total_time_us = round(total_time * 1e6, 3) - data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"]) + data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"]) fmt = "" for i, _ in enumerate(header): diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index e8e2798ef7346..a4b90baf1d380 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -356,6 +356,8 @@ def benchmark( number=5, min_repeat_ms=None, end_to_end=False, + cooldown_interval_ms=0, + repeats_to_cooldown=1, **kwargs, ): """Calculate runtime of a function by repeatedly calling it. @@ -395,7 +397,7 @@ def benchmark( `number` should be increased when the runtime of the function is small (less than a 1/10 of a millisecond). - min_repeat_ms : Optional[float] + min_repeat_ms : Optional[int] If set, the inner loop will be run until it takes longer than `min_repeat_ms` milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. @@ -405,6 +407,13 @@ def benchmark( returned tensors in the total runtime. This will give accurate timings for end to end workloads. + cooldown_interval_ms: Optional[int] + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: Optional[int] + The number of repeats before the cooldown is activated. + kwargs : Dict[str, Object] Named arguments to the function. These are cached before running timing code, so that data transfer costs are not counted in the runtime. 
@@ -432,5 +441,11 @@ def benchmark( if kwargs: self.set_input(**kwargs) return self.module.time_evaluator( - func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms + func_name, + device, + repeat=repeat, + number=number, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, )() diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index 8663eb460c4a8..0e47be865b106 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -204,7 +204,7 @@ def f_per_layer(rt_mod, dev, input_data): graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) print("|graph_nodes| = ", len(graph_nodes)) print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} + graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} for k, v in graph_nodes_time.items(): print(f"{k} : {v:.3f}") diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index c614e5d757c3f..6123f33161711 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -270,7 +270,17 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): + def time_evaluator( + self, + func_name, + dev, + number=10, + repeat=1, + min_repeat_ms=0, + cooldown_interval_ms=0, + repeats_to_cooldown=1, + f_preproc="", + ): """Get an evaluator that measures time cost of running function. Parameters @@ -299,6 +309,14 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + + cooldown_interval_ms: int, optional + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: int, optional + The number of repeats before the cooldown is activated. + f_preproc: str, optional The preprocess function name we want to execute before executing the time evaluator. @@ -322,6 +340,8 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f number, repeat, min_repeat_ms, + cooldown_interval_ms, + repeats_to_cooldown, f_preproc, ) diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 6e59c3455a914..83f1656a0dd80 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -584,6 +584,8 @@ def benchmark( number=5, min_repeat_ms=None, end_to_end=False, + cooldown_interval_ms=0, + repeats_to_cooldown=1, **kwargs, ): """Calculate runtime of a function by repeatedly calling it. @@ -623,7 +625,7 @@ def benchmark( `number` should be increased when the runtime of the function is small (less than a 1/10 of a millisecond). - min_repeat_ms : Optional[float] + min_repeat_ms : Optional[int] If set, the inner loop will be run until it takes longer than `min_repeat_ms` milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. @@ -633,6 +635,13 @@ def benchmark( returned tensors in the total runtime. This will give accurate timings for end to end workloads. 
+ cooldown_interval_ms: Optional[int] + The cooldown interval in milliseconds between the number of repeats defined by + `repeats_to_cooldown`. + + repeats_to_cooldown: Optional[int] + The number of repeats before the cooldown is activated. + args : Sequence[Object] Arguments to the function. These are cached before running timing code, so that data transfer costs are not counted in the runtime. @@ -667,5 +676,11 @@ def benchmark( if args or kwargs: self.set_input(func_name, *args, **kwargs) return self.module.time_evaluator( - "invoke", device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms + "invoke", + device, + repeat=repeat, + number=number, + min_repeat_ms=min_repeat_ms, + cooldown_interval_ms=cooldown_interval_ms, + repeats_to_cooldown=repeats_to_cooldown, )(func_name) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index a8a17041f5eab..1a27b3db7ef8d 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,12 @@ #include #include +#if defined(_WIN32) || defined(WIN32) +#include +#elif __unix__ +#include +#endif + // Handle internal errors static char g_last_error[1024]; @@ -471,6 +478,8 @@ typedef struct { int number; int repeat; int min_repeat_ms; + int cooldown_interval_ms; + int repeats_to_cooldown; } time_evaluator_state_t; static time_evaluator_state_t g_time_evaluator_state; @@ -479,13 +488,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re int* ret_type_code) { ret_val[0].v_handle = NULL; ret_type_code[0] = kTVMNullptr; - if (num_args < 8) { + if (num_args < 10) { TVMAPIErrorf("not enough args"); return kTvmErrorFunctionCallNumArguments; } if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr || type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt || - type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMStr) { + type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt || + type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) { TVMAPIErrorf("one or more invalid arg types"); return kTvmErrorFunctionCallWrongArgType; } @@ -497,6 +507,8 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; + g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64; + g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64; int ret_code = TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time); @@ -528,23 +540,34 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* } result_byte_arr->data = NULL; size_t data_size = sizeof(double) * g_time_evaluator_state.repeat; - err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void*)&result_byte_arr->data); + err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void**)&result_byte_arr->data); if (err != kTvmErrorNoError) { goto release_and_return; } result_byte_arr->size = data_size; + + // skip first time call, to activate lazy compilation components. 
+ err = TVMFuncCall(g_time_evaluator_state.func_to_time, args, type_codes, num_args, ret_val, + ret_type_code); + if (err != kTvmErrorNoError) { + goto release_and_return; + } + double min_repeat_seconds = ((double)g_time_evaluator_state.min_repeat_ms) / 1000; double* iter = (double*)result_byte_arr->data; for (int i = 0; i < g_time_evaluator_state.repeat; i++) { - double repeat_res_seconds = 0.0; - int exec_count = 0; + double curr_res_seconds = 0.0; // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0). do { + if (curr_res_seconds > 0.0) { + g_time_evaluator_state.number = (int64_t)( + fmax((min_repeat_seconds / (curr_res_seconds / g_time_evaluator_state.number) + 1), + g_time_evaluator_state.number * 1.618)); // 1.618 is chosen by random + } err = TVMPlatformBeforeMeasurement(); if (err != kTvmErrorNoError) { goto release_and_return; } - err = TVMPlatformTimerStart(); if (err != kTvmErrorNoError) { goto release_and_return; @@ -557,23 +580,31 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* goto release_and_return; } } - exec_count += g_time_evaluator_state.number; - - double curr_res_seconds; err = TVMPlatformTimerStop(&curr_res_seconds); if (err != kTvmErrorNoError) { goto release_and_return; } - repeat_res_seconds += curr_res_seconds; - err = TVMPlatformAfterMeasurement(); if (err != kTvmErrorNoError) { goto release_and_return; } - } while (repeat_res_seconds < min_repeat_seconds); - double mean_exec_seconds = repeat_res_seconds / exec_count; + } while (curr_res_seconds < min_repeat_seconds); + double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number; *iter = mean_exec_seconds; iter++; + if (g_time_evaluator_state.cooldown_interval_ms > 0 && + (i % g_time_evaluator_state.repeats_to_cooldown) == 0) { +#if defined(_WIN32) || defined(WIN32) + Sleep(g_time_evaluator_state.cooldown_interval_ms); +#elif __unix__ + usleep(g_time_evaluator_state.cooldown_interval_ms * 1000); +#else + TVMAPIErrorf( + "No support for non-zero cooldown_interval_ms for this platform: Use " + "cooldown_interval_ms = 0"); + goto release_and_return; +#endif + } } *ret_type_code = kTVMBytes; @@ -582,9 +613,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* release_and_return : { tvm_crt_error_t release_err = - TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_dev); + TVMPlatformMemoryFree((void*)result_byte_arr->data, result_byte_dev); if (release_err != kTvmErrorNoError) { - release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_dev); + release_err = TVMPlatformMemoryFree((void*)result_byte_arr, result_byte_dev); } if (err == kTvmErrorNoError && release_err != kTvmErrorNoError) { diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 4a950153954ff..0a8e407cd004a 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -28,6 +28,7 @@ #include #include +#include #include #include "../../rpc/rpc_session.h" @@ -55,48 +56,54 @@ class GraphExecutorDebug : public GraphExecutor { * By default, one `repeat` contains `number` runs. If this parameter is set, * the parameters `number` will be dynamically adjusted to meet the * minimum duration requirement of one `repeat`. 
-   * \return Comma seperated string containing the elapsed time per op for the last
-   *         iteration only, because returning a long string over rpc can be expensive.
+   * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
+   *        defined by `repeats_to_cooldown`.
+   * \param repeats_to_cooldown The number of repeats before the
+   *        cooldown is activated.
+   * \return Comma separated string containing the elapsed time per op for
+   *         the last iteration only, because returning a long string over rpc can be expensive.
    */
-  std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
+  std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms,
+                            int repeats_to_cooldown) {
     // warmup run
     GraphExecutor::Run();
     std::string tkey = module_->type_key();
 
-    std::vector<double> time_sec_per_op(op_execs_.size(), 0);
+    std::vector<std::vector<double>> time_sec_per_op(op_execs_.size());
     if (tkey == "rpc") {
       // RPC modules rely on remote timing which implements the logic from the else branch.
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+        time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms,
+                                          cooldown_interval_ms, repeats_to_cooldown);
       }
     } else {
+      int op = 0;
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
-        for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
-          time_sec_per_op[index] = results[cur_repeat];
-
-          LOG(INFO) << "Iteration: " << cur_repeat;
-          int op = 0;
-          if (op_execs_[index]) {
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
-                      << time_sec_per_op[index] * 1e6 << " us/iter";
+        time_sec_per_op[index] = RunIndividualNode(index, number, repeat, min_repeat_ms,
+                                                   cooldown_interval_ms, repeats_to_cooldown);
+        if (op_execs_[index]) {
+          LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
+          for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
+            const auto& data = time_sec_per_op[index][cur_repeat];
+            LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter";
           }
+          ++op;
         }
       }
     }
 
     std::ostringstream os;
     for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-      double time = time_sec_per_op[index];
-      // To have good behavior when calculating total time, etc.
-      if (std::isnan(time)) {
-        time = 0;
+      for (const auto& repeat_data : time_sec_per_op[index]) {
+        // To have good behavior when calculating total time, etc.
+        os << (std::isnan(repeat_data) ? std::to_string(0) : std::to_string(repeat_data)) << ",";
       }
-      os << time << ",";
+      os << ";";
     }
     return os.str();
   }
 
-  std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
+  std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms,
+                                        int cooldown_interval_ms, int repeats_to_cooldown) {
     std::string tkey = module_->type_key();
 
     // results_in_seconds[a][b] is the bth index run of the ath index repeat
@@ -115,17 +122,18 @@ class GraphExecutorDebug : public GraphExecutor {
     Device& d = devices_[0];
     PackedFunc time_evaluator = profiling::WrapTimeEvaluator(
         TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
-        repeat, min_repeat_ms);
-    std::string result = time_evaluator();
-    const double* results_arr = reinterpret_cast<const double*>(result.data());
-    size_t double_bytes = sizeof(double);
-    for (size_t i = 0; i < result.size() / double_bytes; i++) {
-      results_in_seconds[i] = results_arr[i];
+        repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown);
+    std::string result_str = time_evaluator();
+    const double* blob_ptr = reinterpret_cast<const double*>(result_str.data());
+    for (int i = 0; i < repeat; ++i, ++blob_ptr) {
+      results_in_seconds[i] = *blob_ptr;
     }
     return results_in_seconds;
   }
 
-  double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
+  std::vector<double> RunOpRPC(int index, int number, int repeat, int min_repeat_ms,
+                               int cooldown_interval_ms, int repeats_to_cooldown) {
+    std::vector<double> results(repeat, 0);
     // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
     // which represent inputs/parameters to the graph. Other types may be supported in the
     // future, but consideration would be needed as to how to do that over RPC before we support
     // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or
     // "null" and "null" is a placeholder node for a parameter or input.
- return 0; + return results; } if (nodes_[index].param.func_name == "__nop") { LOG_INFO << "Skipping __nop function"; - return 0; + return results; } const Device& dev = data_entry_[entry_id(index, 0)]->device; @@ -151,10 +159,11 @@ class GraphExecutorDebug : public GraphExecutor { uint32_t num_inputs = param.num_inputs; uint32_t num_outputs = param.num_outputs; - PackedFunc time_eval = runtime::Registry::Get("runtime.RPCTimeEvaluator") - -> - operator()(module_, name, static_cast(dev.device_type), - dev.device_id, number, repeat, min_repeat_ms, ""); + PackedFunc time_eval = + runtime::Registry::Get("runtime.RPCTimeEvaluator") + -> + operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -176,10 +185,18 @@ class GraphExecutorDebug : public GraphExecutor { } TVMRetValue rv; time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv); - std::string results = rv.operator std::string(); - const double* results_arr = reinterpret_cast(results.data()); - LOG(INFO) << "Got op timing: " << results_arr[0]; - return results_arr[0]; + std::string results_str = rv.operator std::string(); + const double* blob_ptr = reinterpret_cast(results_str.data()); + for (int i = 0; i < repeat; ++i, ++blob_ptr) { + results[i] = *blob_ptr; + } + + std::ostringstream os; + for (auto& repeat_data : results) { + os << std::to_string(repeat_data) + ", "; + } + LOG(WARNING) << "Got op timing: " << os.str(); + return results; } Timer RunOpHost(int index) { @@ -369,21 +386,29 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; + int cooldown_interval_ms = args[3]; + int repeats_to_cooldown = args[4]; ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - *rv = this->RunIndividual(number, repeat, min_repeat_ms); + ICHECK_GE(cooldown_interval_ms, 0); + ICHECK_GT(repeats_to_cooldown, 0); + *rv = this->RunIndividual(number, repeat, min_repeat_ms, cooldown_interval_ms, + repeats_to_cooldown); }); } else if (name == "run_individual_node") { - return TypedPackedFunc( - [sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) { + return TypedPackedFunc( + [sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms, + int cooldown_interval_ms, int repeats_to_cooldown) { ICHECK_GE(node_index, 0); ICHECK_LT(node_index, nodes_.size()); ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - std::vector results = - this->RunIndividualNode(node_index, number, repeat, min_repeat_ms); + ICHECK_GE(cooldown_interval_ms, 0); + ICHECK_GT(repeats_to_cooldown, 0); + std::vector results = this->RunIndividualNode( + node_index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown); // Have problems returning FloatImm so serialize to string results as hack. 
std::stringstream s; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 478ec181e899f..786b83e6b011d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -485,13 +485,11 @@ TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* *rv = static_cast(ptr); }); -#ifdef USE_PROFILER TVM_REGISTER_OBJECT_TYPE(OpenCLTimerNode); TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object(dev)); }); -#endif } // namespace cl } // namespace runtime diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9f95bf18f74b2..8b9b7cde21ab6 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -36,6 +36,7 @@ #include #include #include +#include namespace tvm { namespace runtime { @@ -835,6 +836,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") }); PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, + int cooldown_interval_ms, int repeats_to_cooldown, PackedFunc f_preproc) { ICHECK(pf != nullptr); @@ -844,8 +846,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, - TVMRetValue* rv) mutable { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, + f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. @@ -865,9 +867,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, number * 1.618)); // 1.618 is chosen by random } - Timer t = Timer::Start(dev); // start timing - for (int i = 0; i < number; ++i) { + Timer t = Timer::Start(dev); + for (int j = 0; j < number; ++j) { pf.CallPacked(args, &temp); } t->Stop(); @@ -877,6 +879,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); + + if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms)); + } } std::string blob = os.str(); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 8e558fb6278e3..ff58895005923 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -26,8 +26,10 @@ #include #include +#include #include #include +#include #if defined(_M_X64) || defined(__x86_64__) #include #endif @@ -189,7 +191,8 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, - int min_repeat_ms, const std::string& f_preproc_name) { + int min_repeat_ms, int cooldown_interval_ms, int repeats_to_cooldown, + const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. 
     ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index())
@@ -197,13 +200,13 @@ class RPCModuleNode final : public ModuleNode {
     dev = RemoveRPCSessionMask(dev);
 
     if (module_handle_ != nullptr) {
-      return remote_get_time_evaluator_(GetRef<Module>(this), name,
-                                        static_cast<int>(dev.device_type), dev.device_id, number,
-                                        repeat, min_repeat_ms, f_preproc_name);
+      return remote_get_time_evaluator_(
+          GetRef<Module>(this), name, static_cast<int>(dev.device_type), dev.device_id, number,
+          repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
     } else {
-      return remote_get_time_evaluator_(Optional<Module>(nullptr), name,
-                                        static_cast<int>(dev.device_type), dev.device_id, number,
-                                        repeat, min_repeat_ms, f_preproc_name);
+      return remote_get_time_evaluator_(
+          Optional<Module>(nullptr), name, static_cast<int>(dev.device_type), dev.device_id, number,
+          repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
     }
   }
 
@@ -241,7 +244,8 @@ class RPCModuleNode final : public ModuleNode {
   // The local channel
   std::shared_ptr<RPCSession> sess_;
   // remote function to get time evaluator
-  TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, std::string)>
+  TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, int, int,
+                             std::string)>
       remote_get_time_evaluator_;
   // remote function getter for modules.
   TypedPackedFunc remote_mod_get_function_;
@@ -359,7 +363,8 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) {
 
 TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
     .set_body_typed([](Optional<Module> opt_mod, std::string name, int device_type, int device_id,
-                       int number, int repeat, int min_repeat_ms, std::string f_preproc_name) {
+                       int number, int repeat, int min_repeat_ms, int cooldown_interval_ms,
+                       int repeats_to_cooldown, std::string f_preproc_name) {
       Device dev;
       dev.device_type = static_cast<DLDeviceType>(device_type);
       dev.device_id = device_id;
@@ -368,7 +373,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
         std::string tkey = m->type_key();
         if (tkey == "rpc") {
           return static_cast<RPCModuleNode*>(m.operator->())
-              ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, f_preproc_name);
+              ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                                 repeats_to_cooldown, f_preproc_name);
         } else {
           PackedFunc f_preproc;
           if (!f_preproc_name.empty()) {
@@ -379,7 +385,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
          }
          PackedFunc pf = m.GetFunction(name, true);
          CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
-         return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, f_preproc);
+         return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms,
+                                             cooldown_interval_ms, repeats_to_cooldown, f_preproc);
        }
      } else {
        auto* pf = runtime::Registry::Get(name);
@@ -391,7 +398,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
              << "Cannot find " << f_preproc_name << " in the global function";
          f_preproc = *pf_preproc;
        }
-       return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, f_preproc);
+       return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms,
+                                           cooldown_interval_ms, repeats_to_cooldown, f_preproc);
      }
    });
 
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
index 904e5820e3ece..64a88a6c59434 100644
--- a/tests/python/unittest/test_runtime_graph_debug.py
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -151,6 +151,7 @@ def split_debug_line(i):
         "Shape",
         "Inputs",
         "Outputs",
+        "Measurements",
     ]
     myadd_lines = split_debug_line(2)
     assert myadd_lines[0] == "add"
@@ -250,6 +251,22 @@ def test_run_single_node(graph, n, A, myadd):
     elapsed_time_in_seconds = end - start
     assert elapsed_time_in_seconds >= 0.5
 
+    # Setting `cooldown_interval_ms` should increase the execution time
+    start = time.time()
+    mod.run_individual_node(1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000)
+    end = time.time()
+    elapsed_time_in_seconds = end - start
+    assert elapsed_time_in_seconds >= 1.5
+
+    # With `repeats_to_cooldown` set, cooldown should not be triggered after every repeat
+    start = time.time()
+    mod.run_individual_node(
+        1, repeat=10, min_repeat_ms=50, cooldown_interval_ms=1000, repeats_to_cooldown=5
+    )
+    end = time.time()
+    elapsed_time_in_seconds = end - start
+    assert elapsed_time_in_seconds >= 1.5 and elapsed_time_in_seconds <= 9.5
+
     # Going out of bounds of node index throws a tvm error
     with pytest.raises(TVMError):
         mod.run_individual_node(2)
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index 5a5c9361fc923..8ea5d954deca7 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -168,8 +168,8 @@ class AsyncLocalSession : public LocalSession {
       // special handle time evaluator.
       try {
         TVMArgs args(arg_values, arg_type_codes, num_args);
-        PackedFunc retfunc =
-            this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4], args[5], args[6]);
+        PackedFunc retfunc = this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4],
+                                                    args[5], args[6], args[7], args[8]);
         TVMRetValue rv;
         rv = retfunc;
         this->EncodeReturn(std::move(rv), [&](TVMArgs encoded_args) {
@@ -251,7 +251,8 @@ class AsyncLocalSession : public LocalSession {
   // time evaluator
   PackedFunc GetTimeEvaluator(Optional<Module> opt_mod, std::string name, int device_type,
-                              int device_id, int number, int repeat, int min_repeat_ms) {
+                              int device_id, int number, int repeat, int min_repeat_ms,
+                              int cooldown_interval_ms, int repeats_to_cooldown) {
     Device dev;
     dev.device_type = static_cast<DLDeviceType>(device_type);
     dev.device_id = device_id;
@@ -259,18 +260,22 @@ class AsyncLocalSession : public LocalSession {
     if (opt_mod.defined()) {
       Module m = opt_mod.value();
       std::string tkey = m->type_key();
-      return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms);
+      return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms,
+                                   cooldown_interval_ms, repeats_to_cooldown);
     } else {
       auto* pf = runtime::Registry::Get(name);
       CHECK(pf != nullptr) << "Cannot find " << name << " in the global function";
-      return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms);
+      return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                                   repeats_to_cooldown);
     }
   }
 
   // time evaluator
   PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
-                                   int min_repeat_ms) {
-    auto ftimer = [pf, dev, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) {
+                                   int min_repeat_ms, int cooldown_interval_ms,
+                                   int repeats_to_cooldown) {
+    auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms,
+                   repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) {
       // the function is a async function.
PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -288,7 +293,7 @@ class AsyncLocalSession : public LocalSession { auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, - on_complete); + cooldown_interval_ms, repeats_to_cooldown, on_complete); }; return PackedFunc(ftimer); } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 3d4745367308d..78747f9708730 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1057,7 +1057,9 @@ export class Instance implements Disposable { dev: DLDevice, nstep: number, repeat: number, - minRepeatMs: number + minRepeatMs: number, + cooldownIntervalMs: number, + repeatsToCooldown: number ): Promise => { finvoke(this.scalar(1, "int32")); await dev.sync(); @@ -1081,6 +1083,9 @@ export class Instance implements Disposable { } while (durationMs < minRepeatMs); const speed = durationMs / setupNumber / 1000; result.push(speed); + if (cooldownIntervalMs > 0.0 && (i % repeatsToCooldown) == 0 ) { + await new Promise(r => setTimeout(r, cooldownIntervalMs)); + } } const ret = new Float64Array(result.length); ret.set(result);
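
Reviewer note, not part of the patch: the sketch below shows one way the new cooldown knobs could be exercised from Python once this change lands. It is a hypothetical example; `gmod`, `dev`, `graph_json`, and `lib` are assumed to exist already (a built tvm.contrib.graph_executor.GraphModule, the device it was built for, and the graph JSON plus compiled library for the debug executor), and the chosen numbers are illustrative only.

    # Whole-graph timing through Module.time_evaluator: 20 repeats of 10 runs,
    # sleeping 500 ms after every 5th repeat (parameters added in this diff).
    ftimer = gmod.module.time_evaluator(
        "run", dev, number=10, repeat=20, min_repeat_ms=0,
        cooldown_interval_ms=500, repeats_to_cooldown=5,
    )
    print(ftimer().mean)

    # GraphModule.benchmark forwards the same two keyword arguments.
    print(gmod.benchmark(dev, repeat=20, number=10,
                         cooldown_interval_ms=500, repeats_to_cooldown=5))

    # Debug executor: run_individual now returns a 2-D list (node x repeat),
    # so aggregate per node explicitly, e.g. with a mean.
    # from tvm.contrib.debugger import debug_executor
    # dbg = debug_executor.create(graph_json, lib, dev)
    # times = dbg.run_individual(number=10, repeat=2, min_repeat_ms=0,
    #                            cooldown_interval_ms=1000, repeats_to_cooldown=1)
    # per_node_mean = [sum(t) / len(t) for t in times]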