Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cooldown interval logic for the profiling functional #11465

Merged
merged 3 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ endif()
if(USE_PROFILER)
message(STATUS "Build with profiler...")

add_definitions(-DUSE_PROFILER=1)
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
Expand Down
27 changes: 26 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,25 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i

/*!
* \brief Wrap a timer function to measure the time cost of a given packed function.
*
* Approximate implementation:
* \code{.py}
* f() // warmup
* for i in range(repeat)
* f_preproc()
* while True:
* start = time()
* for j in range(number):
* f()
* duration_ms = time() - start
* if duration_ms >= min_repeat_ms:
* break
* else:
 * number = (min_repeat_ms / (duration_ms / number)) + 1
* if cooldown_interval_ms and i % repeats_to_cooldown == 0:
* sleep(cooldown_interval_ms)
* \endcode
*
* \param f The function argument.
* \param dev The device.
* \param number The number of times to run this function for taking average.
Expand All @@ -554,10 +573,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
 * \param f_preproc The function to be executed before we execute time evaluator.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
Icemist marked this conversation as resolved.
Show resolved Hide resolved
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);

} // namespace profiling
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/auto_scheduler/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
139 changes: 122 additions & 17 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,19 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self):
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
self.debug_datum._time_list = self.run_individual(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)

# Get outputs.
self._run_per_layer()
Expand Down Expand Up @@ -259,31 +265,123 @@ def debug_get_output(self, node, out=None):

self._debug_get_output(node_index, out)

def run(self, **input_dict):
# pylint: disable=arguments-differ
def run(
self,
number=10,
repeat=1,
min_repeat_ms=1,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**input_dict,
):
"""Run forward execution of the graph with debug
Parameters
----------
number: int, optional
The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
repeat: int, optional
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
input_dict : dict of str to NDArray
List of input values to be feed to
"""
if input_dict:
self.set_input(**input_dict)

# Step 1. Execute the graph
self._run_debug()
self._run_debug(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
# Step 2. Dump the output tensors to the dump folder
self.debug_datum.dump_output_tensor()
# Step 3. Dump the Chrome trace to the dump folder
self.debug_datum.dump_chrome_trace()
# Step 4. Display the collected information
self.debug_datum.display_debug_result()

def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
def run_individual(
self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
):
"""Run each operation in the graph and get the time per op for all ops.
number: int
The number of times to run this function for taking average.
We call these runs as one `repeat` of measurement.
def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
repeat: int, optional
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
Returns
-------
A 2-dimensional array where the dimensions are: the index of the operation and
the repeat of the measurement.
"""
ret = self._run_individual(
number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
results = []
for node_data in ret.strip(";").split(";"):
results.append([])
for repeat_data in node_data.strip(",").split(","):
if repeat_data:
results[-1].append(float(repeat_data))
return results

def run_individual_node(
self,
index,
number=10,
repeat=1,
min_repeat_ms=0,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Benchmark a single node in the serialized graph.
This does not do any data transfers and uses arrays already on the device.
Expand All @@ -304,27 +402,34 @@ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
min_repeat_ms : int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameters `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`.
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
Returns
-------
A module BenchmarkResult
"""
# Results are returned as serialized strings which we deserialize
ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
answer = []
for value in ret.split(","):
if value.strip() == "":
continue
answer.append(float(value))

return BenchmarkResult(answer)
ret = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
)
results = []
for repeat_data in ret.replace(" ", "").strip(",").split(","):
if repeat_data:
results.append(float(repeat_data))
return BenchmarkResult(results)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
Expand Down
42 changes: 30 additions & 12 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
def get_output_tensors(self):
"""Get the output tensors of each operation in numpy format"""
eid = 0
order = 0
output_tensors = {}
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
for i, node in enumerate(self._nodes_list):
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
order += time[0]

# the node name is not unique, so we need a consistent
# indexing based on the list ordering in the nodes
Expand Down Expand Up @@ -157,7 +155,7 @@ def s_to_us(t):
return t * 10**6

starting_times = np.zeros(len(self._time_list) + 1)
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list])

def node_to_events(node, times, starting_time):
return [
Expand All @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time):
),
ChromeTraceEvent(
# Use start + duration instead of end to ensure precise timings.
ts=s_to_us(times[0] + starting_time),
ts=s_to_us(np.mean(times) + starting_time),
tid=1,
pid=1,
ph="E",
Expand Down Expand Up @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph):

def get_debug_result(self, sort_by_time=True):
"""Return the debugger result"""
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
header = [
"Node Name",
"Ops",
"Time(us)",
"Time(%)",
"Shape",
"Inputs",
"Outputs",
"Measurements(us)",
]
lines = [
"---------",
"---",
"--------",
"-------",
"-----",
"------",
"-------",
"----------------",
]
eid = 0
data = []
total_time = sum(time[0] for time in self._time_list)
total_time = sum([np.mean(time) for time in self._time_list])
for node, time in zip(self._nodes_list, self._time_list):
time_mean = np.mean(time)
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
op = node["op"]
Expand All @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True):
continue
name = node["name"]
shape = str(self._output_tensor_list[eid].shape)
time_us = round(time[0] * 1e6, 3)
time_percent = round(((time[0] / total_time) * 100), 3)
time_us = round(time_mean * 1e6, 3)
time_percent = round(((time_mean / total_time) * 100), 3)
inputs = str(node["attrs"]["num_inputs"])
outputs = str(node["attrs"]["num_outputs"])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements]
data.append(node_data)
eid += 1

Expand All @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True):
data = sorted(data, key=lambda x: x[2], reverse=True)
# Insert a row for total time at the end.
rounded_total_time_us = round(total_time * 1e6, 3)
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"])

fmt = ""
for i, _ in enumerate(header):
Expand Down
19 changes: 17 additions & 2 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def benchmark(
number=5,
min_repeat_ms=None,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Expand Down Expand Up @@ -395,7 +397,7 @@ def benchmark(
`number` should be increased when the runtime of the function is small (less than a 1/10
of a millisecond).
min_repeat_ms : Optional[float]
min_repeat_ms : Optional[int]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
Expand All @@ -405,6 +407,13 @@ def benchmark(
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
cooldown_interval_ms: Optional[int]
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: Optional[int]
The number of repeats before the cooldown is activated.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
Expand Down Expand Up @@ -432,5 +441,11 @@ def benchmark(
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
func_name,
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)()
Loading