Skip to content

Commit

Permalink
Add cooldown interval logic for the profiling functional.
Browse files Browse the repository at this point in the history
  • Loading branch information
Icemist committed Jun 20, 2022
1 parent eb49311 commit 58c20ea
Show file tree
Hide file tree
Showing 17 changed files with 394 additions and 122 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ endif()
if(USE_PROFILER)
message(STATUS "Build with profiler...")

add_definitions(-DUSE_PROFILER=1)
tvm_file_glob(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
Expand Down
8 changes: 7 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,16 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
* \param f_preproc The function to be executed before we execute time evaluator.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);

} // namespace profiling
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/auto_scheduler/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
139 changes: 122 additions & 17 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,19 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self):
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
self.debug_datum._time_list = self.run_individual(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)

# Get outputs.
self._run_per_layer()
Expand Down Expand Up @@ -259,31 +265,123 @@ def debug_get_output(self, node, out=None):

self._debug_get_output(node_index, out)

# pylint: disable=arguments-differ
def run(
    self,
    number=10,
    repeat=1,
    min_repeat_ms=1,
    cooldown_interval_ms=0,
    repeats_to_cooldown=1,
    **input_dict,
):
    """Run forward execution of the graph with debug.

    Parameters
    ----------
    number : int, optional
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms : int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number`
        parameter will be automatically increased.

    cooldown_interval_ms : int, optional
        The cooldown interval in milliseconds between the number of repeats defined by
        `repeats_to_cooldown`.

    repeats_to_cooldown : int, optional
        The number of repeats before the cooldown is activated.

    input_dict : dict of str to NDArray
        List of input values to be feed to
    """
    if input_dict:
        self.set_input(**input_dict)

    # Step 1. Execute the graph
    self._run_debug(
        number=number,
        repeat=repeat,
        min_repeat_ms=min_repeat_ms,
        cooldown_interval_ms=cooldown_interval_ms,
        repeats_to_cooldown=repeats_to_cooldown,
    )
    # Step 2. Dump the output tensors to the dump folder
    self.debug_datum.dump_output_tensor()
    # Step 3. Dump the Chrome trace to the dump folder
    self.debug_datum.dump_chrome_trace()
    # Step 4. Display the collected information
    self.debug_datum.display_debug_result()

def run_individual(
    self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
):
    """Run each operation in the graph and get the time per op for all ops.

    Parameters
    ----------
    number : int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms : int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number`
        parameter will be automatically increased.

    cooldown_interval_ms : int, optional
        The cooldown interval in milliseconds between the number of repeats defined by
        `repeats_to_cooldown`.

    repeats_to_cooldown : int, optional
        The number of repeats before the cooldown is activated.

    Returns
    -------
    A 2-dimensional array where the dimensions are: the index of the operation and
    the repeat of the measurement.
    """
    # The runtime returns times serialized as a ";"-separated list of nodes,
    # each node being a ","-separated list of per-repeat times in seconds.
    ret = self._run_individual(
        number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
    )
    results = []
    for node_data in ret.strip(";").split(";"):
        results.append([])
        for repeat_data in node_data.strip(",").split(","):
            if repeat_data:
                results[-1].append(float(repeat_data))
    return results

def run_individual_node(
    self,
    index,
    number=10,
    repeat=1,
    min_repeat_ms=0,
    cooldown_interval_ms=0,
    repeats_to_cooldown=1,
):
    """Benchmark a single node in the serialized graph.

    This does not do any data transfers and uses arrays already on the device.

    Parameters
    ----------
    index : int
        The index of the node to benchmark.

    number : int, optional
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms : int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number`
        parameter will be automatically increased.

    cooldown_interval_ms : int, optional
        The cooldown interval in milliseconds between the number of repeats defined by
        `repeats_to_cooldown`.

    repeats_to_cooldown : int, optional
        The number of repeats before the cooldown is activated.

    Returns
    -------
    A module BenchmarkResult
    """
    # Results are returned as a serialized ","-separated string of per-repeat
    # times (in seconds) which we deserialize.
    ret = self._run_individual_node(
        index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
    )
    results = []
    for repeat_data in ret.replace(" ", "").strip(",").split(","):
        if repeat_data:
            results.append(float(repeat_data))
    return BenchmarkResult(results)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
Expand Down
42 changes: 30 additions & 12 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,10 @@ def get_graph_node_dtypes(self):
def get_output_tensors(self):
"""Get the output tensors of each operation in numpy format"""
eid = 0
order = 0
output_tensors = {}
for i, (node, time) in enumerate(zip(self._nodes_list, self._time_list)):
for i, node in enumerate(self._nodes_list):
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
order += time[0]

# the node name is not unique, so we need a consistent
# indexing based on the list ordering in the nodes
Expand Down Expand Up @@ -157,7 +155,7 @@ def s_to_us(t):
return t * 10**6

starting_times = np.zeros(len(self._time_list) + 1)
starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
starting_times[1:] = np.cumsum([np.mean(times) for times in self._time_list])

def node_to_events(node, times, starting_time):
return [
Expand All @@ -170,7 +168,7 @@ def node_to_events(node, times, starting_time):
),
ChromeTraceEvent(
# Use start + duration instead of end to ensure precise timings.
ts=s_to_us(times[0] + starting_time),
ts=s_to_us(np.mean(times) + starting_time),
tid=1,
pid=1,
ph="E",
Expand Down Expand Up @@ -205,12 +203,31 @@ def _dump_graph_json(self, graph):

def get_debug_result(self, sort_by_time=True):
"""Return the debugger result"""
header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
header = [
"Node Name",
"Ops",
"Time(us)",
"Time(%)",
"Shape",
"Inputs",
"Outputs",
"Measurements(us)",
]
lines = [
"---------",
"---",
"--------",
"-------",
"-----",
"------",
"-------",
"----------------",
]
eid = 0
data = []
total_time = sum(time[0] for time in self._time_list)
total_time = sum([np.mean(time) for time in self._time_list])
for node, time in zip(self._nodes_list, self._time_list):
time_mean = np.mean(time)
num_outputs = self.get_graph_node_output_num(node)
for j in range(num_outputs):
op = node["op"]
Expand All @@ -219,11 +236,12 @@ def get_debug_result(self, sort_by_time=True):
continue
name = node["name"]
shape = str(self._output_tensor_list[eid].shape)
time_us = round(time[0] * 1e6, 3)
time_percent = round(((time[0] / total_time) * 100), 3)
time_us = round(time_mean * 1e6, 3)
time_percent = round(((time_mean / total_time) * 100), 3)
inputs = str(node["attrs"]["num_inputs"])
outputs = str(node["attrs"]["num_outputs"])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs]
measurements = str([round(repeat_data * 1e6, 3) for repeat_data in time])
node_data = [name, op, time_us, time_percent, shape, inputs, outputs, measurements]
data.append(node_data)
eid += 1

Expand All @@ -232,7 +250,7 @@ def get_debug_result(self, sort_by_time=True):
data = sorted(data, key=lambda x: x[2], reverse=True)
# Insert a row for total time at the end.
rounded_total_time_us = round(total_time * 1e6, 3)
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-", "-"])

fmt = ""
for i, _ in enumerate(header):
Expand Down
19 changes: 17 additions & 2 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def benchmark(
number=5,
min_repeat_ms=None,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Expand Down Expand Up @@ -395,7 +397,7 @@ def benchmark(
`number` should be increased when the runtime of the function is small (less than a 1/10
of a millisecond).
min_repeat_ms : Optional[float]
min_repeat_ms : Optional[int]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
Expand All @@ -405,6 +407,13 @@ def benchmark(
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
cooldown_interval_ms: Optional[int]
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
repeats_to_cooldown: Optional[int]
The number of repeats before the cooldown is activated.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
Expand Down Expand Up @@ -432,5 +441,11 @@ def benchmark(
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
func_name,
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)()
2 changes: 1 addition & 1 deletion python/tvm/meta_schedule/testing/tune_relay.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def f_per_layer(rt_mod, dev, input_data):
graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
print("|graph_nodes| = ", len(graph_nodes))
print("|graph_time| = ", len(graph_time))
graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)}
for k, v in graph_nodes_time.items():
print(f"{k} : {v:.3f}")

Expand Down
Loading

0 comments on commit 58c20ea

Please sign in to comment.