[Graph Executor, VM] Add end to end benchmarking of models #8858

Merged · 1 commit · Aug 27, 2021
10 changes: 10 additions & 0 deletions include/tvm/runtime/vm/vm.h
@@ -258,6 +258,16 @@ class VirtualMachine : public runtime::ModuleNode {
*/
void InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args);

/*!
* \brief Set inputs to a function.
* \param name The function name.
* \param args args[offset:] are the arguments to the function. If an argument
* is not on the correct device for the function, it will be copied to that device.
* \param offset Starting offset of the arguments in `args`.
*/
void SetInput(std::string name, TVMArgs args, int offset);

protected:
/*! \brief The virtual machine's packed function table. */
std::vector<PackedFunc> packed_funcs_;
34 changes: 31 additions & 3 deletions python/tvm/contrib/graph_executor.py
@@ -321,15 +321,25 @@ def __getitem__(self, key):
"""
return self.module[key]

def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=None, **kwargs):
def benchmark(
self,
device,
func_name="run",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.

Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.

The benchmarking loop looks approximately like so:

@@ -346,7 +356,7 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
Parameters
----------
func_name : str
The function to benchmark
The function to benchmark. This is ignored if `end_to_end` is true.

repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
@@ -363,6 +373,11 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.

kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
@@ -374,6 +389,19 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# Have to unpack kwargs into a single list
args = []
for k, v in kwargs.items():
args.append(k)
args.append(v)
return self.module.time_evaluator(
"run_from_inputs",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(device.device_type, device.device_id, *args)
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
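For orientation, a minimal usage sketch of the new `end_to_end` option on the graph executor, modeled on the tests added later in this PR (the workload, target, and input shape here are illustrative assumptions, not part of the change itself):

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.testing import mlp

# Build a small MLP workload for CPU (assumes an LLVM-enabled TVM build).
mod, params = mlp.get_workload(batch_size=1)
lib = relay.build(mod, target="llvm", params=params)
dev = tvm.cpu()
gmod = graph_executor.GraphModule(lib["default"](dev))

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))

# Device-only timing: `data` is cached on the device before the timing loop runs.
print(gmod.benchmark(dev, data=data, repeat=5, number=5))

# End-to-end timing: host-to-device input copies and device-to-host output
# copies are measured as part of each repetition.
print(gmod.benchmark(dev, data=data, repeat=5, number=5, end_to_end=True))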
37 changes: 34 additions & 3 deletions python/tvm/runtime/vm.py
@@ -509,16 +509,25 @@ def get_input_index(self, input_name, func_name="main"):
return self._get_input_index(input_name, func_name)

def benchmark(
self, device, *args, func_name="main", repeat=5, number=5, min_repeat_ms=None, **kwargs
self,
device,
*args,
func_name="main",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.

Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.

The benchmarking loop looks approximately like so:

@@ -552,6 +561,11 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.

[Review comment from a Contributor on the passage above: i'm not sure "accurate" is exactly the right thing to say here since there is probably jitter in the transfer...but the gist makes sense.]

args : Sequence[Object]
Arguments to the function. These are cached before running timing code, so that data
transfer costs are not counted in the runtime.
@@ -566,6 +580,23 @@ def benchmark(
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# We need to unpack keyword arguments into positional arguments
packed_args = list(args)
for k, v in kwargs.items():
i = self.get_input_index(k, func_name)
if i < 0:
raise TypeError(f"{func_name}() got an unexpected keyword argument '{k}'")
while i >= len(packed_args):
packed_args.append(None)
packed_args[i] = v
return self.module.time_evaluator(
"invoke_return_to_device",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(func_name, device.device_type, device.device_id, *packed_args)
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self.module.time_evaluator(
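A corresponding sketch for the VM path, again modeled on the tests added in this PR (workload, target, and input name are illustrative assumptions):

import numpy as np
import tvm
from tvm import runtime
from tvm.relay import vm
from tvm.relay.testing import mlp

# Compile the same MLP workload for the Relay VM (assumes an LLVM-enabled TVM build).
mod, params = mlp.get_workload(batch_size=1)
vm_exec = vm.compile(mod, target="llvm", params=params)
dev = tvm.cpu()
vm_rt = runtime.vm.VirtualMachine(vm_exec, dev)

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)

# Positional argument, device-only timing.
print(vm_rt.benchmark(dev, data, func_name="main", repeat=5, number=5))

# Keyword argument with end_to_end=True: the keyword is mapped to its positional
# slot via get_input_index, and transfer costs are included in the measurement.
print(vm_rt.benchmark(dev, data=data, func_name="main", repeat=5, number=5, end_to_end=True))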
28 changes: 28 additions & 0 deletions src/runtime/graph_executor/graph_executor.cc
@@ -490,6 +490,34 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); });
} else if (name == "run") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); });
} else if (name == "run_from_inputs") {
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK(args.size() % 2 == 0)
<< "Number of arguments to run_from_inputs must be an even number of key-value pairs";
Device host{static_cast<DLDeviceType>(args[0].operator int()), args[1].operator int()};
for (int i = 2; i < args.size(); i += 2) {
if (String::CanConvertFrom(args[i])) {
int in_idx = this->GetInputIndex(args[i].operator String());
if (in_idx >= 0) {
this->SetInput(in_idx, args[i + 1]);
} else {
LOG(FATAL) << args[i].operator String() << " is not a valid input name";
}
} else {
this->SetInput(args[i], args[i + 1]);
}
}
this->Run();
Array<NDArray> outputs;
for (int i = 0; i < this->NumOutputs(); i++) {
NDArray out = this->GetOutput(i);
NDArray a = NDArray::Empty(out.Shape(), out.DataType(), host);
a.CopyFrom(out);
outputs.push_back(a);
}
*rv = outputs;
});
} else if (name == "load_params") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->LoadParams(args[0].operator std::string());
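The calling convention implemented by run_from_inputs above can also be exercised directly. A hypothetical call, continuing the graph-executor sketch earlier (gmod, dev, data), passes the host device followed by alternating name/value pairs:

# Layout matches what GraphModule.benchmark(..., end_to_end=True) passes through
# time_evaluator: (device_type, device_id, name0, value0, name1, value1, ...).
run_from_inputs = gmod.module["run_from_inputs"]
outputs = run_from_inputs(dev.device_type, dev.device_id, "data", data)
# `outputs` holds the output NDArrays, copied back to the host device given above.
print(outputs[0].shape)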
93 changes: 58 additions & 35 deletions src/runtime/vm/vm.cc
@@ -118,6 +118,7 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
if (name == "invoke") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";

std::string func_name = args[0];
auto git = exec_->global_map.find(func_name);
ICHECK(git != exec_->global_map.end())
@@ -140,6 +141,26 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
TVMRetValue rv_;
invoke.CallPacked(args, &rv_);
});
} else if (name == "invoke_return_to_device") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()};

SetInput(args[0].operator std::string(), args, 3);
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
TVMRetValue rv_;
invoke.CallPacked(args, &rv_); // Invoke only uses the first arg, so the rest of the args
// should not cause an issue
if (rv_.type_code() == kTVMObjectHandle) {
ADT adt = Downcast<ADT>(rv_.operator ObjectRef());
std::vector<ObjectRef> transfered;
for (size_t i = 0; i < adt.size(); i++) {
transfered.push_back(CopyTo(adt[i], host));
}
*rv = ADT(adt.tag(), transfered);
} else {
*rv = CopyTo(rv_, host);
}
});
} else if (name == "get_output") {
return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) {
if (this->return_register_.as<ADTObj>()) {
@@ -191,47 +212,49 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
this->Init(devices, alloc_types);
});
} else if (name == "set_input") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";
std::string func_name = args[0];
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - 1, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = 1; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - 1];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - 1] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - 1] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
});
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); });
} else {
LOG(FATAL) << "Unknown packed function: " << name;
return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
}
}

void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) {
ICHECK(exec_) << "The executable is not created yet.";
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - offset, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = offset; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - offset];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - offset] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - offset] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
}

inline Device VirtualMachine::GetDevice(Index device_type) const {
ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type;

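Similarly, the argument layout for invoke_return_to_device is (func_name, device_type, device_id, *inputs), with SetInput consuming args[3:]. A hypothetical direct call, continuing the VM sketch earlier (vm_rt, dev, data):

# Layout matches what VirtualMachine.benchmark(..., end_to_end=True) passes through
# time_evaluator; the result is copied back to the given host device before return.
invoke_return_to_device = vm_rt.module["invoke_return_to_device"]
out = invoke_return_to_device("main", dev.device_type, dev.device_id, data)
print(out.shape)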
36 changes: 36 additions & 0 deletions tests/python/relay/test_backend_graph_executor.py
@@ -25,6 +25,8 @@
from tvm.relay.op import add
import tvm.testing
from tvm.relay.testing import mlp
from tvm import rpc
from tvm.contrib import utils

# @tq, @jr should we put this in testing ns?
def check_rts(expr, args, expected_result, mod=None):
@@ -348,5 +350,39 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = relay.build(mod, target=target, params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = relay.build(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("library.so")
lib.export_library(path)
remote.upload(path)
rlib = remote.load_module("library.so")

dev = remote.cpu()
exe = graph_executor.create(lib.get_graph_json(), rlib, dev)

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


if __name__ == "__main__":
pytest.main([__file__])
32 changes: 32 additions & 0 deletions tests/python/relay/test_vm.py
@@ -981,5 +981,37 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target=target, params=params)
exe = runtime.vm.VirtualMachine(lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1, end_to_end=True)
assert result.mean > 0


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("vm_library.so")
lib.mod.export_library(path)
remote.upload(path)
rlib = remote.load_module("vm_library.so")

exe = runtime.vm.VirtualMachine(rlib, remote.cpu())
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=remote.cpu())
result = exe.benchmark(
remote.cpu(), data=data, func_name="main", repeat=2, number=1, end_to_end=True
)
assert result.mean > 0


if __name__ == "__main__":
pytest.main([__file__])