add multi stream allocations benchmark. #841

Merged

Changes from 8 commits
18 changes: 12 additions & 6 deletions benchmarks/CMakeLists.txt
@@ -52,16 +52,22 @@ function(ConfigureBench BENCH_NAME)

endfunction(ConfigureBench)

# benchmark sources

# random allocations benchmark
-ConfigureBench(RANDOM_ALLOCATIONS_BENCH random_allocations/random_allocations.cpp)
+ConfigureBench(RANDOM_ALLOCATIONS_BENCH
+               random_allocations/random_allocations.cpp)

# replay benchmark
-ConfigureBench(REPLAY_BENCH replay/replay.cpp)
+ConfigureBench(REPLAY_BENCH
+               replay/replay.cpp)

# uvector benchmark
-ConfigureBench(UVECTOR_BENCH device_uvector/device_uvector_bench.cu)
+ConfigureBench(UVECTOR_BENCH
+               device_uvector/device_uvector_bench.cu)

# cuda_stream_pool benchmark
-ConfigureBench(CUDA_STREAM_POOL_BENCH cuda_stream_pool/cuda_stream_pool_bench.cpp)
+ConfigureBench(CUDA_STREAM_POOL_BENCH
+               cuda_stream_pool/cuda_stream_pool_bench.cpp)

+# multi stream allocations
+ConfigureBench(MULTI_STREAM_ALLOCATIONS_BENCH
+               multi_stream_allocations/multi_stream_allocations_bench.cu)
benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu (new file)
@@ -0,0 +1,251 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/utilities/cxxopts.hpp>

#include <benchmark/benchmark.h>

#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <cstdint>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Emulate a compute-bound kernel: thread 0 spins until roughly one million
// clock cycles have elapsed. Writing the final clock value out keeps the
// loop from being optimized away.
__global__ void compute_bound_kernel(int64_t* out)
{
  clock_t clock_begin   = clock64();
  clock_t clock_current = clock_begin;

  if (threadIdx.x == 0) {
    while (clock_current - clock_begin < 1000000) {
      clock_current = clock64();
    }
  }

  *out = static_cast<int64_t>(clock_current);
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;

// Prime each stream in the pool by allocating one small buffer per stream, so
// per-stream state in the resource under test is warmed up before measurement.
// The buffers are freed when the vector goes out of scope.
static void run_prewarm(rmm::cuda_stream_pool& stream_pool, rmm::mr::device_memory_resource* mr)
{
  auto buffers = std::vector<rmm::device_uvector<int64_t>>();
  for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) {
    auto stream = stream_pool.get_stream(i);
    buffers.emplace_back(rmm::device_uvector<int64_t>(1, stream, mr));
  }
}

static void run_test(std::size_t num_kernels,
                     rmm::cuda_stream_pool& stream_pool,
                     rmm::mr::device_memory_resource* mr)
{
  for (std::size_t i = 0; i < num_kernels; i++) {
    // get_stream cycles through the pool, spreading kernels across streams.
    auto stream = stream_pool.get_stream(i);
    // Allocate a temporary buffer on this stream, launch a compute-bound
    // kernel into it, then free the buffer when it goes out of scope.
    auto buffer = rmm::device_uvector<int64_t>(1, stream, mr);
    compute_bound_kernel<<<1, 1, 0, stream.value()>>>(buffer.data());
  }
}

static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc factory)
{
  auto mr = factory();

  rmm::mr::set_current_device_resource(mr.get());

  auto num_streams = state.range(0);
  auto num_kernels = state.range(1);
  auto do_prewarm  = state.range(2);

  auto stream_pool = rmm::cuda_stream_pool(num_streams);

  if (do_prewarm) { run_prewarm(stream_pool, mr.get()); }

  for (auto _ : state) {
    run_test(num_kernels, stream_pool, mr.get());
    cudaDeviceSynchronize();  // wait for all streams to drain before ending the iteration
  }

  state.SetItemsProcessed(state.iterations() * num_kernels);

  rmm::mr::set_current_device_resource(nullptr);
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }

inline auto make_pool()
{
  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
}

inline auto make_arena()
{
  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
}

inline auto make_binning()
{
  auto pool = make_pool();
  // Add a binning_memory_resource with fixed-size bins of sizes 256 KiB, 512 KiB, 1 MiB,
  // 2 MiB, and 4 MiB (log2 sizes 18 through 22). Larger allocations use the pool resource.
  auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(pool, 18, 22);
  return mr;
}
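For readers unfamiliar with the two integer arguments: they are the log2 of the smallest and largest bin sizes. A rough equivalent that registers the same power-of-two bins explicitly might look like the following (a sketch only; make_binning_explicit is a hypothetical name, and it assumes binning_memory_resource::add_bin and owning_wrapper::wrapped() as declared in RMM's headers):

inline auto make_binning_explicit()
{
  auto pool = make_pool();
  auto mr   = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(pool);
  // Register fixed-size bins for 2^18 (256 KiB) through 2^22 (4 MiB) bytes.
  for (std::size_t i = 18; i <= 22; i++) {
    mr->wrapped().add_bin(std::size_t{1} << i);
  }
  return mr;
}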

static void benchmark_range(benchmark::internal::Benchmark* b)
{
  // With RangeMultiplier(2), this sweeps the stream count over {1, 2, 4},
  // fixes the kernel count at 4, and toggles prewarm between false and true.
  b //
    ->RangeMultiplier(2)
    ->Ranges({{1, 4}, {4, 4}, {false, true}})
    ->Unit(benchmark::kMicrosecond);
}

MRFactoryFunc get_mr_factory(std::string resource_name)
{
  if (resource_name == "cuda") { return &make_cuda; }
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
  if (resource_name == "cuda_async") { return &make_cuda_async; }
#endif
  if (resource_name == "pool") { return &make_pool; }
  if (resource_name == "arena") { return &make_arena; }
  if (resource_name == "binning") { return &make_binning; }

  std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;

  RMM_FAIL();
}

void declare_benchmark(std::string name)
{
  if (name == "cuda") {
    BENCHMARK_CAPTURE(BM_MultiStreamAllocations, cuda, &make_cuda)  //
      ->Apply(benchmark_range);
    return;
  }

#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
  if (name == "cuda_async") {
    BENCHMARK_CAPTURE(BM_MultiStreamAllocations, cuda_async, &make_cuda_async)  //
      ->Apply(benchmark_range);
    return;
  }
#endif

  if (name == "pool") {
    BENCHMARK_CAPTURE(BM_MultiStreamAllocations, pool_mr, &make_pool)  //
      ->Apply(benchmark_range);
    return;
  }

  if (name == "arena") {
    BENCHMARK_CAPTURE(BM_MultiStreamAllocations, arena, &make_arena)  //
      ->Apply(benchmark_range);
    return;
  }

  if (name == "binning") {
    BENCHMARK_CAPTURE(BM_MultiStreamAllocations, binning, &make_binning)  //
      ->Apply(benchmark_range);
    return;
  }

  std::cout << "Error: invalid memory_resource name: " << name << std::endl;
}
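As an aside, the name-by-name dispatch above could also be written table-driven with benchmark::RegisterBenchmark, which forwards extra arguments to the benchmark function. A sketch under that assumption (declare_benchmark_from_table is a hypothetical helper; it would also need #include <map>):

#include <map>

void declare_benchmark_from_table(std::string const& name)
{
  static std::map<std::string, MRFactoryFunc> const factories{
    {"cuda", &make_cuda},
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
    {"cuda_async", &make_cuda_async},
#endif
    {"pool", &make_pool},
    {"arena", &make_arena},
    {"binning", &make_binning},
  };

  auto it = factories.find(name);
  if (it == factories.end()) {
    std::cout << "Error: invalid memory_resource name: " << name << std::endl;
    return;
  }
  // RegisterBenchmark forwards the factory to BM_MultiStreamAllocations.
  benchmark::RegisterBenchmark(name.c_str(), BM_MultiStreamAllocations, it->second)
    ->Apply(benchmark_range);
}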

void run_profile(std::string resource_name, int kernel_count, int stream_count, bool prewarm)
{
  auto mr_factory  = get_mr_factory(resource_name);
  auto mr          = mr_factory();
  auto stream_pool = rmm::cuda_stream_pool(stream_count);

  if (prewarm) { run_prewarm(stream_pool, mr.get()); }

  run_test(kernel_count, stream_pool, mr.get());
}

Review thread on run_profile:

Contributor: Is this needed? Can't you do the same thing via GBench command line args?

Member: Not sure what you mean by "this". gbench runs multiple times no matter what. We want a way to run only once for profiling.

Contributor:
> We want a way to run only once for profiling.
Right, that's what I meant. I thought there was a `num_iterations` gbench option. Related: NVIDIA/nvbench#10

Contributor Author: I haven't found an option in gbench to limit the number of iterations. I looked briefly prior to implementing it this way, but gbench documentation is... not great?

Member: gbenchmark does not have a way to control the number of iterations. Here's one of the authors' explanations of why: https://stackoverflow.com/a/61888885/749748
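Given the discussion above, profile mode exists to get one clean pass under an external profiler. If one wanted the capture limited to just the measured region, a variant could scope it with CUDA's profiler API. A sketch, not part of this PR (run_profile_capture is a hypothetical name; cudaProfilerStart/cudaProfilerStop come from <cuda_profiler_api.h>):

#include <cuda_profiler_api.h>

void run_profile_capture(std::string resource_name, int kernel_count, int stream_count, bool prewarm)
{
  auto mr_factory  = get_mr_factory(resource_name);
  auto mr          = mr_factory();
  auto stream_pool = rmm::cuda_stream_pool(stream_count);

  if (prewarm) { run_prewarm(stream_pool, mr.get()); }

  cudaProfilerStart();  // begin capture just before the measured work
  run_test(kernel_count, stream_pool, mr.get());
  cudaDeviceSynchronize();  // let all streams drain inside the capture window
  cudaProfilerStop();
}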

int main(int argc, char** argv)
{
  ::benchmark::Initialize(&argc, argv);

  // Parse benchmark arguments:
  cxxopts::Options options(
    "RMM Multi Stream Allocations Benchmark",
    "Benchmarks interleaving temporary allocations with compute-bound kernels.");

  options.add_options()(  //
    "p,profile",
    "Profiling mode: run once",
    cxxopts::value<bool>()->default_value("false"));

  options.add_options()(  //
    "r,resource",
    "Type of device_memory_resource",
    cxxopts::value<std::string>()->default_value("pool"));

  options.add_options()(  //
    "k,kernels",
    "Number of kernels to run (default: 8)",
    cxxopts::value<int>()->default_value("8"));

  options.add_options()(  //
    "s,streams",
    "Number of streams in stream pool (default: 8)",
    cxxopts::value<int>()->default_value("8"));

  options.add_options()(  //
    "w,warm",
    "Ensure each stream has enough memory to satisfy allocations.",
    cxxopts::value<bool>()->default_value("false"));

  auto args = options.parse(argc, argv);

  if (args.count("profile") > 0) {
    auto resource_name = args["resource"].as<std::string>();
    auto num_kernels   = args["kernels"].as<int>();
    auto num_streams   = args["streams"].as<int>();
    auto prewarm       = args["warm"].as<bool>();
    run_profile(resource_name, num_kernels, num_streams, prewarm);
  } else {
    auto resource_names = std::vector<std::string>();

    if (args.count("resource") > 0) {
      resource_names.emplace_back(args["resource"].as<std::string>());
    } else {
      resource_names.emplace_back("cuda");
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
      resource_names.emplace_back("cuda_async");
#endif
      resource_names.emplace_back("pool");
      resource_names.emplace_back("arena");
      resource_names.emplace_back("binning");
    }

    for (auto& resource_name : resource_names) {
      declare_benchmark(resource_name);
    }

    ::benchmark::RunSpecifiedBenchmarks();
  }
}
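For reference, two hypothetical invocations (assuming, as the CMake above suggests, that ConfigureBench names the executable after its first argument):

./MULTI_STREAM_ALLOCATIONS_BENCH                          # gbench sweep over all resources
./MULTI_STREAM_ALLOCATIONS_BENCH -p -r pool -k 8 -s 8 -w  # single prewarmed profiling run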