Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce thrust benchmarks noise #1203

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 120 additions & 3 deletions cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#pragma once

#include <cuda/std/complex>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <cuda/std/complex>
#include <cuda/std/span>

#include <limits>
#include <map>
#include <stdexcept>

#include <nvbench/nvbench.cuh>
#include <cuda/std/span>

#if defined(_MSC_VER)
#define NVBENCH_HELPER_HAS_I128 0
Expand Down Expand Up @@ -478,3 +480,118 @@ struct max_t
return less(lhs, rhs) ? rhs : lhs;
}
};

namespace
{
struct caching_allocator_t
{
using value_type = char;

caching_allocator_t() = default;
~caching_allocator_t()
{
free_all();
}

char* allocate(std::ptrdiff_t num_bytes)
{
value_type* result{};
auto free_block = free_blocks.find(num_bytes);

if (free_block != free_blocks.end())
{
result = free_block->second;
free_blocks.erase(free_block);
}
else
{
result = do_allocate(num_bytes);
}

allocated_blocks.insert(std::make_pair(result, num_bytes));
return result;
}

void deallocate(char* ptr, size_t)
{
auto iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
{
throw std::runtime_error("Memory was not allocated by this allocator");
}

std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
free_blocks.insert(std::make_pair(num_bytes, ptr));
}

private:
using free_blocks_type = std::multimap<std::ptrdiff_t, char*>;
using allocated_blocks_type = std::map<char*, std::ptrdiff_t>;

free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;

void free_all()
{
for (auto i : free_blocks)
{
do_deallocate(i.second);
}

for (auto i : allocated_blocks)
{
do_deallocate(i.first);
}
}

value_type* do_allocate(std::size_t num_bytes)
{
value_type* result{};
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
const cudaError_t status = cudaMalloc(&result, num_bytes);
if (cudaSuccess != status)
{
throw std::runtime_error(std::string("Failed to allocate device memory: ") + cudaGetErrorString(status));
}
#else
result = new value_type[num_bytes];
#endif
return result;
}

void do_deallocate(value_type* ptr)
{
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
cudaFree(ptr);
#else
delete[] ptr;
#endif
}
};

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
auto policy(caching_allocator_t& alloc)
{
return thrust::cuda::par(alloc);
}
#else
auto policy(caching_allocator_t&)
{
return thrust::device;
}
#endif

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
auto policy(caching_allocator_t& alloc, nvbench::launch& launch)
{
return thrust::cuda::par(alloc).on(launch.get_stream());
}
#else
auto policy(caching_allocator_t&, nvbench::launch&)
{
return thrust::device;
}
#endif

} // namespace
7 changes: 5 additions & 2 deletions thrust/benchmarks/bench/adjacent_difference/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin());
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
}

Expand Down
16 changes: 11 additions & 5 deletions thrust/benchmarks/bench/adjacent_difference/custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ struct custom_op
};

template <typename T>
static void basic(nvbench::state &state, nvbench::type_list<T>)
{
static void basic(nvbench::state &state, nvbench::type_list<T>) {
const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

thrust::device_vector<T> input = generate(elements);
Expand All @@ -60,9 +59,16 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});
});
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(),
output.begin(), custom_op<T>{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::adjacent_difference(policy(alloc, launch),
input.cbegin(), input.cend(),
output.begin(), custom_op<T>{42});
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/adjacent_difference/in_place.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin());
});
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), vec.begin(), vec.end(), vec.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::adjacent_difference(policy(alloc, launch), vec.begin(),
vec.end(), vec.begin());
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
Expand Down
15 changes: 9 additions & 6 deletions thrust/benchmarks/bench/copy/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*
******************************************************************************/

#include <nvbench/nvbench.cuh>
#include <nvbench_helper.cuh>

#include <thrust/count.h>
#include <thrust/device_vector.h>
Expand All @@ -45,11 +45,14 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::copy(input.cbegin(),
input.cend(),
output.begin());
});
caching_allocator_t alloc;
thrust::copy(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::copy(policy(alloc, launch), input.cbegin(), input.cend(),
output.begin());
});
}

using types = nvbench::type_list<nvbench::uint8_t,
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/copy/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,14 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(selected_elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op);
});
caching_allocator_t alloc;
thrust::copy_if(policy(alloc), input.cbegin(), input.cend(), output.begin(), select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::copy_if(policy(alloc, launch), input.cbegin(),
input.cend(), output.begin(), select_op);
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
Expand Down
10 changes: 7 additions & 3 deletions thrust/benchmarks/bench/fill/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,13 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_element_count(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::fill(output.begin(), output.end(), T{42});
});
caching_allocator_t alloc;
thrust::fill(policy(alloc), output.begin(), output.end(), T{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::fill(policy(alloc, launch), output.begin(), output.end(), T{42});
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/inner_product/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements * 2);
state.add_global_memory_writes<T>(1);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0});
});
caching_allocator_t alloc;
thrust::inner_product(policy(alloc), lhs.begin(), lhs.end(), rhs.begin(), T{0});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::inner_product(policy(alloc, launch), lhs.begin(),
lhs.end(), rhs.begin(), T{0});
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types))
Expand Down
18 changes: 11 additions & 7 deletions thrust/benchmarks/bench/merge/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::merge(in.cbegin(),
in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs,
in.cend(),
out.begin());
});
caching_allocator_t alloc;
thrust::merge(policy(alloc), in.cbegin(), in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs, in.cend(), out.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::merge(policy(alloc, launch), in.cbegin(),
in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs, in.cend(),
out.begin());
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
20 changes: 13 additions & 7 deletions thrust/benchmarks/bench/partition/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,19 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::partition_copy(input.cbegin(),
input.cend(),
output.begin(),
thrust::make_reverse_iterator(output.begin() + elements),
select_op);
});
caching_allocator_t alloc;
thrust::partition_copy(
policy(alloc), input.cbegin(), input.cend(), output.begin(),
thrust::make_reverse_iterator(output.begin() + elements), select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::partition_copy(
policy(alloc, launch), input.cbegin(), input.cend(),
output.begin(),
thrust::make_reverse_iterator(output.begin() + elements),
select_op);
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
7 changes: 5 additions & 2 deletions thrust/benchmarks/bench/reduce/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(1);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
do_not_optimize(thrust::reduce(in.begin(), in.end()));
caching_allocator_t alloc;
do_not_optimize(thrust::reduce(policy(alloc), in.begin(), in.end()));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & launch) {
do_not_optimize(thrust::reduce(policy(alloc, launch), in.begin(), in.end()));
});
}

Expand Down
17 changes: 10 additions & 7 deletions thrust/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,16 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<KeyT>(unique_keys);
state.add_global_memory_writes<ValueT>(unique_keys);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::reduce_by_key(in_keys.begin(),
in_keys.end(),
in_vals.begin(),
out_keys.begin(),
out_vals.begin());
});
caching_allocator_t alloc;
thrust::reduce_by_key(policy(alloc), in_keys.begin(), in_keys.end(),
in_vals.begin(), out_keys.begin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::reduce_by_key(policy(alloc, launch), in_keys.begin(),
in_keys.end(), in_vals.begin(),
out_keys.begin(), out_vals.begin());
});
}

using key_types = nvbench::type_list<int8_t,
Expand Down
13 changes: 10 additions & 3 deletions thrust/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,16 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_reads<ValueT>(elements);
state.add_global_memory_writes<ValueT>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
});
caching_allocator_t alloc;
thrust::exclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(),
in_vals.cbegin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::exclusive_scan_by_key(
policy(alloc, launch), keys.cbegin(), keys.cend(),
in_vals.cbegin(), out_vals.begin());
});
}

using key_types = all_types;
Expand Down
Loading
Loading