diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 3bedf5841ae..3a6532fa827 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -1,14 +1,16 @@ #pragma once -#include - #include +#include + +#include +#include #include +#include #include #include -#include #if defined(_MSC_VER) #define NVBENCH_HELPER_HAS_I128 0 @@ -478,3 +480,118 @@ struct max_t return less(lhs, rhs) ? rhs : lhs; } }; + +namespace +{ +struct caching_allocator_t +{ + using value_type = char; + + caching_allocator_t() = default; + ~caching_allocator_t() + { + free_all(); + } + + char* allocate(std::ptrdiff_t num_bytes) + { + value_type* result{}; + auto free_block = free_blocks.find(num_bytes); + + if (free_block != free_blocks.end()) + { + result = free_block->second; + free_blocks.erase(free_block); + } + else + { + result = do_allocate(num_bytes); + } + + allocated_blocks.insert(std::make_pair(result, num_bytes)); + return result; + } + + void deallocate(char* ptr, size_t) + { + auto iter = allocated_blocks.find(ptr); + if (iter == allocated_blocks.end()) + { + throw std::runtime_error("Memory was not allocated by this allocator"); + } + + std::ptrdiff_t num_bytes = iter->second; + allocated_blocks.erase(iter); + free_blocks.insert(std::make_pair(num_bytes, ptr)); + } + +private: + using free_blocks_type = std::multimap; + using allocated_blocks_type = std::map; + + free_blocks_type free_blocks; + allocated_blocks_type allocated_blocks; + + void free_all() + { + for (auto i : free_blocks) + { + do_deallocate(i.second); + } + + for (auto i : allocated_blocks) + { + do_deallocate(i.first); + } + } + + value_type* do_allocate(std::size_t num_bytes) + { + value_type* result{}; +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + const cudaError_t status = cudaMalloc(&result, num_bytes); + if (cudaSuccess != status) + { + throw std::runtime_error(std::string("Failed to allocate device memory: ") + cudaGetErrorString(status)); + } +#else + result = new value_type[num_bytes]; +#endif + return result; + } + + void do_deallocate(value_type* ptr) + { +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + cudaFree(ptr); +#else + delete[] ptr; +#endif + } +}; + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +auto policy(caching_allocator_t& alloc) +{ + return thrust::cuda::par(alloc); +} +#else +auto policy(caching_allocator_t&) +{ + return thrust::device; +} +#endif + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +auto policy(caching_allocator_t& alloc, nvbench::launch& launch) +{ + return thrust::cuda::par(alloc).on(launch.get_stream()); +} +#else +auto policy(caching_allocator_t&, nvbench::launch&) +{ + return thrust::device; +} +#endif + +} // namespace diff --git a/thrust/benchmarks/bench/adjacent_difference/basic.cu b/thrust/benchmarks/bench/adjacent_difference/basic.cu index 47f93f382b0..dfbfef0ab96 100644 --- a/thrust/benchmarks/bench/adjacent_difference/basic.cu +++ b/thrust/benchmarks/bench/adjacent_difference/basic.cu @@ -43,8 +43,11 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin()); + caching_allocator_t alloc; + thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin()); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/adjacent_difference/custom.cu b/thrust/benchmarks/bench/adjacent_difference/custom.cu index e8e892bf8d4..f7f7377f3f9 100644 --- a/thrust/benchmarks/bench/adjacent_difference/custom.cu +++ b/thrust/benchmarks/bench/adjacent_difference/custom.cu @@ -49,8 +49,7 @@ struct custom_op }; template -static void basic(nvbench::state &state, nvbench::type_list) -{ +static void basic(nvbench::state &state, nvbench::type_list) { const auto elements = static_cast(state.get_int64("Elements")); thrust::device_vector input = generate(elements); @@ -60,9 +59,16 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op{42}); - }); + caching_allocator_t alloc; + thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), + output.begin(), custom_op{42}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::adjacent_difference(policy(alloc, launch), + input.cbegin(), input.cend(), + output.begin(), custom_op{42}); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/adjacent_difference/in_place.cu b/thrust/benchmarks/bench/adjacent_difference/in_place.cu index 4e91e8471c0..84fc356b3fc 100644 --- a/thrust/benchmarks/bench/adjacent_difference/in_place.cu +++ b/thrust/benchmarks/bench/adjacent_difference/in_place.cu @@ -42,9 +42,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin()); - }); + caching_allocator_t alloc; + thrust::adjacent_difference(policy(alloc), vec.begin(), vec.end(), vec.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::adjacent_difference(policy(alloc, launch), vec.begin(), + vec.end(), vec.begin()); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/copy/basic.cu b/thrust/benchmarks/bench/copy/basic.cu index 1b2b96214df..49e68a77f97 100644 --- a/thrust/benchmarks/bench/copy/basic.cu +++ b/thrust/benchmarks/bench/copy/basic.cu @@ -25,7 +25,7 @@ * ******************************************************************************/ -#include +#include #include #include @@ -45,11 +45,14 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::copy(input.cbegin(), - input.cend(), - output.begin()); - }); + caching_allocator_t alloc; + thrust::copy(policy(alloc), input.cbegin(), input.cend(), output.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::copy(policy(alloc, launch), input.cbegin(), input.cend(), + output.begin()); + }); } using types = nvbench::type_list(elements); state.add_global_memory_writes(selected_elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op); - }); + caching_allocator_t alloc; + thrust::copy_if(policy(alloc), input.cbegin(), input.cend(), output.begin(), select_op); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::copy_if(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin(), select_op); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/fill/basic.cu b/thrust/benchmarks/bench/fill/basic.cu index 3c29f3c7043..a2b0875ff1a 100644 --- a/thrust/benchmarks/bench/fill/basic.cu +++ b/thrust/benchmarks/bench/fill/basic.cu @@ -41,9 +41,13 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::fill(output.begin(), output.end(), T{42}); - }); + caching_allocator_t alloc; + thrust::fill(policy(alloc), output.begin(), output.end(), T{42}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::fill(policy(alloc, launch), output.begin(), output.end(), T{42}); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) diff --git a/thrust/benchmarks/bench/inner_product/basic.cu b/thrust/benchmarks/bench/inner_product/basic.cu index aa3b5d467e9..e2a2e545cbe 100644 --- a/thrust/benchmarks/bench/inner_product/basic.cu +++ b/thrust/benchmarks/bench/inner_product/basic.cu @@ -44,9 +44,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements * 2); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0}); - }); + caching_allocator_t alloc; + thrust::inner_product(policy(alloc), lhs.begin(), lhs.end(), rhs.begin(), T{0}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::inner_product(policy(alloc, launch), lhs.begin(), + lhs.end(), rhs.begin(), T{0}); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types)) diff --git a/thrust/benchmarks/bench/merge/basic.cu b/thrust/benchmarks/bench/merge/basic.cu index 854baf8ec0e..d0b1ce771d0 100644 --- a/thrust/benchmarks/bench/merge/basic.cu +++ b/thrust/benchmarks/bench/merge/basic.cu @@ -50,13 +50,17 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::merge(in.cbegin(), - in.cbegin() + elements_in_lhs, - in.cbegin() + elements_in_lhs, - in.cend(), - out.begin()); - }); + caching_allocator_t alloc; + thrust::merge(policy(alloc), in.cbegin(), in.cbegin() + elements_in_lhs, + in.cbegin() + elements_in_lhs, in.cend(), out.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::merge(policy(alloc, launch), in.cbegin(), + in.cbegin() + elements_in_lhs, + in.cbegin() + elements_in_lhs, in.cend(), + out.begin()); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) diff --git a/thrust/benchmarks/bench/partition/basic.cu b/thrust/benchmarks/bench/partition/basic.cu index aafdc892236..840a9a4df00 100644 --- a/thrust/benchmarks/bench/partition/basic.cu +++ b/thrust/benchmarks/bench/partition/basic.cu @@ -72,13 +72,19 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::partition_copy(input.cbegin(), - input.cend(), - output.begin(), - thrust::make_reverse_iterator(output.begin() + elements), - select_op); - }); + caching_allocator_t alloc; + thrust::partition_copy( + policy(alloc), input.cbegin(), input.cend(), output.begin(), + thrust::make_reverse_iterator(output.begin() + elements), select_op); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::partition_copy( + policy(alloc, launch), input.cbegin(), input.cend(), + output.begin(), + thrust::make_reverse_iterator(output.begin() + elements), + select_op); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) diff --git a/thrust/benchmarks/bench/reduce/basic.cu b/thrust/benchmarks/bench/reduce/basic.cu index e6e31c22a05..c3cd5571b14 100644 --- a/thrust/benchmarks/bench/reduce/basic.cu +++ b/thrust/benchmarks/bench/reduce/basic.cu @@ -42,8 +42,11 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - do_not_optimize(thrust::reduce(in.begin(), in.end())); + caching_allocator_t alloc; + do_not_optimize(thrust::reduce(policy(alloc), in.begin(), in.end())); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & launch) { + do_not_optimize(thrust::reduce(policy(alloc, launch), in.begin(), in.end())); }); } diff --git a/thrust/benchmarks/bench/reduce/by_key.cu b/thrust/benchmarks/bench/reduce/by_key.cu index 282dff7d940..893f39a1b39 100644 --- a/thrust/benchmarks/bench/reduce/by_key.cu +++ b/thrust/benchmarks/bench/reduce/by_key.cu @@ -57,13 +57,16 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(unique_keys); state.add_global_memory_writes(unique_keys); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::reduce_by_key(in_keys.begin(), - in_keys.end(), - in_vals.begin(), - out_keys.begin(), - out_vals.begin()); - }); + caching_allocator_t alloc; + thrust::reduce_by_key(policy(alloc), in_keys.begin(), in_keys.end(), + in_vals.begin(), out_keys.begin(), out_vals.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::reduce_by_key(policy(alloc, launch), in_keys.begin(), + in_keys.end(), in_vals.begin(), + out_keys.begin(), out_vals.begin()); + }); } using key_types = nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); - }); + caching_allocator_t alloc; + thrust::exclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(), + in_vals.cbegin(), out_vals.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::exclusive_scan_by_key( + policy(alloc, launch), keys.cbegin(), keys.cend(), + in_vals.cbegin(), out_vals.begin()); + }); } using key_types = all_types; diff --git a/thrust/benchmarks/bench/scan/exclusive/max.cu b/thrust/benchmarks/bench/scan/exclusive/max.cu index a18a3c96cb1..4aeff00406e 100644 --- a/thrust/benchmarks/bench/scan/exclusive/max.cu +++ b/thrust/benchmarks/bench/scan/exclusive/max.cu @@ -43,9 +43,15 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin(), T{}, max_t{}); - }); + caching_allocator_t alloc; + thrust::exclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin(), T{}, max_t{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::exclusive_scan(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin(), T{}, + max_t{}); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types)) diff --git a/thrust/benchmarks/bench/scan/exclusive/sum.cu b/thrust/benchmarks/bench/scan/exclusive/sum.cu index 29b82b68a81..bb09f18666b 100644 --- a/thrust/benchmarks/bench/scan/exclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/exclusive/sum.cu @@ -43,9 +43,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin()); - }); + caching_allocator_t alloc; + thrust::exclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::exclusive_scan(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin()); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types)) diff --git a/thrust/benchmarks/bench/scan/inclusive/by_key.cu b/thrust/benchmarks/bench/scan/inclusive/by_key.cu index 10e0cbc408b..f9f1da4209d 100644 --- a/thrust/benchmarks/bench/scan/inclusive/by_key.cu +++ b/thrust/benchmarks/bench/scan/inclusive/by_key.cu @@ -45,9 +45,16 @@ static void scan(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::inclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); - }); + caching_allocator_t alloc; + thrust::inclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(), + in_vals.cbegin(), out_vals.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::inclusive_scan_by_key( + policy(alloc, launch), keys.cbegin(), keys.cend(), + in_vals.cbegin(), out_vals.begin()); + }); } using key_types = all_types; diff --git a/thrust/benchmarks/bench/scan/inclusive/max.cu b/thrust/benchmarks/bench/scan/inclusive/max.cu index 40d84942ec6..a0ec75c496f 100644 --- a/thrust/benchmarks/bench/scan/inclusive/max.cu +++ b/thrust/benchmarks/bench/scan/inclusive/max.cu @@ -43,9 +43,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin(), max_t{}); - }); + caching_allocator_t alloc; + thrust::inclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin(), max_t{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::inclusive_scan(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin(), max_t{}); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types)) diff --git a/thrust/benchmarks/bench/scan/inclusive/sum.cu b/thrust/benchmarks/bench/scan/inclusive/sum.cu index ea98b7bcf31..d208f357b67 100644 --- a/thrust/benchmarks/bench/scan/inclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/inclusive/sum.cu @@ -43,9 +43,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin()); - }); + caching_allocator_t alloc; + thrust::inclusive_scan(policy(alloc), input.cbegin(), input.cend(), output.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::inclusive_scan(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin()); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types)) diff --git a/thrust/benchmarks/bench/set_operations/base.cuh b/thrust/benchmarks/bench/set_operations/base.cuh index 9f5ab563ac0..d09d8000dc9 100644 --- a/thrust/benchmarks/bench/set_operations/base.cuh +++ b/thrust/benchmarks/bench/set_operations/base.cuh @@ -50,24 +50,22 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT op) thrust::sort(input.begin(), input.begin() + elements_in_A); thrust::sort(input.begin() + elements_in_A, input.end()); - const std::size_t elements_in_AB = thrust::distance(output.begin(), - op(input.cbegin(), - input.cbegin() + elements_in_A, - input.cbegin() + elements_in_A, - input.cend(), - output.begin())); + caching_allocator_t alloc; + const std::size_t elements_in_AB = thrust::distance( + output.begin(), + op(policy(alloc), input.cbegin(), input.cbegin() + elements_in_A, + input.cbegin() + elements_in_A, input.cend(), output.begin())); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - op(input.cbegin(), - input.cbegin() + elements_in_A, - input.cbegin() + elements_in_A, - input.cend(), - output.begin()); - }); + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + op(policy(alloc, launch), input.cbegin(), + input.cbegin() + elements_in_A, + input.cbegin() + elements_in_A, input.cend(), output.begin()); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/set_operations/by_key.cuh b/thrust/benchmarks/bench/set_operations/by_key.cuh index 6e71601f85c..15799d8b39f 100644 --- a/thrust/benchmarks/bench/set_operations/by_key.cuh +++ b/thrust/benchmarks/bench/set_operations/by_key.cuh @@ -53,14 +53,11 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT o thrust::sort(in_keys.begin(), in_keys.begin() + elements_in_A); thrust::sort(in_keys.begin() + elements_in_A, in_keys.end()); - auto result_ends = op(in_keys.cbegin(), - in_keys.cbegin() + elements_in_A, - in_keys.cbegin() + elements_in_A, - in_keys.cend(), - in_vals.cbegin(), - in_vals.cbegin() + elements_in_A, - out_keys.begin(), - out_vals.begin()); + caching_allocator_t alloc; + auto result_ends = + op(policy(alloc), in_keys.cbegin(), in_keys.cbegin() + elements_in_A, + in_keys.cbegin() + elements_in_A, in_keys.cend(), in_vals.cbegin(), + in_vals.cbegin() + elements_in_A, out_keys.begin(), out_vals.begin()); const std::size_t elements_in_AB = thrust::distance(out_keys.begin(), result_ends.first); @@ -70,16 +67,14 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT o state.add_global_memory_reads(OpT::read_all_values ? elements : elements_in_A); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - op(in_keys.cbegin(), - in_keys.cbegin() + elements_in_A, - in_keys.cbegin() + elements_in_A, - in_keys.cend(), - in_vals.cbegin(), - in_vals.cbegin() + elements_in_A, - out_keys.begin(), - out_vals.begin()); - }); + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + op(policy(alloc, launch), in_keys.cbegin(), + in_keys.cbegin() + elements_in_A, + in_keys.cbegin() + elements_in_A, in_keys.cend(), + in_vals.cbegin(), in_vals.cbegin() + elements_in_A, + out_keys.begin(), out_vals.begin()); + }); } using key_types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/set_operations/difference.cu b/thrust/benchmarks/bench/set_operations/difference.cu index 6f74d08bb77..6a55e8829aa 100644 --- a/thrust/benchmarks/bench/set_operations/difference.cu +++ b/thrust/benchmarks/bench/set_operations/difference.cu @@ -29,16 +29,18 @@ struct op_t { - template - __host__ OutputIterator operator()(InputIterator1 first1, + template + __host__ OutputIterator operator()(const PolicyT& policy, + InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) const { - return thrust::set_difference(first1, last1, first2, last2, result); + return thrust::set_difference(policy, first1, last1, first2, last2, result); } }; diff --git a/thrust/benchmarks/bench/set_operations/difference_by_key.cu b/thrust/benchmarks/bench/set_operations/difference_by_key.cu index f169fffbfc8..794061ddaba 100644 --- a/thrust/benchmarks/bench/set_operations/difference_by_key.cu +++ b/thrust/benchmarks/bench/set_operations/difference_by_key.cu @@ -31,14 +31,16 @@ struct op_t { static constexpr bool read_all_values = true; - template __host__ thrust::pair - operator()(InputIterator1 keys_first1, + operator()(const PolicyT& policy, + InputIterator1 keys_first1, InputIterator1 keys_last1, InputIterator2 keys_first2, InputIterator2 keys_last2, @@ -47,7 +49,8 @@ struct op_t OutputIterator1 keys_result, OutputIterator2 values_result) const { - return thrust::set_difference_by_key(keys_first1, + return thrust::set_difference_by_key(policy, + keys_first1, keys_last1, keys_first2, keys_last2, diff --git a/thrust/benchmarks/bench/set_operations/intersection.cu b/thrust/benchmarks/bench/set_operations/intersection.cu index b18755eb938..650a1f1ff57 100644 --- a/thrust/benchmarks/bench/set_operations/intersection.cu +++ b/thrust/benchmarks/bench/set_operations/intersection.cu @@ -29,16 +29,18 @@ struct op_t { - template - __host__ OutputIterator operator()(InputIterator1 first1, + template + __host__ OutputIterator operator()(const PolicyT& policy, + InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) const { - return thrust::set_intersection(first1, last1, first2, last2, result); + return thrust::set_intersection(policy, first1, last1, first2, last2, result); } }; diff --git a/thrust/benchmarks/bench/set_operations/intersection_by_key.cu b/thrust/benchmarks/bench/set_operations/intersection_by_key.cu index 8014757bc9b..d7022d72ddf 100644 --- a/thrust/benchmarks/bench/set_operations/intersection_by_key.cu +++ b/thrust/benchmarks/bench/set_operations/intersection_by_key.cu @@ -31,14 +31,16 @@ struct op_t { static constexpr bool read_all_values = false; - template __host__ thrust::pair - operator()(InputIterator1 keys_first1, + operator()(const PolicyT& policy, + InputIterator1 keys_first1, InputIterator1 keys_last1, InputIterator2 keys_first2, InputIterator2 keys_last2, @@ -47,7 +49,8 @@ struct op_t OutputIterator1 keys_result, OutputIterator2 values_result) const { - return thrust::set_intersection_by_key(keys_first1, + return thrust::set_intersection_by_key(policy, + keys_first1, keys_last1, keys_first2, keys_last2, diff --git a/thrust/benchmarks/bench/set_operations/symmetric_difference.cu b/thrust/benchmarks/bench/set_operations/symmetric_difference.cu index b8efc5204cd..411da07c6a6 100644 --- a/thrust/benchmarks/bench/set_operations/symmetric_difference.cu +++ b/thrust/benchmarks/bench/set_operations/symmetric_difference.cu @@ -29,16 +29,18 @@ struct op_t { - template - __host__ OutputIterator operator()(InputIterator1 first1, + template + __host__ OutputIterator operator()(const PolicyT& policy, + InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) const { - return thrust::set_symmetric_difference(first1, last1, first2, last2, result); + return thrust::set_symmetric_difference(policy, first1, last1, first2, last2, result); } }; diff --git a/thrust/benchmarks/bench/set_operations/symmetric_difference_by_key.cu b/thrust/benchmarks/bench/set_operations/symmetric_difference_by_key.cu index 2313dc3ce35..d324d0653a6 100644 --- a/thrust/benchmarks/bench/set_operations/symmetric_difference_by_key.cu +++ b/thrust/benchmarks/bench/set_operations/symmetric_difference_by_key.cu @@ -31,14 +31,16 @@ struct op_t { static constexpr bool read_all_values = true; - template __host__ thrust::pair - operator()(InputIterator1 keys_first1, + operator()(const PolicyT& policy, + InputIterator1 keys_first1, InputIterator1 keys_last1, InputIterator2 keys_first2, InputIterator2 keys_last2, @@ -47,7 +49,8 @@ struct op_t OutputIterator1 keys_result, OutputIterator2 values_result) const { - return thrust::set_symmetric_difference_by_key(keys_first1, + return thrust::set_symmetric_difference_by_key(policy, + keys_first1, keys_last1, keys_first2, keys_last2, diff --git a/thrust/benchmarks/bench/set_operations/union.cu b/thrust/benchmarks/bench/set_operations/union.cu index 173e35d6c93..40d357c3299 100644 --- a/thrust/benchmarks/bench/set_operations/union.cu +++ b/thrust/benchmarks/bench/set_operations/union.cu @@ -29,16 +29,18 @@ struct op_t { - template - __host__ OutputIterator operator()(InputIterator1 first1, + template + __host__ OutputIterator operator()(const PolicyT& policy, + InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) const { - return thrust::set_union(first1, last1, first2, last2, result); + return thrust::set_union(policy, first1, last1, first2, last2, result); } }; diff --git a/thrust/benchmarks/bench/set_operations/union_by_key.cu b/thrust/benchmarks/bench/set_operations/union_by_key.cu index 545d4164db2..3791aeea0dd 100644 --- a/thrust/benchmarks/bench/set_operations/union_by_key.cu +++ b/thrust/benchmarks/bench/set_operations/union_by_key.cu @@ -31,14 +31,16 @@ struct op_t { static constexpr bool read_all_values = true; - template __host__ thrust::pair - operator()(InputIterator1 keys_first1, + operator()(const PolicyT& policy, + InputIterator1 keys_first1, InputIterator1 keys_last1, InputIterator2 keys_first2, InputIterator2 keys_last2, @@ -47,7 +49,8 @@ struct op_t OutputIterator1 keys_result, OutputIterator2 values_result) const { - return thrust::set_union_by_key(keys_first1, + return thrust::set_union_by_key(policy, + keys_first1, keys_last1, keys_first2, keys_last2, diff --git a/thrust/benchmarks/bench/shuffle/basic.cu b/thrust/benchmarks/bench/shuffle/basic.cu index cc24d267851..e47bc84a964 100644 --- a/thrust/benchmarks/bench/shuffle/basic.cu +++ b/thrust/benchmarks/bench/shuffle/basic.cu @@ -44,8 +44,11 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(elements); auto do_engine = [&](auto &&engine_constructor) { - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::shuffle(data.begin(), data.end(), engine_constructor()); + caching_allocator_t alloc; + thrust::shuffle(policy(alloc), data.begin(), data.end(), engine_constructor()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & launch) { + thrust::shuffle(policy(alloc, launch), data.begin(), data.end(), engine_constructor()); }); }; diff --git a/thrust/benchmarks/bench/sort/keys.cu b/thrust/benchmarks/bench/sort/keys.cu index d52dd6e7d1e..cefea506125 100644 --- a/thrust/benchmarks/bench/sort/keys.cu +++ b/thrust/benchmarks/bench/sort/keys.cu @@ -45,11 +45,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); + caching_allocator_t alloc; + thrust::sort(policy(alloc), vec.begin(), vec.end()); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, - [&](nvbench::launch &/* launch */, auto &timer) { + [&](nvbench::launch &launch, auto &timer) { vec = input; timer.start(); - thrust::sort(vec.begin(), vec.end()); + thrust::sort(policy(alloc, launch), vec.begin(), vec.end()); timer.stop(); }); } diff --git a/thrust/benchmarks/bench/sort/keys_custom.cu b/thrust/benchmarks/bench/sort/keys_custom.cu index f1eb8c2fdf1..868f3717745 100644 --- a/thrust/benchmarks/bench/sort/keys_custom.cu +++ b/thrust/benchmarks/bench/sort/keys_custom.cu @@ -45,11 +45,14 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); + caching_allocator_t alloc; + thrust::sort(policy(alloc), vec.begin(), vec.end(), less_t{}); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, - [&](nvbench::launch & /* launch */, auto &timer) { + [&](nvbench::launch & launch, auto &timer) { vec = input; timer.start(); - thrust::sort(vec.begin(), vec.end(), less_t{}); + thrust::sort(policy(alloc, launch), vec.begin(), vec.end(), less_t{}); timer.stop(); }); } diff --git a/thrust/benchmarks/bench/sort/pairs.cu b/thrust/benchmarks/bench/sort/pairs.cu index 9d2f06b2f5c..bec53d3f588 100644 --- a/thrust/benchmarks/bench/sort/pairs.cu +++ b/thrust/benchmarks/bench/sort/pairs.cu @@ -26,7 +26,6 @@ ******************************************************************************/ #include -#include #include #include "nvbench_helper.cuh" @@ -49,12 +48,15 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(elements); state.add_global_memory_writes(elements); + caching_allocator_t alloc; + thrust::sort_by_key(policy(alloc), keys.begin(), keys.end(), vals.begin()); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, - [&](nvbench::launch & /* launch */, auto &timer) { + [&](nvbench::launch & launch, auto &timer) { keys = in_keys; vals = in_vals; timer.start(); - thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); + thrust::sort_by_key(policy(alloc, launch), keys.begin(), keys.end(), vals.begin()); timer.stop(); }); } diff --git a/thrust/benchmarks/bench/sort/pairs_custom.cu b/thrust/benchmarks/bench/sort/pairs_custom.cu index bb731e03c6f..7840a138b67 100644 --- a/thrust/benchmarks/bench/sort/pairs_custom.cu +++ b/thrust/benchmarks/bench/sort/pairs_custom.cu @@ -49,12 +49,15 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(elements); state.add_global_memory_writes(elements); + caching_allocator_t alloc; + thrust::sort_by_key(policy(alloc), keys.begin(), keys.end(), vals.begin(), less_t{}); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, - [&](nvbench::launch & /* launch */, auto &timer) { + [&](nvbench::launch & launch, auto &timer) { keys = in_keys; vals = in_vals; timer.start(); - thrust::sort_by_key(keys.begin(), keys.end(), vals.begin(), less_t{}); + thrust::sort_by_key(policy(alloc, launch), keys.begin(), keys.end(), vals.begin(), less_t{}); timer.stop(); }); } diff --git a/thrust/benchmarks/bench/transform_reduce/sum.cu b/thrust/benchmarks/bench/transform_reduce/sum.cu index d9b2005d31e..94da31fbac1 100644 --- a/thrust/benchmarks/bench/transform_reduce/sum.cu +++ b/thrust/benchmarks/bench/transform_reduce/sum.cu @@ -51,9 +51,17 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - do_not_optimize(thrust::transform_reduce(in.begin(), in.end(), square_t{}, T{}, thrust::plus{})); - }); + caching_allocator_t alloc; + do_not_optimize(thrust::transform_reduce(policy(alloc), in.begin(), in.end(), + square_t{}, T{}, + thrust::plus{})); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + do_not_optimize(thrust::transform_reduce( + policy(alloc, launch), in.begin(), in.end(), square_t{}, + T{}, thrust::plus{})); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) diff --git a/thrust/benchmarks/bench/unique/basic.cu b/thrust/benchmarks/bench/unique/basic.cu index 2f01fb30453..9b1880459d4 100644 --- a/thrust/benchmarks/bench/unique/basic.cu +++ b/thrust/benchmarks/bench/unique/basic.cu @@ -43,17 +43,20 @@ static void basic(nvbench::state &state, nvbench::type_list) generate.uniform.key_segments(elements, min_segment_size, max_segment_size); thrust::device_vector output(elements); - const std::size_t unique_items = - thrust::distance(output.begin(), - thrust::unique_copy(input.cbegin(), input.cend(), output.begin())); + caching_allocator_t alloc; + const std::size_t unique_items = thrust::distance( + output.begin(), thrust::unique_copy(policy(alloc), input.cbegin(), + input.cend(), output.begin())); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_items); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::unique_copy(input.cbegin(), input.cend(), output.begin()); - }); + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::unique_copy(policy(alloc, launch), input.cbegin(), + input.cend(), output.begin()); + }); } NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) diff --git a/thrust/benchmarks/bench/unique/by_key.cu b/thrust/benchmarks/bench/unique/by_key.cu index ed43e64cb9b..f1f70ae0050 100644 --- a/thrust/benchmarks/bench/unique/by_key.cu +++ b/thrust/benchmarks/bench/unique/by_key.cu @@ -44,9 +44,10 @@ static void basic(nvbench::state &state, nvbench::type_list) thrust::device_vector out_keys(elements); thrust::device_vector in_vals(elements); - const std::size_t unique_elements = - thrust::distance(out_keys.begin(), - thrust::unique_copy(in_keys.cbegin(), in_keys.cend(), out_keys.begin())); + caching_allocator_t alloc; + const std::size_t unique_elements = thrust::distance( + out_keys.begin(), thrust::unique_copy(policy(alloc), in_keys.cbegin(), + in_keys.cend(), out_keys.begin())); thrust::device_vector out_vals(unique_elements); @@ -56,13 +57,12 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_elements); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::unique_by_key_copy(in_keys.cbegin(), - in_keys.cend(), - in_vals.cbegin(), - out_keys.begin(), - out_vals.begin()); - }); + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::unique_by_key_copy( + policy(alloc, launch), in_keys.cbegin(), in_keys.cend(), + in_vals.cbegin(), out_keys.begin(), out_vals.begin()); + }); } using key_types = nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::binary_search(data.begin(), - data.begin() + elements, - data.begin() + elements, - data.end(), - result.begin()); - }); + caching_allocator_t alloc; + thrust::binary_search(policy(alloc), + data.begin(), + data.begin() + elements, + data.begin() + elements, + data.end(), + result.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::binary_search( + policy(alloc, launch), data.begin(), data.begin() + elements, + data.begin() + elements, data.end(), result.begin()); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu index e3fbd6e6cbd..068aa7c322f 100644 --- a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu @@ -46,13 +46,20 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::lower_bound(data.begin(), - data.begin() + elements, - data.begin() + elements, - data.end(), - result.begin()); - }); + caching_allocator_t alloc; + thrust::lower_bound(policy(alloc), + data.begin(), + data.begin() + elements, + data.begin() + elements, + data.end(), + result.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::lower_bound( + policy(alloc, launch), data.begin(), data.begin() + elements, + data.begin() + elements, data.end(), result.begin()); + }); } using types = nvbench::type_list; diff --git a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu index 6b412ca299c..c92a7416e2f 100644 --- a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu @@ -46,13 +46,16 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { - thrust::upper_bound(data.begin(), - data.begin() + elements, - data.begin() + elements, - data.end(), - result.begin()); - }); + caching_allocator_t alloc; + thrust::upper_bound(policy(alloc), data.begin(), data.begin() + elements, + data.begin() + elements, data.end(), result.begin()); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::upper_bound( + policy(alloc, launch), data.begin(), data.begin() + elements, + data.begin() + elements, data.end(), result.begin()); + }); } using types = nvbench::type_list; diff --git a/thrust/cmake/ThrustBuildCompilerTargets.cmake b/thrust/cmake/ThrustBuildCompilerTargets.cmake index 0473b3a5057..f65cb1bd95b 100644 --- a/thrust/cmake/ThrustBuildCompilerTargets.cmake +++ b/thrust/cmake/ThrustBuildCompilerTargets.cmake @@ -26,9 +26,6 @@ function(thrust_build_compiler_targets) thrust_update_system_found_flags() - # Ensure that we build our tests without treating ourself as system header - list(APPEND cxx_compile_definitions "_CCCL_NO_SYSTEM_HEADER") - if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") append_option_if_available("/W4" cxx_compile_options) diff --git a/thrust/testing/CMakeLists.txt b/thrust/testing/CMakeLists.txt index 7f212b32422..cc95325b2ef 100644 --- a/thrust/testing/CMakeLists.txt +++ b/thrust/testing/CMakeLists.txt @@ -79,6 +79,9 @@ function(thrust_add_test target_name_var test_name test_src thrust_target) target_compile_definitions(${test_target} PRIVATE THRUST_TEST_DEVICE_SIDE) endif() + # Ensure that we build our tests without treating ourself as system header + target_compile_definitions(${test_target} PRIVATE "_CCCL_NO_SYSTEM_HEADER") + thrust_fix_clang_nvcc_build_for(${test_target}) # Add to the active configuration's meta target