NVIDIA · gevtushenko · Dec 12, 2023 · Dec 10, 2023
@@ -1,14 +1,16 @@
 #pragma once
 
-#include <cuda/std/complex>
-
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+
+#include <cuda/std/complex>
+#include <cuda/std/span>
 
 #include <limits>
+#include <map>
 #include <stdexcept>
 
 #include <nvbench/nvbench.cuh>
-#include <cuda/std/span>
 
 #if defined(_MSC_VER)
 #define NVBENCH_HELPER_HAS_I128 0
@@ -478,3 +480,118 @@ struct max_t
     return less(lhs, rhs) ? rhs : lhs;
   }
 };
+
+namespace
+{
+struct caching_allocator_t
+{
+  using value_type = char;
+
+  caching_allocator_t() = default;
+  ~caching_allocator_t()
+  {
+    free_all();
+  }
+
+  char* allocate(std::ptrdiff_t num_bytes)
+  {
+    value_type* result{};
+    auto free_block = free_blocks.find(num_bytes);
+
+    if (free_block != free_blocks.end())
+    {
+      result = free_block->second;
+      free_blocks.erase(free_block);
+    }
+    else
+    {
+      result = do_allocate(num_bytes);
+    }
+
+    allocated_blocks.insert(std::make_pair(result, num_bytes));
+    return result;
+  }
+
+  void deallocate(char* ptr, size_t)
+  {
+    auto iter = allocated_blocks.find(ptr);
+    if (iter == allocated_blocks.end())
+    {
+      throw std::runtime_error("Memory was not allocated by this allocator");
+    }
+
+    std::ptrdiff_t num_bytes = iter->second;
+    allocated_blocks.erase(iter);
+    free_blocks.insert(std::make_pair(num_bytes, ptr));
+  }
+
+private:
+  using free_blocks_type      = std::multimap<std::ptrdiff_t, char*>;
+  using allocated_blocks_type = std::map<char*, std::ptrdiff_t>;
+
+  free_blocks_type free_blocks;
+  allocated_blocks_type allocated_blocks;
+
+  void free_all()
+  {
+    for (auto i : free_blocks)
+    {
+      do_deallocate(i.second);
+    }
+
+    for (auto i : allocated_blocks)
+    {
+      do_deallocate(i.first);
+    }
+  }
+
+  value_type* do_allocate(std::size_t num_bytes)
+  {
+    value_type* result{};
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    const cudaError_t status = cudaMalloc(&result, num_bytes);
+    if (cudaSuccess != status)
+    {
+      throw std::runtime_error(std::string("Failed to allocate device memory: ") + cudaGetErrorString(status));
+    }
+#else
+    result = new value_type[num_bytes];
+#endif
+    return result;
+  }
+
+  void do_deallocate(value_type* ptr)
+  {
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    cudaFree(ptr);
+#else
+    delete[] ptr;
+#endif
+  }
+};
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+auto policy(caching_allocator_t& alloc)
+{
+  return thrust::cuda::par(alloc);
+}
+#else
+auto policy(caching_allocator_t&)
+{
+  return thrust::device;
+}
+#endif
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+auto policy(caching_allocator_t& alloc, nvbench::launch& launch)
+{
+  return thrust::cuda::par(alloc).on(launch.get_stream());
+}
+#else
+auto policy(caching_allocator_t&, nvbench::launch&)
+{
+  return thrust::device;
+}
+#endif
+
+} // namespace
@@ -43,8 +43,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin());
+  caching_allocator_t alloc;
+  thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin());
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
   });
 }
 

@@ -49,8 +49,7 @@ struct custom_op
 };
 
 template <typename T>
-static void basic(nvbench::state &state, nvbench::type_list<T>)
-{
+static void basic(nvbench::state &state, nvbench::type_list<T>) {
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
 
   thrust::device_vector<T> input = generate(elements);
@@ -60,9 +59,16 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});
-  });
+  caching_allocator_t alloc;
+  thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(),
+                              output.begin(), custom_op<T>{42});
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::adjacent_difference(policy(alloc, launch),
+                                           input.cbegin(), input.cend(),
+                                           output.begin(), custom_op<T>{42});
+             });
 }
 
 using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;

@@ -42,9 +42,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin());
-  });
+  caching_allocator_t alloc;
+  thrust::adjacent_difference(policy(alloc), vec.begin(), vec.end(), vec.begin());
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::adjacent_difference(policy(alloc, launch), vec.begin(),
+                                           vec.end(), vec.begin());
+             });
 }
 
 using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;

@@ -25,7 +25,7 @@
  *
  ******************************************************************************/
 
-#include <nvbench/nvbench.cuh>
+#include <nvbench_helper.cuh>
 
 #include <thrust/count.h>
 #include <thrust/device_vector.h>
@@ -45,11 +45,14 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::copy(input.cbegin(),
-                 input.cend(),
-                 output.begin());
-  });
+  caching_allocator_t alloc;
+  thrust::copy(policy(alloc), input.cbegin(), input.cend(), output.begin());
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::copy(policy(alloc, launch), input.cbegin(), input.cend(),
+                            output.begin());
+             });
 }
 
 using types = nvbench::type_list<nvbench::uint8_t,

@@ -74,9 +74,14 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(selected_elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op);
-  });
+  caching_allocator_t alloc;
+  thrust::copy_if(policy(alloc), input.cbegin(), input.cend(), output.begin(), select_op);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::copy_if(policy(alloc, launch), input.cbegin(),
+                               input.cend(), output.begin(), select_op);
+             });
 }
 
 using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;

@@ -41,9 +41,13 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_element_count(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::fill(output.begin(), output.end(), T{42});
-  });
+  caching_allocator_t alloc;
+  thrust::fill(policy(alloc), output.begin(), output.end(), T{42});
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::fill(policy(alloc, launch), output.begin(), output.end(), T{42});
+             });
 }
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))

@@ -44,9 +44,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements * 2);
   state.add_global_memory_writes<T>(1);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0});
-  });
+  caching_allocator_t alloc;
+  thrust::inner_product(policy(alloc), lhs.begin(), lhs.end(), rhs.begin(), T{0});
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::inner_product(policy(alloc, launch), lhs.begin(),
+                                     lhs.end(), rhs.begin(), T{0});
+             });
 }
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types))

@@ -50,13 +50,17 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::merge(in.cbegin(),
-                  in.cbegin() + elements_in_lhs,
-                  in.cbegin() + elements_in_lhs,
-                  in.cend(),
-                  out.begin());
-  });
+  caching_allocator_t alloc;
+  thrust::merge(policy(alloc), in.cbegin(), in.cbegin() + elements_in_lhs,
+                in.cbegin() + elements_in_lhs, in.cend(), out.begin());
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::merge(policy(alloc, launch), in.cbegin(),
+                             in.cbegin() + elements_in_lhs,
+                             in.cbegin() + elements_in_lhs, in.cend(),
+                             out.begin());
+             });
 }
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))

@@ -72,13 +72,19 @@ static void basic(nvbench::state &state,
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::partition_copy(input.cbegin(),
-                           input.cend(),
-                           output.begin(),
-                           thrust::make_reverse_iterator(output.begin() + elements),
-                           select_op);
-  });
+  caching_allocator_t alloc;
+  thrust::partition_copy(
+      policy(alloc), input.cbegin(), input.cend(), output.begin(),
+      thrust::make_reverse_iterator(output.begin() + elements), select_op);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::partition_copy(
+                   policy(alloc, launch), input.cbegin(), input.cend(),
+                   output.begin(),
+                   thrust::make_reverse_iterator(output.begin() + elements),
+                   select_op);
+             });
 }
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))

@@ -42,8 +42,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(1);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    do_not_optimize(thrust::reduce(in.begin(), in.end()));
+  caching_allocator_t alloc;
+  do_not_optimize(thrust::reduce(policy(alloc), in.begin(), in.end()));
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & launch) {
+    do_not_optimize(thrust::reduce(policy(alloc, launch), in.begin(), in.end()));
   });
 }
 

@@ -57,13 +57,16 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_writes<KeyT>(unique_keys);
   state.add_global_memory_writes<ValueT>(unique_keys);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::reduce_by_key(in_keys.begin(),
-                          in_keys.end(),
-                          in_vals.begin(),
-                          out_keys.begin(),
-                          out_vals.begin());
-  });
+  caching_allocator_t alloc;
+  thrust::reduce_by_key(policy(alloc), in_keys.begin(), in_keys.end(),
+                        in_vals.begin(), out_keys.begin(), out_vals.begin());
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::reduce_by_key(policy(alloc, launch), in_keys.begin(),
+                                     in_keys.end(), in_vals.begin(),
+                                     out_keys.begin(), out_vals.begin());
+             });
 }
 
 using key_types = nvbench::type_list<int8_t,

@@ -45,9 +45,16 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
   state.add_global_memory_reads<ValueT>(elements);
   state.add_global_memory_writes<ValueT>(elements);
 
-  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
-    thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
-  });
+  caching_allocator_t alloc;
+  thrust::exclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(),
+                                in_vals.cbegin(), out_vals.begin());
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::exclusive_scan_by_key(
+                   policy(alloc, launch), keys.cbegin(), keys.cend(),
+                   in_vals.cbegin(), out_vals.begin());
+             });
 }
 
 using key_types = all_types;