Revert "Rename CUDA_TRY macro to CUDF_CUDA_TRY, rename CHECK_CUDA mac…

…ro to CUDF_CHECK_CUDA. (rapidsai#10589)" This reverts commit e8d189c.
abellina · Apr 14, 2022 · 04bab9e · 04bab9e
1 parent ce56bc3
commit 04bab9e
Show file tree

Hide file tree

Showing 77 changed files with 452 additions and 485 deletions.
diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
@@ -45,7 +45,7 @@ static void BM_concatenate(benchmark::State& state)
   auto input_columns = input->view();
   std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -87,7 +87,7 @@ static void BM_concatenate_tables(benchmark::State& state)
     return table->view();
   });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -146,7 +146,7 @@ static void BM_concatenate_strings(benchmark::State& state)
       return static_cast<cudf::column_view>(col);
     });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);

diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -139,31 +139,31 @@ void generate_input_tables(key_type* const build_tbl,
 
   // Maximize exposed parallelism while minimizing storage for curand state
   int num_blocks_init_build_tbl{-1};
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_build_tbl, init_build_tbl<key_type, size_type>, block_size, 0));
 
   int num_blocks_init_probe_tbl{-1};
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));
 
   int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
+  CUDA_TRY(cudaGetDevice(&dev_id));
 
   int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
   const int num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
   rmm::device_uvector<curandState> devStates(num_states, rmm::cuda_stream_default);
 
   init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   init_build_tbl<key_type, size_type><<<num_sms * num_blocks_init_build_tbl, block_size>>>(
     build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   auto const rand_max = std::numeric_limits<key_type>::max();
 
@@ -177,5 +177,5 @@ void generate_input_tables(key_type* const build_tbl,
                                                           devStates.data(),
                                                           num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 }
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
@@ -116,7 +116,7 @@ static void BM_join(state_type& state, Join JoinFunc)
   auto build_payload_column = cudf::sequence(build_table_size, *init);
   auto probe_payload_column = cudf::sequence(probe_table_size, *init);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   cudf::table_view build_table({build_key_column->view(), *build_payload_column});
   cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column});

diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,32 +29,32 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
   // flush all of L2$
   if (flush_l2_cache) {
     int current_device = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
+    CUDA_TRY(cudaGetDevice(&current_device));
 
     int l2_cache_bytes = 0;
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+    CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
 
     if (l2_cache_bytes > 0) {
       const int memset_value = 0;
       rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
     }
   }
 
-  CUDF_CUDA_TRY(cudaEventCreate(&start));
-  CUDF_CUDA_TRY(cudaEventCreate(&stop));
-  CUDF_CUDA_TRY(cudaEventRecord(start, stream.value()));
+  CUDA_TRY(cudaEventCreate(&start));
+  CUDA_TRY(cudaEventCreate(&stop));
+  CUDA_TRY(cudaEventRecord(start, stream.value()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  CUDF_CUDA_TRY(cudaEventRecord(stop, stream.value()));
-  CUDF_CUDA_TRY(cudaEventSynchronize(stop));
+  CUDA_TRY(cudaEventRecord(stop, stream.value()));
+  CUDA_TRY(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0f;
-  CUDF_CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
+  CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
   p_state->SetIterationTime(milliseconds / (1000.0f));
-  CUDF_CUDA_TRY(cudaEventDestroy(start));
-  CUDF_CUDA_TRY(cudaEventDestroy(stop));
+  CUDA_TRY(cudaEventDestroy(start));
+  CUDA_TRY(cudaEventDestroy(stop));
 }
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -196,13 +196,13 @@ void type_dispatcher_benchmark(::benchmark::State& state)
   rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);
 
   if (dispatching_type == NO_DISPATCHING) {
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
   }
 
   // Warm up
   launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0

diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md
@@ -373,7 +373,7 @@ namespace detail{
     void external_function(..., rmm::cuda_stream_view stream){
         // Implementation uses the stream with async APIs.
         rmm::device_buffer buff(...,stream);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
+        CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
         kernel<<<..., stream>>>(...);
         thrust::algorithm(rmm::exec_policy(stream), ...);
     }
@@ -777,7 +777,7 @@ CUDF_FAIL("This code path should not be reached.");
 
 ### CUDA Error Checking
 
-Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
+Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
 macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
 thrown exception includes a description of the CUDA error code in its `what()` message.
 

diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
@@ -254,10 +254,10 @@ struct scatter_gather_functor {
     if (output.nullable()) {
       // Have to initialize the output mask to all zeros because we may update
       // it with atomicOr().
-      CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
-                                    0,
-                                    cudf::bitmask_allocation_size_bytes(output.size()),
-                                    stream.value()));
+      CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
+                               0,
+                               cudf::bitmask_allocation_size_bytes(output.size()),
+                               stream.value()));
     }
 
     auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
@@ -344,7 +344,7 @@ std::unique_ptr<table> copy_if(
 
   // initialize just the first element of block_offsets to 0 since the InclusiveSum below
   // starts at the second element.
-  CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
 
   // 2. Find the offset for each block's output using a scan of block counts
   if (grid.num_blocks > 1) {
@@ -370,7 +370,7 @@ std::unique_ptr<table> copy_if(
   // As it is InclusiveSum, last value in block_offsets will be output_size
   // unless num_blocks == 1, in which case output_size is just block_counts[0]
   cudf::size_type output_size{0};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     &output_size,
     grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
     sizeof(cudf::size_type),

diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
       nullptr);
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 }
 
 /**

diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,11 +49,11 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
   CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
                "invalid element_index value");
   T result;
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&result,
-                                col_view.data<T>() + element_index,
-                                sizeof(T),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(&result,
+                           col_view.data<T>() + element_index,
+                           sizeof(T),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
   return result;
 }

diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
@@ -170,20 +170,20 @@ size_type inplace_bitmask_binop(
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                                masks_begin_bits.data(),
-                                masks_begin_bits.size_bytes(),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                           masks_begin_bits.data(),
+                           masks_begin_bits.size_bytes(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);
   offset_bitmask_binop<block_size>
     <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
   return d_counter.value(stream);
 }
 
@@ -298,25 +298,27 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
 
   // Allocate temporary memory.
   size_t temp_storage_bytes{0};
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
   rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
 
   // Perform segmented reduction.
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
+
+  CHECK_CUDA(stream.value());
 
   // Adjust counts in segment boundaries (if segments are not word-aligned).
   constexpr size_type block_size{256};
@@ -348,7 +350,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
                       });
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
   return d_bit_counts;
 }
 

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -127,12 +127,12 @@ cudf::size_type elements_per_thread(Kernel kernel,
 
   // calculate theoretical occupancy
   int max_blocks = 0;
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
 
   int device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  CUDA_TRY(cudaGetDevice(&device));
   int num_sms = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
   int per_thread = total_size / (max_blocks * num_sms * block_size);
   return std::clamp(per_thread, 1, max_per_thread);
 }