Skip to content

Commit

Permalink
Revert "Rename CUDA_TRY macro to CUDF_CUDA_TRY, rename CHECK_CUDA mac…
Browse files Browse the repository at this point in the history
…ro to CUDF_CHECK_CUDA. (rapidsai#10589)"

This reverts commit e8d189c.
  • Loading branch information
abellina committed Apr 14, 2022
1 parent ce56bc3 commit 04bab9e
Show file tree
Hide file tree
Showing 77 changed files with 452 additions and 485 deletions.
6 changes: 3 additions & 3 deletions cpp/benchmarks/column/concatenate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static void BM_concatenate(benchmark::State& state)
auto input_columns = input->view();
std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
Expand Down Expand Up @@ -87,7 +87,7 @@ static void BM_concatenate_tables(benchmark::State& state)
return table->view();
});

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
Expand Down Expand Up @@ -146,7 +146,7 @@ static void BM_concatenate_strings(benchmark::State& state)
return static_cast<cudf::column_view>(col);
});

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
Expand Down
16 changes: 8 additions & 8 deletions cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -139,31 +139,31 @@ void generate_input_tables(key_type* const build_tbl,

// Maximize exposed parallelism while minimizing storage for curand state
int num_blocks_init_build_tbl{-1};
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_init_build_tbl, init_build_tbl<key_type, size_type>, block_size, 0));

int num_blocks_init_probe_tbl{-1};
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));

int dev_id{-1};
CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
CUDA_TRY(cudaGetDevice(&dev_id));

int num_sms{-1};
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));

const int num_states =
num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
rmm::device_uvector<curandState> devStates(num_states, rmm::cuda_stream_default);

init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

init_build_tbl<key_type, size_type><<<num_sms * num_blocks_init_build_tbl, block_size>>>(
build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states);

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

auto const rand_max = std::numeric_limits<key_type>::max();

Expand All @@ -177,5 +177,5 @@ void generate_input_tables(key_type* const build_tbl,
devStates.data(),
num_states);

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);
}
2 changes: 1 addition & 1 deletion cpp/benchmarks/join/join_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ static void BM_join(state_type& state, Join JoinFunc)
auto build_payload_column = cudf::sequence(build_table_size, *init);
auto probe_payload_column = cudf::sequence(probe_table_size, *init);

CUDF_CHECK_CUDA(0);
CHECK_CUDA(0);

cudf::table_view build_table({build_key_column->view(), *build_payload_column});
cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column});
Expand Down
24 changes: 12 additions & 12 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,32 +29,32 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&current_device));
CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
const int memset_value = 0;
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
CUDF_CUDA_TRY(
CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}

CUDF_CUDA_TRY(cudaEventCreate(&start));
CUDF_CUDA_TRY(cudaEventCreate(&stop));
CUDF_CUDA_TRY(cudaEventRecord(start, stream.value()));
CUDA_TRY(cudaEventCreate(&start));
CUDA_TRY(cudaEventCreate(&stop));
CUDA_TRY(cudaEventRecord(start, stream.value()));
}

cuda_event_timer::~cuda_event_timer()
{
CUDF_CUDA_TRY(cudaEventRecord(stop, stream.value()));
CUDF_CUDA_TRY(cudaEventSynchronize(stop));
CUDA_TRY(cudaEventRecord(stop, stream.value()));
CUDA_TRY(cudaEventSynchronize(stop));

float milliseconds = 0.0f;
CUDF_CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
p_state->SetIterationTime(milliseconds / (1000.0f));
CUDF_CUDA_TRY(cudaEventDestroy(start));
CUDF_CUDA_TRY(cudaEventDestroy(stop));
CUDA_TRY(cudaEventDestroy(start));
CUDA_TRY(cudaEventDestroy(stop));
}
4 changes: 2 additions & 2 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -196,13 +196,13 @@ void type_dispatcher_benchmark(::benchmark::State& state)
rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);

if (dispatching_type == NO_DISPATCHING) {
CUDF_CUDA_TRY(cudaMemcpy(
CUDA_TRY(cudaMemcpy(
d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
}

// Warm up
launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
CUDF_CUDA_TRY(cudaDeviceSynchronize());
CUDA_TRY(cudaDeviceSynchronize());

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
Expand Down
4 changes: 2 additions & 2 deletions cpp/docs/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ namespace detail{
void external_function(..., rmm::cuda_stream_view stream){
// Implementation uses the stream with async APIs.
rmm::device_buffer buff(...,stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream), ...);
}
Expand Down Expand Up @@ -777,7 +777,7 @@ CUDF_FAIL("This code path should not be reached.");

### CUDA Error Checking

Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
thrown exception includes a description of the CUDA error code in its `what()` message.

Expand Down
12 changes: 6 additions & 6 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,10 @@ struct scatter_gather_functor {
if (output.nullable()) {
// Have to initialize the output mask to all zeros because we may update
// it with atomicOr().
CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
0,
cudf::bitmask_allocation_size_bytes(output.size()),
stream.value()));
CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
0,
cudf::bitmask_allocation_size_bytes(output.size()),
stream.value()));
}

auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
Expand Down Expand Up @@ -344,7 +344,7 @@ std::unique_ptr<table> copy_if(

// initialize just the first element of block_offsets to 0 since the InclusiveSum below
// starts at the second element.
CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));

// 2. Find the offset for each block's output using a scan of block counts
if (grid.num_blocks > 1) {
Expand All @@ -370,7 +370,7 @@ std::unique_ptr<table> copy_if(
// As it is InclusiveSum, last value in block_offsets will be output_size
// unless num_blocks == 1, in which case output_size is just block_counts[0]
cudf::size_type output_size{0};
CUDF_CUDA_TRY(cudaMemcpyAsync(
CUDA_TRY(cudaMemcpyAsync(
&output_size,
grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
sizeof(cudf::size_type),
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_range.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
nullptr);
}

CUDF_CHECK_CUDA(stream.value());
CHECK_CUDA(stream.value());
}

/**
Expand Down
12 changes: 6 additions & 6 deletions cpp/include/cudf/detail/get_value.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,11 +49,11 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
"invalid element_index value");
T result;
CUDF_CUDA_TRY(cudaMemcpyAsync(&result,
col_view.data<T>() + element_index,
sizeof(T),
cudaMemcpyDeviceToHost,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(&result,
col_view.data<T>() + element_index,
sizeof(T),
cudaMemcpyDeviceToHost,
stream.value()));
stream.synchronize();
return result;
}
Expand Down
50 changes: 26 additions & 24 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -170,20 +170,20 @@ size_type inplace_bitmask_binop(
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

CUDF_CUDA_TRY(cudaMemcpyAsync(
CUDA_TRY(cudaMemcpyAsync(
d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
masks_begin_bits.data(),
masks_begin_bits.size_bytes(),
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
masks_begin_bits.data(),
masks_begin_bits.size_bytes(),
cudaMemcpyHostToDevice,
stream.value()));

auto constexpr block_size = 256;
cudf::detail::grid_1d config(dest_mask.size(), block_size);
offset_bitmask_binop<block_size>
<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
CUDF_CHECK_CUDA(stream.value());
CHECK_CUDA(stream.value());
return d_counter.value(stream);
}

Expand Down Expand Up @@ -298,25 +298,27 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,

// Allocate temporary memory.
size_t temp_storage_bytes{0};
CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
temp_storage_bytes,
num_set_bits_in_word,
d_bit_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
temp_storage_bytes,
num_set_bits_in_word,
d_bit_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

// Perform segmented reduction.
CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
temp_storage_bytes,
num_set_bits_in_word,
d_bit_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
temp_storage_bytes,
num_set_bits_in_word,
d_bit_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));

CHECK_CUDA(stream.value());

// Adjust counts in segment boundaries (if segments are not word-aligned).
constexpr size_type block_size{256};
Expand Down Expand Up @@ -348,7 +350,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
});
}

CUDF_CHECK_CUDA(stream.value());
CHECK_CUDA(stream.value());
return d_bit_counts;
}

Expand Down
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,12 @@ cudf::size_type elements_per_thread(Kernel kernel,

// calculate theoretical occupancy
int max_blocks = 0;
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));

int device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&device));
CUDA_TRY(cudaGetDevice(&device));
int num_sms = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
int per_thread = total_size / (max_blocks * num_sms * block_size);
return std::clamp(per_thread, 1, max_per_thread);
}
Expand Down
Loading

0 comments on commit 04bab9e

Please sign in to comment.