gpu_hist performance tweaks #5707

Merged (3 commits) on May 29, 2020
30 changes: 0 additions & 30 deletions src/common/device_helpers.cuh
@@ -427,36 +427,6 @@ class TemporaryArray {
size_t size_;
};

/**
* \brief A double buffer, useful for algorithms like sort.
*/
template <typename T>
class DoubleBuffer {
public:
cub::DoubleBuffer<T> buff;
xgboost::common::Span<T> a, b;
DoubleBuffer() = default;
template <typename VectorT>
DoubleBuffer(VectorT *v1, VectorT *v2) {
a = xgboost::common::Span<T>(v1->data().get(), v1->size());
b = xgboost::common::Span<T>(v2->data().get(), v2->size());
buff = cub::DoubleBuffer<T>(a.data(), b.data());
}

size_t Size() const {
CHECK_EQ(a.size(), b.size());
return a.size();
}
cub::DoubleBuffer<T> &CubBuffer() { return buff; }

T *Current() { return buff.Current(); }
xgboost::common::Span<T> CurrentSpan() {
return xgboost::common::Span<T>{buff.Current(), Size()};
}

T *Other() { return buff.Alternate(); }
};

/**
* \brief Copies device span to std::vector.
*
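For context on the removal above: dh::DoubleBuffer was a thin wrapper around cub::DoubleBuffer that held two spans and exposed Current()/Alternate(). A minimal sketch of the underlying CUB primitive, illustrative only and not part of this PR:

#include <cub/cub.cuh>

// cub::DoubleBuffer lets CUB's radix sort ping-pong between two buffers and
// record which one ends up holding the result, so the caller reads it back
// through Current().
void SortKeysExample(int* d_buf_a, int* d_buf_b, int num_items) {
  cub::DoubleBuffer<int> keys(d_buf_a, d_buf_b);

  // First call with a null temp pointer only queries the required scratch size.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys, num_items);

  // keys.Current() now points at the sorted keys; keys.Alternate() is scratch.
  cudaFree(d_temp_storage);
}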
51 changes: 25 additions & 26 deletions src/tree/gpu_hist/row_partitioner.cu
@@ -93,26 +93,23 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
position.size(), stream);
}

void Reset(int device_idx, common::Span<RowPartitioner::RowIndexT> ridx,
common::Span<bst_node_t> position) {
CHECK_EQ(ridx.size(), position.size());
dh::LaunchN(device_idx, ridx.size(), [=] __device__(size_t idx) {
ridx[idx] = idx;
position[idx] = 0;
});
}

RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx) {
: device_idx_(device_idx), ridx_a_(num_rows), position_a_(num_rows) {
dh::safe_cuda(cudaSetDevice(device_idx_));
ridx_a_.resize(num_rows);
ridx_b_.resize(num_rows);
position_a_.resize(num_rows);
position_b_.resize(num_rows);
ridx_ = dh::DoubleBuffer<RowIndexT>{&ridx_a_, &ridx_b_};
position_ = dh::DoubleBuffer<bst_node_t>{&position_a_, &position_b_};
ridx_segments_.emplace_back(Segment(0, num_rows));

thrust::sequence(
thrust::device_pointer_cast(ridx_.CurrentSpan().data()),
thrust::device_pointer_cast(ridx_.CurrentSpan().data() + ridx_.Size()));
thrust::fill(
thrust::device_pointer_cast(position_.Current()),
thrust::device_pointer_cast(position_.Current() + position_.Size()), 0);
Reset(device_idx, dh::ToSpan(ridx_a_), dh::ToSpan(position_a_));
left_counts_.resize(256);
thrust::fill(left_counts_.begin(), left_counts_.end(), 0);
streams_.resize(2);
ridx_segments_.emplace_back(Segment(0, num_rows));
for (auto& stream : streams_) {
dh::safe_cuda(cudaStreamCreate(&stream));
}
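The constructor hunk above folds the two Thrust calls (thrust::sequence on the row indices and thrust::fill on the positions) into the single Reset kernel, so initialisation is one launch instead of two. A short usage sketch, assuming construction behaves as shown above (the numbers are illustrative):

// After construction, ridx_a_ holds 0..num_rows-1 and position_a_ is zeroed,
// i.e. every row starts in the root node (node 0).
RowPartitioner partitioner(/*device_idx=*/0, /*num_rows=*/1000);
auto root_rows = partitioner.GetRows(0);     // span over all 1000 row indices
auto positions = partitioner.GetPosition();  // all entries are 0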
@@ -132,15 +129,15 @@ common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(
if (segment.Size() == 0) {
return common::Span<const RowPartitioner::RowIndexT>();
}
return ridx_.CurrentSpan().subspan(segment.begin, segment.Size());
return dh::ToSpan(ridx_a_).subspan(segment.begin, segment.Size());
}

common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows() {
return ridx_.CurrentSpan();
return dh::ToSpan(ridx_a_);
}

common::Span<const bst_node_t> RowPartitioner::GetPosition() {
return position_.CurrentSpan();
return dh::ToSpan(position_a_);
}
std::vector<RowPartitioner::RowIndexT> RowPartitioner::GetRowsHost(
bst_node_t nidx) {
@@ -162,23 +159,25 @@ void RowPartitioner::SortPositionAndCopy(const Segment& segment,
bst_node_t right_nidx,
int64_t* d_left_count,
cudaStream_t stream) {
dh::TemporaryArray<bst_node_t> position_temp(position_a_.size());
dh::TemporaryArray<RowIndexT> ridx_temp(ridx_a_.size());
SortPosition(
// position_in
common::Span<bst_node_t>(position_.Current() + segment.begin,
common::Span<bst_node_t>(position_a_.data().get() + segment.begin,
segment.Size()),
// position_out
common::Span<bst_node_t>(position_.Other() + segment.begin,
common::Span<bst_node_t>(position_temp.data().get() + segment.begin,
segment.Size()),
// row index in
common::Span<RowIndexT>(ridx_.Current() + segment.begin, segment.Size()),
common::Span<RowIndexT>(ridx_a_.data().get() + segment.begin, segment.Size()),
// row index out
common::Span<RowIndexT>(ridx_.Other() + segment.begin, segment.Size()),
common::Span<RowIndexT>(ridx_temp.data().get() + segment.begin, segment.Size()),
left_nidx, right_nidx, d_left_count, stream);
// Copy back key/value
const auto d_position_current = position_.Current() + segment.begin;
const auto d_position_other = position_.Other() + segment.begin;
const auto d_ridx_current = ridx_.Current() + segment.begin;
const auto d_ridx_other = ridx_.Other() + segment.begin;
const auto d_position_current = position_a_.data().get() + segment.begin;
const auto d_position_other = position_temp.data().get() + segment.begin;
const auto d_ridx_current = ridx_a_.data().get() + segment.begin;
const auto d_ridx_other = ridx_temp.data().get() + segment.begin;
dh::LaunchN(device_idx_, segment.Size(), stream, [=] __device__(size_t idx) {
d_position_current[idx] = d_position_other[idx];
d_ridx_current[idx] = d_ridx_other[idx];
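The SortPositionAndCopy hunk above switches from swapping persistent double buffers to writing the partitioned segment into per-call dh::TemporaryArray scratch and scattering it back into ridx_a_ / position_a_. As a plain host-side sketch of the semantics (an illustration of what the device code achieves, not the PR's kernel):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Stably partition one segment so rows labelled left_nidx come first, report
// how many went left, and write the reordered data back in place, mirroring
// the sort-into-scratch-then-copy-back pattern in the diff above.
void SortPositionHostSketch(std::vector<int32_t>* position,   // node id per row
                            std::vector<uint32_t>* ridx,      // row indices
                            int32_t left_nidx, int64_t* left_count) {
  std::vector<size_t> order(position->size());
  std::iota(order.begin(), order.end(), 0);
  auto mid = std::stable_partition(order.begin(), order.end(), [&](size_t i) {
    return (*position)[i] == left_nidx;
  });
  *left_count = mid - order.begin();

  std::vector<int32_t> position_out(position->size());
  std::vector<uint32_t> ridx_out(ridx->size());
  for (size_t out = 0; out < order.size(); ++out) {
    position_out[out] = (*position)[order[out]];
    ridx_out[out] = (*ridx)[order[out]];
  }
  *position = std::move(position_out);  // copy back, as the kernel does
  *ridx = std::move(ridx_out);
}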
24 changes: 7 additions & 17 deletions src/tree/gpu_hist/row_partitioner.cuh
@@ -46,18 +46,8 @@ class RowPartitioner {
*/
/*! \brief Range of row index for each node, pointers into ridx below. */
std::vector<Segment> ridx_segments_;
dh::caching_device_vector<RowIndexT> ridx_a_;
dh::caching_device_vector<RowIndexT> ridx_b_;
dh::caching_device_vector<bst_node_t> position_a_;
dh::caching_device_vector<bst_node_t> position_b_;
/*! \brief mapping for node id -> rows.
* This looks like:
* node id | 1 | 2 |
* rows idx | 3, 5, 1 | 13, 31 |
*/
dh::DoubleBuffer<RowIndexT> ridx_;
/*! \brief mapping for row -> node id. */
dh::DoubleBuffer<bst_node_t> position_;
dh::TemporaryArray<RowIndexT> ridx_a_;
dh::TemporaryArray<bst_node_t> position_a_;
dh::caching_device_vector<int64_t>
left_counts_; // Useful to keep a bunch of zeroed memory for sort position
std::vector<cudaStream_t> streams_;
@@ -110,8 +100,8 @@ class RowPartitioner {
void UpdatePosition(bst_node_t nidx, bst_node_t left_nidx,
bst_node_t right_nidx, UpdatePositionOpT op) {
Segment segment = ridx_segments_.at(nidx); // rows belongs to node nidx
auto d_ridx = ridx_.CurrentSpan();
auto d_position = position_.CurrentSpan();
auto d_ridx = dh::ToSpan(ridx_a_);
auto d_position = dh::ToSpan(position_a_);
if (left_counts_.size() <= nidx) {
left_counts_.resize((nidx * 2) + 1);
thrust::fill(left_counts_.begin(), left_counts_.end(), 0);
@@ -159,9 +149,9 @@ class RowPartitioner {
*/
template <typename FinalisePositionOpT>
void FinalisePosition(FinalisePositionOpT op) {
auto d_position = position_.Current();
const auto d_ridx = ridx_.Current();
dh::LaunchN(device_idx_, position_.Size(), [=] __device__(size_t idx) {
auto d_position = position_a_.data().get();
const auto d_ridx = ridx_a_.data().get();
dh::LaunchN(device_idx_, position_a_.size(), [=] __device__(size_t idx) {
auto position = d_position[idx];
RowIndexT ridx = d_ridx[idx];
bst_node_t new_position = op(ridx, position);
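With the header changes above, the back buffers (ridx_b_, position_b_) and the DoubleBuffer members are gone, and UpdatePosition / FinalisePosition read ridx_a_ and position_a_ directly. A hedged usage sketch of UpdatePosition, assuming the op receives a row index and returns the child node id (goes_left is a placeholder predicate, e.g. a feature-value comparison):

// Route every row currently in node nidx to its left or right child.
partitioner.UpdatePosition(nidx, left_nidx, right_nidx,
    [=] __device__(RowPartitioner::RowIndexT ridx) {
      return goes_left(ridx) ? left_nidx : right_nidx;
    });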
1 change: 0 additions & 1 deletion src/tree/updater_gpu_hist.cu
@@ -511,7 +511,6 @@ struct GPUHistMakerDevice {
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
reducer->Synchronize();

monitor.Stop("AllReduce");
}
Expand Down