Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/cuda shuffle changes #108

Merged
merged 3 commits into from
Aug 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,11 @@ if(CCTAG_WITH_CUDA)
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-gencode=arch=compute_52,code=sm_52;-gencode=arch=compute_52,code=compute_52;--default-stream;per-thread;-Xptxas;--warn-on-local-memory-usage;-Xptxas;--warn-on-spills")
cuda_find_library_local_first(CUDA_CUDADEVRT_LIBRARY cudadevrt "\"cudadevrt\" library")

if( ( CUDA_VERSION VERSION_EQUAL "9.0" ) OR ( CUDA_VERSION VERSION_GREATER "9.0") )
set(CCTAG_HAVE_SHFL_DOWN_SYNC 1)
else()
set(CCTAG_HAVE_SHFL_DOWN_SYNC 0)
endif()
else(CCTAG_WITH_CUDA)
message( STATUS "Building without CUDA" )
endif(CCTAG_WITH_CUDA)
Expand Down
6 changes: 5 additions & 1 deletion cmake/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@

#define CUB_CDP

#endif
#ifndef CCTAG_HAVE_SHFL_DOWN_SYNC
#cmakedefine CCTAG_HAVE_SHFL_DOWN_SYNC
#endif

#endif
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ if(CCTAG_WITH_CUDA)
PUBLIC -DCCTAG_WITH_CUDA -DCUB_CDP
PRIVATE ${TBB_DEFINITIONS})

if(CCTAG_HAVE_SHFL_DOWN_SYNC)
target_compile_definitions(CCTag PRIVATE "-DCCTAG_HAVE_SHFL_DOWN_SYNC")
endif(CCTAG_HAVE_SHFL_DOWN_SYNC)

# This is nececessary for the CCTagConfig.cmake to correctly export the
# includes, always because we used CUDA_ADD_LIBRARY
set_target_properties(CCTag PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
Expand Down
32 changes: 30 additions & 2 deletions src/cctag/cuda/assist.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,47 @@ inline int d_sign( T value )
return ( ( value < 0 ) ? -1 : 1 );
}

#ifdef CCTAG_HAVE_SHFL_DOWN_SYNC
template<typename T> __device__ inline T shuffle ( T variable, int src ) { return __shfl_sync ( 0xffffffff, variable, src ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta ) { return __shfl_up_sync ( 0xffffffff, variable, delta ); }
template<typename T> __device__ inline T shuffle_down( T variable, int delta ) { return __shfl_down_sync( 0xffffffff, variable, delta ); }
template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) { return __shfl_xor_sync ( 0xffffffff, variable, delta ); }
__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot_sync ( 0xffffffff, pred ); }
__device__ inline int any ( unsigned int pred ) { return __any_sync ( 0xffffffff, pred ); }
__device__ inline int all ( unsigned int pred ) { return __all_sync ( 0xffffffff, pred ); }

template<typename T> __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl_sync ( 0xffffffff, variable, src , ws ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up_sync ( 0xffffffff, variable, delta, ws ); }
template<typename T> __device__ inline T shuffle_down( T variable, int delta, int ws ) { return __shfl_down_sync( 0xffffffff, variable, delta, ws ); }
template<typename T> __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor_sync ( 0xffffffff, variable, delta, ws ); }
#else
template<typename T> __device__ inline T shuffle ( T variable, int src ) { return __shfl ( variable, src ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta ) { return __shfl_up ( variable, delta ); }
template<typename T> __device__ inline T shuffle_down( T variable, int delta ) { return __shfl_down( variable, delta ); }
template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) { return __shfl_xor ( variable, delta ); }
__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot ( pred ); }
__device__ inline int any ( unsigned int pred ) { return __any ( pred ); }
__device__ inline int all ( unsigned int pred ) { return __all ( pred ); }

template<typename T> __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl ( variable, src , ws ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up ( variable, delta, ws ); }
template<typename T> __device__ inline T shuffle_down( T variable, int delta, int ws ) { return __shfl_down( variable, delta, ws ); }
template<typename T> __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor ( variable, delta, ws ); }
#endif

__device__
inline
bool reduce_OR_32x32( bool cnt )
{
__shared__ int reduce_array[32];

int cnt_row = ::__any( cnt );
int cnt_row = cctag::any( cnt );
if( threadIdx.x == 0 ) {
reduce_array[threadIdx.y] = cnt_row;
}
__syncthreads();
if( threadIdx.y == 0 ) {
int cnt_col = ::__any( reduce_array[threadIdx.x] );
int cnt_col = cctag::any( reduce_array[threadIdx.x] );
if( threadIdx.x == 0 ) {
reduce_array[0] = cnt_col;
}
Expand Down
10 changes: 5 additions & 5 deletions src/cctag/cuda/frame_04_hyst.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ bool edge_block_loop( )
bool mark = update_edge_pixel( threadIdx.y, threadIdx.x*4 );

/* every row checks whether any pixel has been changed */
bool line_changed = ::__any( mark );
bool line_changed = cctag::any( mark );

#if 0
/* the first thread of each row write the result to continuation[] */
Expand All @@ -166,7 +166,7 @@ bool edge_block_loop( )
* threadfence() is implied by syncthreads() */
__syncthreads();

/* Each thread in a warp reads ::__any() results for one of 32 warps.
/* Each thread in a warp reads cctag::any() results for one of 32 warps.
* Redundant, but I have no better idea for spreading the result
* to all warps. */
// mark = threadIdx.x < HYST_H ? continuation[threadIdx.x] : false;
Expand All @@ -177,13 +177,13 @@ bool edge_block_loop( )
* If there has been any change in this round, try to spread
* the change further.
*/
again = ::__any( mark );
again = cctag::any( mark );
#else
if( threadIdx.x == 0 ) continuation[threadIdx.y] = line_changed;
__syncthreads();
if( threadIdx.y == 0 ) {
mark = continuation[threadIdx.x];
again = ::__any(mark);
again = cctag::any(mark);
if( threadIdx.x == 0 ) {
continuation[0] = again;
}
Expand Down Expand Up @@ -260,7 +260,7 @@ void edge_second( cv::cuda::PtrStepSzb img, FrameMetaPtr meta )

bool something_changed = edge( meta );

if( ::__any( something_changed ) ) {
if( cctag::any( something_changed ) ) {
store( input );
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/cctag/cuda/frame_05_thin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,15 @@ void second_round( cv::cuda::PtrStepSzb src, // input
atomicAdd( &meta.num_edges_thinned(), 1 );
}
#endif
uint32_t mask = __ballot( keep ); // bitfield of warps with results
uint32_t mask = cctag::ballot( keep ); // bitfield of warps with results
uint32_t ct = __popc( mask ); // horizontal reduce
uint32_t leader = __ffs(mask) - 1; // the highest thread id with indicator==true
uint32_t write_index;
if( threadIdx.x == leader ) {
// leader gets warp's offset from global value and increases it
write_index = atomicAdd( &meta.list_size_all_edgecoords(), int(ct) );
}
write_index = __shfl( write_index, leader ); // broadcast warp write index to all
write_index = cctag::shuffle( write_index, leader ); // broadcast warp write index to all
write_index += __popc( mask & ((1 << threadIdx.x) - 1) ); // find own write index

if( keep ) {
Expand Down
5 changes: 2 additions & 3 deletions src/cctag/cuda/frame_06_graddesc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <algorithm>
#include <limits>
#include <cctag/cuda/cctag_cuda_runtime.h>
#include <cub/cub.cuh>
#include <stdio.h>
#include "debug_macros.hpp"
#include "debug_is_on_edge.h"
Expand Down Expand Up @@ -257,7 +256,7 @@ void gradient_descent( FrameMetaPtr meta,
assert( ! outOfBounds( out_edge.descending.befor.x, out_edge.descending.befor.y, edgepoint_index_table ) );
assert( ! outOfBounds( out_edge.descending.after.x, out_edge.descending.after.y, edgepoint_index_table ) );

uint32_t mask = __ballot( keep ); // bitfield of warps with results
uint32_t mask = cctag::ballot( keep ); // bitfield of warps with results

// keep is false for all 32 threads
if( mask == 0 ) return;
Expand Down Expand Up @@ -290,7 +289,7 @@ void gradient_descent( FrameMetaPtr meta,
}
// assert( *chained_edgecoord_list_sz >= 2*all_edgecoord_list_sz );

write_index = __shfl( write_index, leader ); // broadcast warp write index to all
write_index = cctag::shuffle( write_index, leader ); // broadcast warp write index to all
write_index += __popc( mask & ((1 << threadIdx.x) - 1) ); // find own write index

assert( write_index >= 0 );
Expand Down
1 change: 0 additions & 1 deletion src/cctag/cuda/frame_07_vote.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <algorithm>
#include <limits>
#include <cctag/cuda/cctag_cuda_runtime.h>
#include <cub/cub.cuh>
#include <stdio.h>
#include "debug_macros.hpp"
#include "debug_is_on_edge.h"
Expand Down
5 changes: 0 additions & 5 deletions src/cctag/cuda/frame_07_vote.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,6 @@
#include <cctag/cuda/cctag_cuda_runtime.h>
#include <opencv2/core/cuda_types.hpp>

#if CUDA_VERSION >= 8000
#else
#include <cub/cub.cuh>
#endif

#include "onoff.h"

#include "framemeta.h"
Expand Down
5 changes: 2 additions & 3 deletions src/cctag/cuda/frame_07a_vote_line.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <algorithm>
#include <limits>
#include <cctag/cuda/cctag_cuda_runtime.h>
#include <cub/cub.cuh>
#include <stdio.h>
#include "debug_macros.hpp"
#include "debug_is_on_edge.h"
Expand Down Expand Up @@ -307,15 +306,15 @@ void construct_line( FrameMetaPtr meta,
if( chosen && chosen->coord.x == 0 && chosen->coord.y == 0 ) chosen = 0;

int idx = 0;
uint32_t mask = __ballot( chosen != 0 );
uint32_t mask = cctag::ballot( chosen != 0 );
uint32_t ct = __popc( mask );
if( ct == 0 ) return;

uint32_t write_index;
if( threadIdx.x == 0 ) {
write_index = atomicAdd( &meta.list_size_inner_points(), (int)ct );
}
write_index = __shfl( write_index, 0 );
write_index = cctag::shuffle( write_index, 0 );
write_index += __popc( mask & ((1 << threadIdx.x) - 1) );

if( chosen ) {
Expand Down
48 changes: 24 additions & 24 deletions src/cctag/cuda/frame_07c_eval.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,19 @@ void count_winners( FrameMetaPtr& meta,
for( int point=0; point<CONC_POINTS; point++ ) {
if( inner_point[point] == 0 ) continue;

winner_size[point] += __shfl_down( winner_size[point], 16 );
winner_size[point] += __shfl_down( winner_size[point], 8 );
winner_size[point] += __shfl_down( winner_size[point], 4 );
winner_size[point] += __shfl_down( winner_size[point], 2 );
winner_size[point] += __shfl_down( winner_size[point], 1 );
winner_size[point] = __shfl ( winner_size[point], 0 );

flow_length[point] += __shfl_down( flow_length[point], 16 );
flow_length[point] += __shfl_down( flow_length[point], 8 );
flow_length[point] += __shfl_down( flow_length[point], 4 );
flow_length[point] += __shfl_down( flow_length[point], 2 );
flow_length[point] += __shfl_down( flow_length[point], 1 );
flow_length[point] = __shfl ( flow_length[point], 0 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 16 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 8 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 4 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 2 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 1 );
winner_size[point] = cctag::shuffle ( winner_size[point], 0 );

flow_length[point] += cctag::shuffle_down( flow_length[point], 16 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 8 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 4 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 2 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 1 );
flow_length[point] = cctag::shuffle ( flow_length[point], 0 );
}

__shared__ int winner_array[CONC_POINTS][32];
Expand All @@ -102,17 +102,17 @@ void count_winners( FrameMetaPtr& meta,
winner_size[point] = winner_array[point][threadIdx.x];
flow_length[point] = length_array[point][threadIdx.x];

winner_size[point] += __shfl_down( winner_size[point], 16 );
winner_size[point] += __shfl_down( winner_size[point], 8 );
winner_size[point] += __shfl_down( winner_size[point], 4 );
winner_size[point] += __shfl_down( winner_size[point], 2 );
winner_size[point] += __shfl_down( winner_size[point], 1 );

flow_length[point] += __shfl_down( flow_length[point], 16 );
flow_length[point] += __shfl_down( flow_length[point], 8 );
flow_length[point] += __shfl_down( flow_length[point], 4 );
flow_length[point] += __shfl_down( flow_length[point], 2 );
flow_length[point] += __shfl_down( flow_length[point], 1 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 16 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 8 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 4 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 2 );
winner_size[point] += cctag::shuffle_down( winner_size[point], 1 );

flow_length[point] += cctag::shuffle_down( flow_length[point], 16 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 8 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 4 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 2 );
flow_length[point] += cctag::shuffle_down( flow_length[point], 1 );

if( threadIdx.x == 0 ) {
inner_point[point]->_winnerSize = winner_size[point];
Expand Down
4 changes: 2 additions & 2 deletions src/cctag/cuda/recursive_sweep.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,12 @@ bool single_block_loop( cv::cuda::PtrStepSz<T> img )
Processor proc;
while( again ) {
bool mark = proc.check( img, idx, idy );
bool any_marks = ::__any( mark );
bool any_marks = cctag::any( mark );
if( threadIdx.x == 0 ) continuation[threadIdx.y] = any_marks;
__syncthreads();
mark = threadIdx.x < HYST_H ? continuation[threadIdx.x] : false;
__syncthreads();
again = ::__any( mark );
again = cctag::any( mark );
if( again ) nothing_changed = false;
}

Expand Down
40 changes: 20 additions & 20 deletions src/cctag/cuda/tag_identify.cu
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ void extractSignalUsingHomography( const CutStruct& cut,

bool breaknow = ( xyRes.x < 1.0f && xyRes.x > src.cols-1 && xyRes.y < 1.0f && xyRes.y > src.rows-1 );

if( ::__any( breaknow ) )
if( cctag::any( breaknow ) )
{
if( threadIdx.x == 0 ) signals.outOfBounds = 1;
return;
Expand Down Expand Up @@ -266,11 +266,11 @@ void idComputeResult( NearbyPointGrid* d_NearbyPointGrid,
}
}

val += __shfl_down( val, 16 );
val += __shfl_down( val, 8 );
val += __shfl_down( val, 4 );
val += __shfl_down( val, 2 );
val += __shfl_down( val, 1 );
val += cctag::shuffle_down( val, 16 );
val += cctag::shuffle_down( val, 8 );
val += cctag::shuffle_down( val, 4 );
val += cctag::shuffle_down( val, 2 );
val += cctag::shuffle_down( val, 1 );

__shared__ float signal_sum[32];
__shared__ int count_sum[32];
Expand All @@ -284,17 +284,17 @@ void idComputeResult( NearbyPointGrid* d_NearbyPointGrid,

if( threadIdx.y == 0 ) {
val = signal_sum[threadIdx.x];
val += __shfl_down( val, 16 );
val += __shfl_down( val, 8 );
val += __shfl_down( val, 4 );
val += __shfl_down( val, 2 );
val += __shfl_down( val, 1 );
val += cctag::shuffle_down( val, 16 );
val += cctag::shuffle_down( val, 8 );
val += cctag::shuffle_down( val, 4 );
val += cctag::shuffle_down( val, 2 );
val += cctag::shuffle_down( val, 1 );
ct = count_sum[threadIdx.x];
ct += __shfl_down( ct, 16 );
ct += __shfl_down( ct, 8 );
ct += __shfl_down( ct, 4 );
ct += __shfl_down( ct, 2 );
ct += __shfl_down( ct, 1 );
ct += cctag::shuffle_down( ct, 16 );
ct += cctag::shuffle_down( ct, 8 );
ct += cctag::shuffle_down( ct, 4 );
ct += cctag::shuffle_down( ct, 2 );
ct += cctag::shuffle_down( ct, 1 );

if( threadIdx.x == 0 ) {
atomicAdd( &nPoint.result, val );
Expand Down Expand Up @@ -340,8 +340,8 @@ void idBestNearbyPoint32plus( NearbyPointGrid* d_NearbyPointGrid,
// phase 2: reduce to let thread 0 know the best point
#pragma unroll
for( int shft=4; shft>=0; shft-- ) {
int otherRes = __shfl_down( bestRes, (1 << shft) );
int otherIdx = __shfl_down( bestIdx, (1 << shft) );
int otherRes = cctag::shuffle_down( bestRes, (1 << shft) );
int otherIdx = cctag::shuffle_down( bestIdx, (1 << shft) );
if( otherRes < bestRes ) {
bestRes = otherRes;
bestIdx = otherIdx;
Expand Down Expand Up @@ -385,8 +385,8 @@ void idBestNearbyPoint31max( NearbyPointGrid* d_NearbyPointGrid,
// phase 2: reduce to let thread 0 know the best point
#pragma unroll
for( int shft=4; shft>=0; shft-- ) {
int otherRes = __shfl_down( bestRes, (1 << shft) );
int otherIdx = __shfl_down( bestIdx, (1 << shft) );
int otherRes = cctag::shuffle_down( bestRes, (1 << shft) );
int otherIdx = cctag::shuffle_down( bestIdx, (1 << shft) );
if( otherRes < bestRes ) {
bestRes = otherRes;
bestIdx = otherIdx;
Expand Down