Skip to content

Commit

Permalink
Revert "remove copy_if pattern family from reduce_then_scan"
Browse files Browse the repository at this point in the history
This reverts commit 75eeca8.
  • Loading branch information
danhoeflinger committed Aug 6, 2024
1 parent 4df758a commit f757df5
Show file tree
Hide file tree
Showing 5 changed files with 252 additions and 46 deletions.
16 changes: 8 additions & 8 deletions include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,17 +384,17 @@ template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typ
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
oneapi::dpl::__internal::__difference_t<_Range2>
__pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
_Predicate __pred, _Assign)
_Predicate __pred, _Assign&& __assign)
{
using _SizeType = decltype(__rng1.size());
using _ReduceOp = ::std::plus<_SizeType>;
auto __n = __rng1.size();
if (__n == 0)
return 0;

unseq_backend::__create_mask<_Predicate, _SizeType> __create_mask_op{__pred};
unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op;
auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
std::forward<_Range2>(__rng2), __n, __pred, std::forward<_Assign>(__assign));

return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec),
::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2),
__create_mask_op, __copy_by_mask_op);
return __res.get(); //is a blocking call
}

//------------------------------------------------------------------------
Expand Down
176 changes: 138 additions & 38 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
__internal::__optional_kernel_name<_ScanKernelName...>>
{
template <typename _Policy, typename _InRng, typename _OutRng, typename _InitType, typename _BinaryOperation,
typename _UnaryOp>
typename _UnaryOp, typename _Assign>
auto
operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init,
_BinaryOperation __bin_op, _UnaryOp __unary_op)
_BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign)
{
using _ValueType = ::std::uint16_t;

Expand Down Expand Up @@ -622,7 +622,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize)
{
if (__lacc[__idx])
__out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]);
__assign(static_cast<__tuple_type>(__in_rng[__idx]),
__out_rng[__lacc[__idx + __elems_per_wg]]);
}

const ::std::uint16_t __residual = __n % _WGSize;
Expand All @@ -631,7 +632,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
{
auto __idx = __residual_start + __item_id;
if (__lacc[__idx])
__out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]);
__assign(static_cast<__tuple_type>(__in_rng[__idx]),
__out_rng[__lacc[__idx + __elems_per_wg]]);
}

if (__item_id == 0)
Expand Down Expand Up @@ -796,6 +798,76 @@ struct __simple_write_to_idx
}
};

// Functor which applies a user-supplied predicate to the element at a given
// index of an input range, producing the boolean "mask" value for that slot.
// Used as the selection step of the reduce-then-scan copy_if family.
template <typename _Predicate>
struct __gen_mask
{
    // Returns true when the element at __idx satisfies the stored predicate.
    template <typename _InRng>
    bool
    operator()(_InRng&& __in_rng, std::size_t __idx) const
    {
        const bool __satisfied = __pred(__in_rng[__idx]);
        return __satisfied;
    }
    _Predicate __pred;
};

// Adapts a mask generator into a counting functor: yields 1 for an element
// selected by the mask and 0 otherwise, so the reduction phase can count the
// number of selected elements.
template <typename _GenMask>
struct __gen_count_mask
{
    template <typename _InRng, typename _SizeType>
    _SizeType
    operator()(_InRng&& __in_rng, _SizeType __idx) const
    {
        if (__gen_mask(std::forward<_InRng>(__in_rng), __idx))
            return _SizeType{1};
        return _SizeType{0};
    }
    _GenMask __gen_mask;
};

template <typename _GenMask>
struct __gen_expand_count_mask
{
template <typename _InRng, typename _SizeType>
auto
operator()(_InRng&& __in_rng, _SizeType __idx) const
{
// Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a
// zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy
// the values of zipped the input types rather than their references.
using _ElementType =
oneapi::dpl::__internal::__decay_with_tuple_specialization_t<oneapi::dpl::__internal::__value_t<_InRng>>;
_ElementType ele = __in_rng[__idx];
bool mask = __gen_mask(__in_rng, __idx);
return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele);
}
_GenMask __gen_mask;
};

// Projection functor returning a mutable reference to the first component of
// a tuple-like value; lets the scan phase operate on only the running count.
struct __get_zeroth_element
{
    template <typename _Tp>
    auto&
    operator()(_Tp&& __a) const
    {
        // static_cast<_Tp&&> is equivalent to std::forward<_Tp> here.
        return std::get<0>(static_cast<_Tp&&>(__a));
    }
};
// Conditional write step for copy_if-style algorithms. Given a
// (count, mask, element) tuple __v, stores the element into the output range
// at the scanned position (count - 1 + __offset) only when the mask is set.
template <std::int32_t __offset = 0, typename Assign = oneapi::dpl::__internal::__pstl_assign>
struct __write_to_idx_if
{
    template <typename _OutRng, typename _SizeType, typename ValueType>
    void
    operator()(_OutRng&& __out_rng, _SizeType __idx, const ValueType& __v) const
    {
        // Elements whose mask is false are simply skipped.
        if (!std::get<1>(__v))
            return;
        // An explicit cast to our internal tuple type resolves conversion
        // issues between the internal tuple and std::tuple. For non-tuple
        // underlying types the value passes through unchanged.
        using _ConvertedTupleType =
            typename oneapi::dpl::__internal::__get_tuple_type<std::decay_t<decltype(std::get<2>(__v))>,
                                                               std::decay_t<decltype(__out_rng[__idx])>>::__type;
        __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1 + __offset]);
    }
    Assign __assign;
};

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
auto
Expand Down Expand Up @@ -872,9 +944,11 @@ struct __invoke_single_group_copy_if
// Specialization for devices that have a max work-group size of at least 1024
static constexpr ::std::uint16_t __targeted_wg_size = 1024;

template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred>
template <std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred)
operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred,
_Assign&& __assign)
{
constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size);
constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size);
Expand All @@ -886,34 +960,51 @@ struct __invoke_single_group_copy_if
if (__is_full_group)
return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<
_SizeType, __num_elems_per_item, __wg_size, true,
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
__scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>,
::std::integral_constant<::std::uint16_t, __num_elems_per_item>,
/* _IsFullGroup= */ std::true_type, _CustomName>>
>()(
__exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{},
_ReduceOp{}, ::std::forward<_Pred>(__pred));
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
__scan_copy_single_wg_kernel<std::integral_constant<std::uint16_t, __wg_size>,
std::integral_constant<std::uint16_t, __num_elems_per_item>,
/* _IsFullGroup= */ std::true_type, _CustomName>>>()(
__exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{},
std::forward<_Pred>(__pred), std::forward<_Assign>(__assign));
else
return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<
_SizeType, __num_elems_per_item, __wg_size, false,
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
__scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>,
::std::integral_constant<::std::uint16_t, __num_elems_per_item>,
/* _IsFullGroup= */ std::false_type, _CustomName>>
>()(
__exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{},
_ReduceOp{}, ::std::forward<_Pred>(__pred));
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
__scan_copy_single_wg_kernel<std::integral_constant<std::uint16_t, __wg_size>,
std::integral_constant<std::uint16_t, __num_elems_per_item>,
/* _IsFullGroup= */ std::false_type, _CustomName>>>()(
__exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{},
std::forward<_Pred>(__pred), std::forward<_Assign>(__assign));
}
};

// Dispatches a copy_if-family operation onto the reduce-then-scan backend:
// the reduction counts selected elements, the scan assigns output positions,
// and __write_op performs the conditional stores. Returns the backend future.
// Note: __n is accepted for interface parity with the scan-based fallback but
// is not read here — the backend derives sizes from the ranges themselves.
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _GenMask,
          typename _WriteOp>
auto
__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                                 _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask,
                                 _WriteOp __write_op)
{
    using _CountMask = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>;
    using _ExpandMask = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>;

    return __parallel_transform_reduce_then_scan(
        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
        std::forward<_OutRng>(__out_rng), _CountMask{__generate_mask}, std::plus<_Size>{},
        _ExpandMask{__generate_mask}, oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, __write_op,
        oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
        /*_Inclusive=*/std::true_type{});
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
typename _CopyByMaskOp>
auto
__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op,
_CopyByMaskOp __copy_by_mask_op)
{
using _ReduceOp = ::std::plus<_Size>;
using _ReduceOp = std::plus<_Size>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<1>;
Expand All @@ -930,7 +1021,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag

return __parallel_transform_scan_base(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
oneapi::dpl::__ranges::zip_view(
::std::forward<_InRng>(__in_rng),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
Expand All @@ -947,10 +1038,11 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
__copy_by_mask_op);
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred>
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred)
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign&& __assign = _Assign{})
{
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;

Expand All @@ -964,30 +1056,38 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
// The kernel stores n integers for the predicate and another n integers for the offsets
const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2;

constexpr ::std::uint16_t __single_group_upper_limit = 16384;
constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
{
using _SizeBreakpoints =
::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>;
using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;

return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
::std::forward<_OutRng>(__out_rng), __pred);
_SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign));
}
else if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec))
{
using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>;

return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
_GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)});
}
else
{
using _ReduceOp = ::std::plus<_Size>;
using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
/*inclusive*/ ::std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n,
CreateOp{__pred}, CopyOp{});
using _ReduceOp = std::plus<_Size>;
using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
/*inclusive*/ std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
_CreateOp{__pred}, _CopyOp{_ReduceOp{}, std::forward<_Assign>(__assign)});
}
}

Expand Down
38 changes: 38 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,18 @@ namespace oneapi::dpl::__par_backend_hetero
template <typename _UnaryOp>
struct __gen_transform_input;

// Forward declarations of the reduce-then-scan helper functors so the
// sycl::is_device_copyable trait specializations below can name them without
// pulling in their full definitions.
template <typename _Predicate>
struct __gen_mask;

template <typename _GenMask>
struct __gen_count_mask;

template <typename _GenMask>
struct __gen_expand_count_mask;

template <int32_t __offset, typename Assign>
struct __write_to_idx_if;

template <typename _ExecutionPolicy, typename _Pred>
struct __early_exit_find_or;

Expand All @@ -248,6 +260,32 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen
{
};

// Mark each reduce-then-scan helper functor as device copyable whenever every
// type it stores is itself device copyable, so SYCL kernels may capture them.
template <typename _Predicate>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)>
    : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate>
{
};

// __gen_count_mask holds only its wrapped mask generator.
template <typename _GenMask>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)>
    : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask>
{
};

// __gen_expand_count_mask likewise stores just the mask generator.
template <typename _GenMask>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask,
                                                       _GenMask)>
    : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask>
{
};

// __write_to_idx_if stores only its assignment functor; __offset is a
// non-type template parameter and does not affect copyability.
template <int32_t __offset, typename Assign>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset,
                                                       Assign)>
    : oneapi::dpl::__internal::__are_all_device_copyable<Assign>
{
};

template <typename _ExecutionPolicy, typename _Pred>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or,
_ExecutionPolicy, _Pred)>
Expand Down
Loading

0 comments on commit f757df5

Please sign in to comment.