Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 10 additions & 22 deletions cub/benchmarks/bench/segmented_radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,14 @@

template <class T, typename OffsetT>
void seg_radix_sort(nvbench::state& state,
nvbench::type_list<T, OffsetT> ts,
nvbench::type_list<T, OffsetT>,
const thrust::device_vector<OffsetT>& offsets,
bit_entropy entropy)
{
constexpr bool is_overwrite_ok = false;

using offset_t = OffsetT;
using begin_offset_it_t = const offset_t*;
using end_offset_it_t = const offset_t*;
using segment_size_t = cuda::std::int32_t; // same as CUB public API
using key_t = T;
using value_t = cub::NullType;

constexpr int begin_bit = 0;
constexpr int end_bit = sizeof(key_t) * 8;
Expand All @@ -31,11 +27,8 @@ void seg_radix_sort(nvbench::state& state,
thrust::device_vector<key_t> buffer_1 = generate(elements, entropy);
thrust::device_vector<key_t> buffer_2(elements);

key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data());
key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data());

cub::DoubleBuffer<key_t> d_keys(d_buffer_1, d_buffer_2);
cub::DoubleBuffer<value_t> d_values;
const key_t* d_keys_1 = thrust::raw_pointer_cast(buffer_1.data());
key_t* d_keys_2 = thrust::raw_pointer_cast(buffer_2.data());

begin_offset_it_t d_begin_offsets = thrust::raw_pointer_cast(offsets.data());
end_offset_it_t d_end_offsets = d_begin_offsets + 1;
Expand All @@ -47,40 +40,35 @@ void seg_radix_sort(nvbench::state& state,

std::size_t temp_storage_bytes{};
std::uint8_t* d_temp_storage{};
cub::detail::segmented_radix_sort::dispatch<cub::SortOrder::Ascending, segment_size_t>(
cub::DeviceSegmentedRadixSort::SortKeys(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
d_keys_1,
d_keys_2,
elements,
segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
is_overwrite_ok,
nullptr);

thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes, thrust::no_init);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

cub::detail::segmented_radix_sort::dispatch<cub::SortOrder::Ascending, segment_size_t>(
cub::DeviceSegmentedRadixSort::SortKeys(
d_temp_storage,
temp_storage_bytes,
keys,
values,
d_keys_1,
d_keys_2,
elements,
segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
is_overwrite_ok,
launch.get_stream());
});
}
Expand Down
48 changes: 32 additions & 16 deletions cub/cub/device/device_segmented_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ public:
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Ascending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -563,7 +563,9 @@ public:
begin_bit,
end_bit,
false,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -697,7 +699,7 @@ public:

using SegmentSizeT = ::cuda::std::int32_t;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Ascending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -710,7 +712,9 @@ public:
begin_bit,
end_bit,
true,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -1202,7 +1206,7 @@ public:
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Descending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -1215,7 +1219,9 @@ public:
begin_bit,
end_bit,
false,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -1349,7 +1355,7 @@ public:

using SegmentSizeT = ::cuda::std::int32_t;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Descending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -1362,7 +1368,9 @@ public:
begin_bit,
end_bit,
true,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -1819,7 +1827,7 @@ public:
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Ascending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -1832,7 +1840,9 @@ public:
begin_bit,
end_bit,
false,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -1957,7 +1967,7 @@ public:
// Null value type
DoubleBuffer<NullType> d_values;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Ascending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -1970,7 +1980,9 @@ public:
begin_bit,
end_bit,
true,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -2419,7 +2431,7 @@ public:
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Descending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -2432,7 +2444,9 @@ public:
begin_bit,
end_bit,
false,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down Expand Up @@ -2557,7 +2571,7 @@ public:
// Null value type
DoubleBuffer<NullType> d_values;

return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) {
return detail::segmented_radix_sort::dispatch<SortOrder::Descending, SegmentSizeT>(
storage,
bytes,
Expand All @@ -2570,7 +2584,9 @@ public:
begin_bit,
end_bit,
true,
stream);
stream,
/* decomposer */ {},
tuning_env);
});
}

Expand Down
54 changes: 24 additions & 30 deletions cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <cuda/__device/compute_capability.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/__execution/env.h>
#include <cuda/std/__host_stdlib/sstream>
#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/cstdint>
Expand Down Expand Up @@ -883,21 +884,9 @@ template <SortOrder Order,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename DecomposerT = identity_decomposer_t,
typename PolicySelector = policy_selector_from_types<KeyT, ValueT, SegmentSizeT>,
typename KernelSource = DeviceSegmentedRadixSortKernelSource<
PolicySelector,
Order,
KeyT,
ValueT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
SegmentSizeT,
DecomposerT>,
typename DecomposerT = identity_decomposer_t,
typename TuningEnvT = ::cuda::std::execution::env<>,
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
#if _CCCL_HAS_CONCEPTS()
requires segmented_radix_sort_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -911,11 +900,27 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
int end_bit,
bool is_overwrite_okay,
cudaStream_t stream,
DecomposerT decomposer = {},
PolicySelector policy_selector = {},
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
DecomposerT decomposer = {},
TuningEnvT = {})
{
using default_policy_selector_t = policy_selector_from_types<KeyT, ValueT, SegmentSizeT>;
using policy_selector_t = ::cuda::std::decay_t<
::cuda::std::execution::__query_result_or_t<TuningEnvT, segmented_radix_sort_policy, default_policy_selector_t>>;
#if _CCCL_HAS_CONCEPTS()
static_assert(segmented_radix_sort_policy_selector<policy_selector_t>);
#endif // _CCCL_HAS_CONCEPTS()

auto kernel_source = DeviceSegmentedRadixSortKernelSource<
policy_selector_t,
Order,
KeyT,
ValueT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
SegmentSizeT,
DecomposerT>{};
auto launcher_factory = KernelLauncherFactory{};

if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay))
{
if (d_temp_storage == nullptr)
Expand All @@ -930,18 +935,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
{
return error;
}
const segmented_radix_sort_policy active_policy = policy_selector(cc);

#if _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
NV_IF_TARGET(NV_IS_HOST, ({
::std::stringstream ss;
ss << active_policy;
_CubLog("Dispatching DeviceSegmentedRadixSort to compute capability %d.%d with tuning: %s\n",
cc.major_cap(),
cc.minor_cap(),
ss.str().c_str());
}))
#endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
const segmented_radix_sort_policy active_policy = policy_selector_t{}(cc);

#if _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
NV_IF_TARGET(NV_IS_HOST, ({
Expand Down
Loading
Loading