From 539e56ce3e83bf4418fca35296f26780334c3e3e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Mar 2026 17:56:13 +0100 Subject: [PATCH 1/9] Segmented radix sort --- .../bench/segmented_radix_sort/keys.cu | 32 ++-- .../device/device_segmented_radix_sort.cuh | 144 ++++++++++-------- ...h2_test_device_segmented_radix_sort_env.cu | 118 ++++++++++++++ cub/test/catch2_test_env_launch_helper.h | 14 ++ 4 files changed, 226 insertions(+), 82 deletions(-) diff --git a/cub/benchmarks/bench/segmented_radix_sort/keys.cu b/cub/benchmarks/bench/segmented_radix_sort/keys.cu index ade3333d280..eb8b12844ab 100644 --- a/cub/benchmarks/bench/segmented_radix_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_radix_sort/keys.cu @@ -9,18 +9,14 @@ template void seg_radix_sort(nvbench::state& state, - nvbench::type_list ts, + nvbench::type_list, const thrust::device_vector& offsets, bit_entropy entropy) { - constexpr bool is_overwrite_ok = false; - using offset_t = OffsetT; using begin_offset_it_t = const offset_t*; using end_offset_it_t = const offset_t*; - using segment_size_t = cuda::std::int32_t; // same as CUB public API using key_t = T; - using value_t = cub::NullType; constexpr int begin_bit = 0; constexpr int end_bit = sizeof(key_t) * 8; @@ -31,11 +27,8 @@ void seg_radix_sort(nvbench::state& state, thrust::device_vector buffer_1 = generate(elements, entropy); thrust::device_vector buffer_2(elements); - key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data()); - key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data()); - - cub::DoubleBuffer d_keys(d_buffer_1, d_buffer_2); - cub::DoubleBuffer d_values; + const key_t* d_keys_1 = thrust::raw_pointer_cast(buffer_1.data()); + key_t* d_keys_2 = thrust::raw_pointer_cast(buffer_2.data()); begin_offset_it_t d_begin_offsets = thrust::raw_pointer_cast(offsets.data()); end_offset_it_t d_end_offsets = d_begin_offsets + 1; @@ -47,40 +40,35 @@ void seg_radix_sort(nvbench::state& state, std::size_t temp_storage_bytes{}; std::uint8_t* d_temp_storage{}; - cub::detail::segmented_radix_sort::dispatch( + cub::DeviceSegmentedRadixSort::SortKeys( d_temp_storage, temp_storage_bytes, - d_keys, - d_values, + d_keys_1, + d_keys_2, elements, segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, - is_overwrite_ok, nullptr); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cub::DoubleBuffer keys = d_keys; - cub::DoubleBuffer values = d_values; - - cub::detail::segmented_radix_sort::dispatch( + cub::DeviceSegmentedRadixSort::SortKeys( d_temp_storage, temp_storage_bytes, - keys, - values, + d_keys_1, + d_keys_2, elements, segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, - is_overwrite_ok, launch.get_stream()); }); } diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index dbd11468d72..c04d694dd92 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -550,21 +550,27 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - static_cast<::cuda::std::int64_t>(num_items), - static_cast<::cuda::std::int64_t>(num_segments), - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { + using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + static_cast<::cuda::std::int64_t>(num_items), + static_cast<::cuda::std::int64_t>(num_segments), + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + policy_selector_t{}); + }); } //! @rst @@ -1202,21 +1208,27 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { + using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + policy_selector_t{}); + }); } //! @rst @@ -1819,21 +1831,27 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { + using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + policy_selector_t{}); + }); } //! @rst @@ -2419,21 +2437,27 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { + using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + detail::identity_decomposer_t{}, + policy_selector_t{}); + }); } //! @rst diff --git a/cub/test/catch2_test_device_segmented_radix_sort_env.cu b/cub/test/catch2_test_device_segmented_radix_sort_env.cu index 9c4ea96d6c1..21f1203e886 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_env.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_env.cu @@ -803,3 +803,121 @@ TEST_CASE("DeviceSegmentedRadixSort::SortPairs DoubleBuffer uses custom stream", REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); } + +template +struct segmented_radix_bits_policy_selector +{ + _CCCL_API constexpr auto operator()(cuda::arch_id arch) const -> cub::detail::radix_sort::radix_sort_policy + { + using default_selector_t = cub::detail::radix_sort::policy_selector_from_types; + auto policy = default_selector_t{}(arch); + policy.segmented.radix_bits = RadixBits; + policy.alt_segmented.radix_bits = RadixBits; + return policy; + } +}; + +template +std::size_t measure_kernel_launches(CallableT&& run, PolicySelector policy_selector) +{ + auto env = stdexec::env{cuda::execution::__tune(policy_selector)}; + reset_kernel_launches(); + REQUIRE(cudaSuccess == run(env)); + auto launches = get_kernel_launches(); + CHECK(launches > 0); + return launches; +} + +TEST_CASE("DeviceSegmentedRadixSort::SortPairs can be tuned", "[segmented_radix_sort][device]") +{ + auto l = [&](auto env) { + auto keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + auto offsets = c2h::device_vector{0, 5'000, 10'000}; + return cub::DeviceSegmentedRadixSort::SortPairs( + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(values.data()), + static_cast(keys.size()), + 2, + offsets.begin(), + offsets.begin() + 1, + 0, + 32, + env); + }; + + auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + CHECK(launches_4b != launches_5b); +} + +TEST_CASE("DeviceSegmentedRadixSort::SortPairsDescending can be tuned", "[segmented_radix_sort][device]") +{ + auto l = [&](auto env) { + auto keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + auto offsets = c2h::device_vector{0, 5'000, 10'000}; + return cub::DeviceSegmentedRadixSort::SortPairsDescending( + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(values.data()), + thrust::raw_pointer_cast(values.data()), + static_cast(keys.size()), + 2, + offsets.begin(), + offsets.begin() + 1, + 0, + 32, + env); + }; + + auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + CHECK(launches_4b != launches_5b); +} + +TEST_CASE("DeviceSegmentedRadixSort::SortKeys can be tuned", "[segmented_radix_sort][device]") +{ + auto l = [&](auto env) { + auto keys = c2h::device_vector(10'000); + auto offsets = c2h::device_vector{0, 5'000, 10'000}; + return cub::DeviceSegmentedRadixSort::SortKeys( + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(keys.data()), + static_cast(keys.size()), + 2, + offsets.begin(), + offsets.begin() + 1, + 0, + 32, + env); + }; + + auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + CHECK(launches_4b != launches_5b); +} + +TEST_CASE("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmented_radix_sort][device]") +{ + auto l = [&](auto env) { + auto keys = c2h::device_vector(10'000); + auto offsets = c2h::device_vector{0, 5'000, 10'000}; + return cub::DeviceSegmentedRadixSort::SortKeysDescending( + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(keys.data()), + static_cast(keys.size()), + 2, + offsets.begin(), + offsets.begin() + 1, + 0, + 32, + env); + }; + + auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); + CHECK(launches_4b != launches_5b); +} diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h index 53436ffdc28..df193ab27c6 100644 --- a/cub/test/catch2_test_env_launch_helper.h +++ b/cub/test/catch2_test_env_launch_helper.h @@ -65,6 +65,7 @@ struct stream_registry_factory_state_t { cuda::std::optional m_stream; cuda::std::span m_kernels; + size_t m_kernel_launches = 0; }; static CUB_RUNTIME_FUNCTION stream_registry_factory_state_t* get_stream_registry_factory_state() @@ -74,6 +75,18 @@ static CUB_RUNTIME_FUNCTION stream_registry_factory_state_t* get_stream_registry return ptr; } +static CUB_RUNTIME_FUNCTION void reset_kernel_launches() +{ + NV_IF_TARGET(NV_IS_HOST, (get_stream_registry_factory_state()->m_kernel_launches = 0;)); +} + +static CUB_RUNTIME_FUNCTION auto get_kernel_launches() -> size_t +{ + size_t launches = 0; + NV_IF_TARGET(NV_IS_HOST, (launches = get_stream_registry_factory_state()->m_kernel_launches;)); + return launches; +} + struct kernel_launcher_t : thrust::cuda_cub::detail::triple_chevron { CUB_RUNTIME_FUNCTION kernel_launcher_t( @@ -85,6 +98,7 @@ struct kernel_launcher_t : thrust::cuda_cub::detail::triple_chevron CUB_RUNTIME_FUNCTION cudaError_t doit(K kernel, Args const&... args) const { NV_IF_TARGET(NV_IS_HOST, ({ + ++get_stream_registry_factory_state()->m_kernel_launches; auto& kernels = get_stream_registry_factory_state()->m_kernels; if (!kernels.empty()) { From aafcf7b2d8a0620d0e8bbe0b5fac961fe49fba5c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 22:09:48 +0200 Subject: [PATCH 2/9] tuning env refactor --- .../device/device_segmented_radix_sort.cuh | 152 ++++++++---------- .../dispatch_segmented_radix_sort.cuh | 54 +++---- 2 files changed, 91 insertions(+), 115 deletions(-) diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index c04d694dd92..e1091657162 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -550,27 +550,23 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return detail::dispatch_with_env( - env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { - using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; - using policy_selector_t = ::cuda::std::execution:: - __query_result_or_t; - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - static_cast<::cuda::std::int64_t>(num_items), - static_cast<::cuda::std::int64_t>(num_segments), - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - {}, - policy_selector_t{}); - }); + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + static_cast<::cuda::std::int64_t>(num_items), + static_cast<::cuda::std::int64_t>(num_segments), + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + tuning_env); + }); } //! @rst @@ -1208,27 +1204,23 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return detail::dispatch_with_env( - env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { - using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; - using policy_selector_t = ::cuda::std::execution:: - __query_result_or_t; - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - {}, - policy_selector_t{}); - }); + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + tuning_env); + }); } //! @rst @@ -1831,27 +1823,23 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return detail::dispatch_with_env( - env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { - using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; - using policy_selector_t = ::cuda::std::execution:: - __query_result_or_t; - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - {}, - policy_selector_t{}); - }); + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + {}, + tuning_env); + }); } //! @rst @@ -2437,27 +2425,23 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return detail::dispatch_with_env( - env, [&]([[maybe_unused]] auto tuning_env, void* storage, size_t& bytes, auto stream) { - using default_policy_selector_t = detail::radix_sort::policy_selector_from_types; - using policy_selector_t = ::cuda::std::execution:: - __query_result_or_t; - return detail::segmented_radix_sort::dispatch( - storage, - bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - detail::identity_decomposer_t{}, - policy_selector_t{}); - }); + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { + return detail::segmented_radix_sort::dispatch( + storage, + bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + detail::identity_decomposer_t{}, + tuning_env); + }); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 23bb954acb5..8c3138a461d 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -883,21 +883,8 @@ template , - typename KernelSource = DeviceSegmentedRadixSortKernelSource< - PolicySelector, - Order, - KeyT, - ValueT, - BeginOffsetIteratorT, - EndOffsetIteratorT, - SegmentSizeT, - DecomposerT>, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> -#if _CCCL_HAS_CONCEPTS() - requires segmented_radix_sort_policy_selector -#endif // _CCCL_HAS_CONCEPTS() + typename DecomposerT = identity_decomposer_t, + typename TuningEnvT = ::cuda::std::execution::env<>> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( void* d_temp_storage, size_t& temp_storage_bytes, @@ -911,11 +898,27 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( int end_bit, bool is_overwrite_okay, cudaStream_t stream, - DecomposerT decomposer = {}, - PolicySelector policy_selector = {}, - KernelSource kernel_source = {}, - KernelLauncherFactory launcher_factory = {}) + DecomposerT decomposer = {}, + TuningEnvT = {}) { + using default_policy_selector_t = policy_selector_from_types; + using policy_selector_t = + ::cuda::std::execution::__query_result_or_t; +#if _CCCL_HAS_CONCEPTS() + static_assert(segmented_radix_sort_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + + auto kernel_source = DeviceSegmentedRadixSortKernelSource< + policy_selector_t, + Order, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + SegmentSizeT, + DecomposerT>{}; + auto launcher_factory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY{}; + if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay)) { if (d_temp_storage == nullptr) @@ -930,18 +933,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( { return error; } - const segmented_radix_sort_policy active_policy = policy_selector(cc); - -#if _CCCL_HOSTED() && defined(CUB_DEBUG_LOG) - NV_IF_TARGET(NV_IS_HOST, ({ - ::std::stringstream ss; - ss << active_policy; - _CubLog("Dispatching DeviceSegmentedRadixSort to compute capability %d.%d with tuning: %s\n", - cc.major_cap(), - cc.minor_cap(), - ss.str().c_str()); - })) -#endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG) + const segmented_radix_sort_policy active_policy = policy_selector_t{}(cc); #if _CCCL_HOSTED() && defined(CUB_DEBUG_LOG) NV_IF_TARGET(NV_IS_HOST, ({ From 8d04fe21c980c305aac3fbf44982ad0e50e15715 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 22:47:32 +0200 Subject: [PATCH 3/9] fix --- cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 8c3138a461d..29b520520f9 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -883,8 +883,9 @@ template > + typename DecomposerT = identity_decomposer_t, + typename TuningEnvT = ::cuda::std::execution::env<>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( void* d_temp_storage, size_t& temp_storage_bytes, @@ -917,7 +918,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( EndOffsetIteratorT, SegmentSizeT, DecomposerT>{}; - auto launcher_factory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY{}; + auto launcher_factory = KernelLauncherFactory{}; if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay)) { From 2781bf077015242dce11f84c8a93927269dea6cc Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 22:50:42 +0200 Subject: [PATCH 4/9] fix --- .../device/device_segmented_radix_sort.cuh | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index e1091657162..ec7a17c9233 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -699,7 +699,7 @@ public: using SegmentSizeT = ::cuda::std::int32_t; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { return detail::segmented_radix_sort::dispatch( storage, bytes, @@ -712,7 +712,9 @@ public: begin_bit, end_bit, true, - stream); + stream, + {}, + tuning_env); }); } @@ -1353,7 +1355,7 @@ public: using SegmentSizeT = ::cuda::std::int32_t; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { return detail::segmented_radix_sort::dispatch( storage, bytes, @@ -1366,7 +1368,9 @@ public: begin_bit, end_bit, true, - stream); + stream, + {}, + tuning_env); }); } @@ -1963,7 +1967,7 @@ public: // Null value type DoubleBuffer d_values; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { return detail::segmented_radix_sort::dispatch( storage, bytes, @@ -1976,7 +1980,9 @@ public: begin_bit, end_bit, true, - stream); + stream, + {}, + tuning_env); }); } @@ -2565,7 +2571,7 @@ public: // Null value type DoubleBuffer d_values; - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + return detail::dispatch_with_env(env, [&](auto tuning_env, void* storage, size_t& bytes, auto stream) { return detail::segmented_radix_sort::dispatch( storage, bytes, @@ -2578,7 +2584,9 @@ public: begin_bit, end_bit, true, - stream); + stream, + {}, + tuning_env); }); } From e47da68b87a59f1a5c47ce80c31c5fb96ec0a9be Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 23:33:15 +0200 Subject: [PATCH 5/9] tests pass --- .../dispatch_segmented_radix_sort.cuh | 4 +- ...h2_test_device_segmented_radix_sort_env.cu | 179 ++++++++++-------- cub/test/catch2_test_env_launch_helper.h | 14 -- 3 files changed, 102 insertions(+), 95 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 29b520520f9..eeb60940675 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -903,8 +903,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( TuningEnvT = {}) { using default_policy_selector_t = policy_selector_from_types; - using policy_selector_t = - ::cuda::std::execution::__query_result_or_t; + using policy_selector_t = ::cuda::std::decay_t< + ::cuda::std::execution::__query_result_or_t>; #if _CCCL_HAS_CONCEPTS() static_assert(segmented_radix_sort_policy_selector); #endif // _CCCL_HAS_CONCEPTS() diff --git a/cub/test/catch2_test_device_segmented_radix_sort_env.cu b/cub/test/catch2_test_device_segmented_radix_sort_env.cu index 21f1203e886..e0efe89d3a7 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_env.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_env.cu @@ -804,120 +804,141 @@ TEST_CASE("DeviceSegmentedRadixSort::SortPairs DoubleBuffer uses custom stream", REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); } -template -struct segmented_radix_bits_policy_selector +#if TEST_LAUNCH == 0 + +// Radix sort does not accept user-provided functors or iterators, so we cannot use the block_size_extracting_op +// approach. Instead, we pass block_size_extracting_constant_iterator for the offsets, which records blockDim.x when +// dereferenced on the device. A custom policy selector sets threads_per_block, and we verify the recorded block size +// matches. +template +struct segmented_radix_sort_block_size_tuning { - _CCCL_API constexpr auto operator()(cuda::arch_id arch) const -> cub::detail::radix_sort::radix_sort_policy + _CCCL_API constexpr auto operator()(cuda::compute_capability cc) const + -> cub::detail::segmented_radix_sort::segmented_radix_sort_policy { - using default_selector_t = cub::detail::radix_sort::policy_selector_from_types; - auto policy = default_selector_t{}(arch); - policy.segmented.radix_bits = RadixBits; - policy.alt_segmented.radix_bits = RadixBits; + using default_selector_t = cub::detail::segmented_radix_sort::policy_selector_from_types; + auto policy = default_selector_t{}(cc); + policy.segmented.threads_per_block = ThreadsPerBlock; + policy.alt_segmented.threads_per_block = ThreadsPerBlock; return policy; } }; -template -std::size_t measure_kernel_launches(CallableT&& run, PolicySelector policy_selector) -{ - auto env = stdexec::env{cuda::execution::__tune(policy_selector)}; - reset_kernel_launches(); - REQUIRE(cudaSuccess == run(env)); - auto launches = get_kernel_launches(); - CHECK(launches > 0); - return launches; -} +using block_sizes = + c2h::type_list, cuda::std::integral_constant>; -TEST_CASE("DeviceSegmentedRadixSort::SortPairs can be tuned", "[segmented_radix_sort][device]") +C2H_TEST("DeviceSegmentedRadixSort::SortPairs can be tuned", "[segmented_radix_sort][device]", block_sizes) { - auto l = [&](auto env) { - auto keys = c2h::device_vector(10'000); - auto values = c2h::device_vector(10'000); - auto offsets = c2h::device_vector{0, 5'000, 10'000}; - return cub::DeviceSegmentedRadixSort::SortPairs( + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortPairs( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(values.data()), static_cast(keys.size()), - 2, - offsets.begin(), - offsets.begin() + 1, + 1, + d_begin_offsets, + d_end_offsets, 0, - 32, - env); - }; - - auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - CHECK(launches_4b != launches_5b); + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); } -TEST_CASE("DeviceSegmentedRadixSort::SortPairsDescending can be tuned", "[segmented_radix_sort][device]") +C2H_TEST("DeviceSegmentedRadixSort::SortPairsDescending can be tuned", "[segmented_radix_sort][device]", block_sizes) { - auto l = [&](auto env) { - auto keys = c2h::device_vector(10'000); - auto values = c2h::device_vector(10'000); - auto offsets = c2h::device_vector{0, 5'000, 10'000}; - return cub::DeviceSegmentedRadixSort::SortPairsDescending( + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortPairsDescending( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(values.data()), static_cast(keys.size()), - 2, - offsets.begin(), - offsets.begin() + 1, + 1, + d_begin_offsets, + d_end_offsets, 0, - 32, - env); - }; - - auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - CHECK(launches_4b != launches_5b); + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); } -TEST_CASE("DeviceSegmentedRadixSort::SortKeys can be tuned", "[segmented_radix_sort][device]") +C2H_TEST("DeviceSegmentedRadixSort::SortKeys can be tuned", "[segmented_radix_sort][device]", block_sizes) { - auto l = [&](auto env) { - auto keys = c2h::device_vector(10'000); - auto offsets = c2h::device_vector{0, 5'000, 10'000}; - return cub::DeviceSegmentedRadixSort::SortKeys( + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortKeys( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(keys.data()), static_cast(keys.size()), - 2, - offsets.begin(), - offsets.begin() + 1, + 1, + d_begin_offsets, + d_end_offsets, 0, - 32, - env); - }; - - auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - CHECK(launches_4b != launches_5b); + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); } -TEST_CASE("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmented_radix_sort][device]") +C2H_TEST("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmented_radix_sort][device]", block_sizes) { - auto l = [&](auto env) { - auto keys = c2h::device_vector(10'000); - auto offsets = c2h::device_vector{0, 5'000, 10'000}; - return cub::DeviceSegmentedRadixSort::SortKeysDescending( + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortKeysDescending( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(keys.data()), static_cast(keys.size()), - 2, - offsets.begin(), - offsets.begin() + 1, + 1, + d_begin_offsets, + d_end_offsets, 0, - 32, - env); - }; - - auto launches_4b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - auto launches_5b = measure_kernel_launches(l, segmented_radix_bits_policy_selector{}); - CHECK(launches_4b != launches_5b); + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); } + +#endif // TEST_LAUNCH == 0 diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h index df193ab27c6..53436ffdc28 100644 --- a/cub/test/catch2_test_env_launch_helper.h +++ b/cub/test/catch2_test_env_launch_helper.h @@ -65,7 +65,6 @@ struct stream_registry_factory_state_t { cuda::std::optional m_stream; cuda::std::span m_kernels; - size_t m_kernel_launches = 0; }; static CUB_RUNTIME_FUNCTION stream_registry_factory_state_t* get_stream_registry_factory_state() @@ -75,18 +74,6 @@ static CUB_RUNTIME_FUNCTION stream_registry_factory_state_t* get_stream_registry return ptr; } -static CUB_RUNTIME_FUNCTION void reset_kernel_launches() -{ - NV_IF_TARGET(NV_IS_HOST, (get_stream_registry_factory_state()->m_kernel_launches = 0;)); -} - -static CUB_RUNTIME_FUNCTION auto get_kernel_launches() -> size_t -{ - size_t launches = 0; - NV_IF_TARGET(NV_IS_HOST, (launches = get_stream_registry_factory_state()->m_kernel_launches;)); - return launches; -} - struct kernel_launcher_t : thrust::cuda_cub::detail::triple_chevron { CUB_RUNTIME_FUNCTION kernel_launcher_t( @@ -98,7 +85,6 @@ struct kernel_launcher_t : thrust::cuda_cub::detail::triple_chevron CUB_RUNTIME_FUNCTION cudaError_t doit(K kernel, Args const&... args) const { NV_IF_TARGET(NV_IS_HOST, ({ - ++get_stream_registry_factory_state()->m_kernel_launches; auto& kernels = get_stream_registry_factory_state()->m_kernels; if (!kernels.empty()) { From 58bbb0650812bcb485259aeb31ccdb67f4e1dc51 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 23:48:32 +0200 Subject: [PATCH 6/9] decomposer comment --- cub/cub/device/device_segmented_radix_sort.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index ec7a17c9233..0f3264ff162 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -564,7 +564,7 @@ public: end_bit, false, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -713,7 +713,7 @@ public: end_bit, true, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -1220,7 +1220,7 @@ public: end_bit, false, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -1369,7 +1369,7 @@ public: end_bit, true, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -1841,7 +1841,7 @@ public: end_bit, false, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -1981,7 +1981,7 @@ public: end_bit, true, stream, - {}, + /* decomposer */ {}, tuning_env); }); } @@ -2445,7 +2445,7 @@ public: end_bit, false, stream, - detail::identity_decomposer_t{}, + /* decomposer */ {}, tuning_env); }); } @@ -2585,7 +2585,7 @@ public: end_bit, true, stream, - {}, + /* decomposer */ {}, tuning_env); }); } From f1c305207c9e996afd8f4a3995a2acd5663a337b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 15 May 2026 23:53:46 +0200 Subject: [PATCH 7/9] more tets --- ...h2_test_device_segmented_radix_sort_env.cu | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/cub/test/catch2_test_device_segmented_radix_sort_env.cu b/cub/test/catch2_test_device_segmented_radix_sort_env.cu index e0efe89d3a7..c681a063e43 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_env.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_env.cu @@ -941,4 +941,132 @@ C2H_TEST("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmente REQUIRE(d_block_size[0] == target_block_size); } +C2H_TEST("DeviceSegmentedRadixSort::SortPairs DoubleBuffer can be tuned", "[segmented_radix_sort][device]", block_sizes) +{ + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto alt_keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + auto alt_values = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + cub::DoubleBuffer d_keys(thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(alt_keys.data())); + cub::DoubleBuffer d_values(thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(alt_values.data())); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortPairs( + d_keys, + d_values, + static_cast(keys.size()), + 1, + d_begin_offsets, + d_end_offsets, + 0, + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); +} + +C2H_TEST("DeviceSegmentedRadixSort::SortPairsDescending DoubleBuffer can be tuned", + "[segmented_radix_sort][device]", + block_sizes) +{ + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto alt_keys = c2h::device_vector(10'000); + auto values = c2h::device_vector(10'000); + auto alt_values = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + cub::DoubleBuffer d_keys(thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(alt_keys.data())); + cub::DoubleBuffer d_values(thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(alt_values.data())); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_keys, + d_values, + static_cast(keys.size()), + 1, + d_begin_offsets, + d_end_offsets, + 0, + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); +} + +C2H_TEST("DeviceSegmentedRadixSort::SortKeys DoubleBuffer can be tuned", "[segmented_radix_sort][device]", block_sizes) +{ + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto alt_keys = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + cub::DoubleBuffer d_keys(thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(alt_keys.data())); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortKeys( + d_keys, + static_cast(keys.size()), + 1, + d_begin_offsets, + d_end_offsets, + 0, + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); +} + +C2H_TEST("DeviceSegmentedRadixSort::SortKeysDescending DoubleBuffer can be tuned", + "[segmented_radix_sort][device]", + block_sizes) +{ + constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; + + auto keys = c2h::device_vector(10'000); + auto alt_keys = c2h::device_vector(10'000); + c2h::device_vector d_block_size(1, 0); + + cub::DoubleBuffer d_keys(thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(alt_keys.data())); + + auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); + auto d_end_offsets = block_size_extracting_constant_iterator(10'000, thrust::raw_pointer_cast(d_block_size.data())); + + auto env = cuda::execution::tune(segmented_radix_sort_block_size_tuning{}); + + REQUIRE( + cudaSuccess + == cub::DeviceSegmentedRadixSort::SortKeysDescending( + d_keys, + static_cast(keys.size()), + 1, + d_begin_offsets, + d_end_offsets, + 0, + static_cast(sizeof(int) * 8), + env)); + REQUIRE(d_block_size[0] == target_block_size); +} + #endif // TEST_LAUNCH == 0 From 88687f307b3dc145e4827e7a9c926859d3cdf788 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 16 May 2026 00:10:09 +0200 Subject: [PATCH 8/9] Fix tests --- ...h2_test_device_segmented_radix_sort_env.cu | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/cub/test/catch2_test_device_segmented_radix_sort_env.cu b/cub/test/catch2_test_device_segmented_radix_sort_env.cu index c681a063e43..830e043efb6 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_env.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_env.cu @@ -831,8 +831,10 @@ C2H_TEST("DeviceSegmentedRadixSort::SortPairs can be tuned", "[segmented_radix_s { constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; - auto keys = c2h::device_vector(10'000); - auto values = c2h::device_vector(10'000); + auto keys_in = c2h::device_vector(10'000); + auto keys_out = c2h::device_vector(10'000); + auto values_in = c2h::device_vector(10'000); + auto values_out = c2h::device_vector(10'000); c2h::device_vector d_block_size(1, 0); auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); @@ -843,11 +845,11 @@ C2H_TEST("DeviceSegmentedRadixSort::SortPairs can be tuned", "[segmented_radix_s REQUIRE( cudaSuccess == cub::DeviceSegmentedRadixSort::SortPairs( - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(values.data()), - thrust::raw_pointer_cast(values.data()), - static_cast(keys.size()), + thrust::raw_pointer_cast(keys_in.data()), + thrust::raw_pointer_cast(keys_out.data()), + thrust::raw_pointer_cast(values_in.data()), + thrust::raw_pointer_cast(values_out.data()), + static_cast(keys_in.size()), 1, d_begin_offsets, d_end_offsets, @@ -861,8 +863,10 @@ C2H_TEST("DeviceSegmentedRadixSort::SortPairsDescending can be tuned", "[segment { constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; - auto keys = c2h::device_vector(10'000); - auto values = c2h::device_vector(10'000); + auto keys_in = c2h::device_vector(10'000); + auto keys_out = c2h::device_vector(10'000); + auto values_in = c2h::device_vector(10'000); + auto values_out = c2h::device_vector(10'000); c2h::device_vector d_block_size(1, 0); auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); @@ -873,11 +877,11 @@ C2H_TEST("DeviceSegmentedRadixSort::SortPairsDescending can be tuned", "[segment REQUIRE( cudaSuccess == cub::DeviceSegmentedRadixSort::SortPairsDescending( - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(values.data()), - thrust::raw_pointer_cast(values.data()), - static_cast(keys.size()), + thrust::raw_pointer_cast(keys_in.data()), + thrust::raw_pointer_cast(keys_out.data()), + thrust::raw_pointer_cast(values_in.data()), + thrust::raw_pointer_cast(values_out.data()), + static_cast(keys_in.size()), 1, d_begin_offsets, d_end_offsets, @@ -891,7 +895,8 @@ C2H_TEST("DeviceSegmentedRadixSort::SortKeys can be tuned", "[segmented_radix_so { constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; - auto keys = c2h::device_vector(10'000); + auto keys_in = c2h::device_vector(10'000); + auto keys_out = c2h::device_vector(10'000); c2h::device_vector d_block_size(1, 0); auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); @@ -902,9 +907,9 @@ C2H_TEST("DeviceSegmentedRadixSort::SortKeys can be tuned", "[segmented_radix_so REQUIRE( cudaSuccess == cub::DeviceSegmentedRadixSort::SortKeys( - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(keys.data()), - static_cast(keys.size()), + thrust::raw_pointer_cast(keys_in.data()), + thrust::raw_pointer_cast(keys_out.data()), + static_cast(keys_in.size()), 1, d_begin_offsets, d_end_offsets, @@ -918,7 +923,8 @@ C2H_TEST("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmente { constexpr unsigned int target_block_size = c2h::get<0, TestType>::value; - auto keys = c2h::device_vector(10'000); + auto keys_in = c2h::device_vector(10'000); + auto keys_out = c2h::device_vector(10'000); c2h::device_vector d_block_size(1, 0); auto d_begin_offsets = block_size_extracting_constant_iterator(0, thrust::raw_pointer_cast(d_block_size.data())); @@ -929,9 +935,9 @@ C2H_TEST("DeviceSegmentedRadixSort::SortKeysDescending can be tuned", "[segmente REQUIRE( cudaSuccess == cub::DeviceSegmentedRadixSort::SortKeysDescending( - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(keys.data()), - static_cast(keys.size()), + thrust::raw_pointer_cast(keys_in.data()), + thrust::raw_pointer_cast(keys_out.data()), + static_cast(keys_in.size()), 1, d_begin_offsets, d_end_offsets, From 4817007868f1de6274ffaaa1ae41efbc3428a508 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 16 May 2026 02:00:40 +0200 Subject: [PATCH 9/9] missing include --- cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index eeb60940675..4b90a2f1978 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include