NVIDIA · bernhardmgruber · May 18, 2026 · May 12, 2026 · May 14, 2026
@@ -33,20 +33,18 @@
 #  endif // TUNE_LOAD
 
 template <typename InputT>
-struct policy_hub_t
+struct policy_selector
 {
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
+    -> cub::detail::select::select_if_policy
   {
-    using SelectIfPolicyT =
-      cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
-                               TUNE_ITEMS_PER_THREAD,
-                               TUNE_LOAD_ALGORITHM,
-                               TUNE_LOAD_MODIFIER,
-                               cub::BLOCK_SCAN_WARP_SCANS,
-                               delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
+    return {TUNE_THREADS_PER_BLOCK,
+            TUNE_ITEMS_PER_THREAD,
+            TUNE_LOAD_ALGORITHM,
+            TUNE_LOAD_MODIFIER,
+            cub::BLOCK_SCAN_WARP_SCANS,
+            delay_constructor_policy};
+  }
 };
 #endif // TUNE_BASE
 
@@ -70,31 +68,11 @@ void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_o
 template <typename T, typename OffsetT, typename UseDistinctPartitionT>
 void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
 {
-  using input_it_t                           = const T*;
-  using flag_it_t                            = const bool*;
-  using num_selected_it_t                    = OffsetT*;
-  using select_op_t                          = cub::NullType;
-  using equality_op_t                        = cub::NullType;
   using offset_t                             = OffsetT;
   constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
   using output_it_t                          = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;
 
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    cub::SelectImpl::Partition
-#if !TUNE_BASE
-    ,
-    policy_hub_t<T>
-#endif // TUNE_BASE
-    >;
-
   // Retrieve axis parameters
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
@@ -106,9 +84,9 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   thrust::device_vector<offset_t> num_selected(1);
   thrust::device_vector<T> out(elements);
 
-  input_it_t d_in                  = thrust::raw_pointer_cast(in.data());
-  flag_it_t d_flags                = thrust::raw_pointer_cast(flags.data());
-  num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
+  const T* d_in            = thrust::raw_pointer_cast(in.data());
+  const bool* d_flags      = thrust::raw_pointer_cast(flags.data());
+  offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());
   output_it_t d_out{};
   init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out);
 
@@ -118,25 +96,25 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   state.add_global_memory_writes<T>(elements);
   state.add_global_memory_writes<offset_t>(1);
 
-  std::size_t temp_size{};
-  dispatch_t::Dispatch(
-    nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, nullptr);
-
-  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
-  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
-
+  caching_allocator_t alloc;
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      temp_storage,
-      temp_size,
+    auto env = cub_bench_env(
+      alloc,
+      launch
+#if !TUNE_BASE
+      ,
+      cuda::execution::tune(policy_selector<T>{})
+#endif // !TUNE_BASE
+    );
+    _CCCL_TRY_CUDA_API(
+      cub::DevicePartition::Flagged,
+      "Flagged failed",
       d_in,
       d_flags,
       d_out,
       d_num_selected,
-      select_op_t{},
-      equality_op_t{},
-      elements,
-      launch.get_stream());
+      static_cast<offset_t>(elements),
+      env);
   });
 }
 

@@ -33,20 +33,18 @@
 #  endif // TUNE_LOAD
 
 template <typename InputT>
-struct policy_hub_t
+struct policy_selector
 {
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
+    -> cub::detail::select::select_if_policy
   {
-    using SelectIfPolicyT =
-      cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
-                               TUNE_ITEMS_PER_THREAD,
-                               TUNE_LOAD_ALGORITHM,
-                               TUNE_LOAD_MODIFIER,
-                               cub::BLOCK_SCAN_WARP_SCANS,
-                               delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
+    return {TUNE_THREADS_PER_BLOCK,
+            TUNE_ITEMS_PER_THREAD,
+            TUNE_LOAD_ALGORITHM,
+            TUNE_LOAD_MODIFIER,
+            cub::BLOCK_SCAN_WARP_SCANS,
+            delay_constructor_policy};
+  }
 };
 #endif // !TUNE_BASE
 
@@ -71,31 +69,12 @@ void init_output_partition_buffer(InItT, OffsetT, T* d_out, SelectOpT, T*& d_par
 template <typename T, typename OffsetT, typename UseDistinctPartitionT>
 void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
 {
-  using input_it_t                           = const T*;
-  using flag_it_t                            = cub::NullType*;
-  using num_selected_it_t                    = OffsetT*;
   using select_op_t                          = less_then_t<T>;
-  using equality_op_t                        = cub::NullType;
   using offset_t                             = OffsetT;
   constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
   using output_it_t                          = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;
 
-  using dispatch_t = cub::DispatchSelectIf<
-    input_it_t,
-    flag_it_t,
-    output_it_t,
-    num_selected_it_t,
-    select_op_t,
-    equality_op_t,
-    offset_t,
-    cub::SelectImpl::Partition
-#if !TUNE_BASE
-    ,
-    policy_hub_t<T>
-#endif // !TUNE_BASE
-    >;
-
   // Retrieve axis parameters
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
@@ -108,9 +87,8 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
 
   thrust::device_vector<T> out(elements);
 
-  input_it_t d_in                  = thrust::raw_pointer_cast(in.data());
-  flag_it_t d_flags                = nullptr;
-  num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
+  const T* d_in            = thrust::raw_pointer_cast(in.data());
+  offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());
   output_it_t d_out{};
   init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out);
 
@@ -119,25 +97,25 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
   state.add_global_memory_writes<T>(elements);
   state.add_global_memory_writes<offset_t>(1);
 
-  std::size_t temp_size{};
-  dispatch_t::Dispatch(
-    nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, nullptr);
-
-  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
-  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
-
+  caching_allocator_t alloc;
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      temp_storage,
-      temp_size,
+    auto env = cub_bench_env(
+      alloc,
+      launch
+#if !TUNE_BASE
+      ,
+      cuda::execution::tune(policy_selector<T>{})
+#endif // !TUNE_BASE
+    );
+    _CCCL_TRY_CUDA_API(
+      cub::DevicePartition::If,
+      "If failed",
       d_in,
-      d_flags,
       d_out,
       d_num_selected,
+      static_cast<offset_t>(elements),
       select_op,
-      equality_op_t{},
-      elements,
-      launch.get_stream());
+      env);
   });
 }
 

@@ -1,8 +1,7 @@
 // SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
-#include <cub/device/dispatch/dispatch_three_way_partition.cuh>
-#include <cub/device/dispatch/tuning/tuning_three_way_partition.cuh>
+#include <cub/device/device_partition.cuh>
 
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
@@ -16,7 +15,7 @@
 
 #if !TUNE_BASE
 template <typename InputT>
-struct tuned_policy_selector_t
+struct policy_selector
 {
   [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
     -> cub::detail::three_way_partition::three_way_partition_policy
@@ -26,19 +25,16 @@ struct tuned_policy_selector_t
             TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
             cub::LOAD_DEFAULT,
             cub::BLOCK_SCAN_WARP_SCANS,
-            cub::detail::delay_constructor_policy_from_type<delay_constructor_t>};
+            delay_constructor_policy};
   }
 };
 #endif // !TUNE_BASE
 
 template <typename T, typename OffsetT>
 void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
 {
-  using input_it_t        = const T*;
-  using output_it_t       = T*;
-  using num_selected_it_t = OffsetT*;
-  using select_op_t       = less_then_t<T>;
-  using offset_t          = OffsetT;
+  using select_op_t = less_then_t<T>;
+  using offset_t    = OffsetT;
 
   // Retrieve axis parameters
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
@@ -54,50 +50,44 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   select_op_t select_op_2{right_border};
 
   thrust::device_vector<T> in = generate(elements, entropy, min_val, max_val);
-  thrust::device_vector<offset_t> num_selected(1);
+  thrust::device_vector<offset_t> num_selected(2);
   thrust::device_vector<T> out_1(elements);
   thrust::device_vector<T> out_2(elements);
   thrust::device_vector<T> out_3(elements);
 
-  input_it_t d_in                  = thrust::raw_pointer_cast(in.data());
-  output_it_t d_out_1              = thrust::raw_pointer_cast(out_1.data());
-  output_it_t d_out_2              = thrust::raw_pointer_cast(out_2.data());
-  output_it_t d_out_3              = thrust::raw_pointer_cast(out_3.data());
-  num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
+  const T* d_in            = thrust::raw_pointer_cast(in.data());
+  T* d_out_1               = thrust::raw_pointer_cast(out_1.data());
+  T* d_out_2               = thrust::raw_pointer_cast(out_2.data());
+  T* d_out_3               = thrust::raw_pointer_cast(out_3.data());
+  offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements);
   state.add_global_memory_writes<T>(elements);
-  state.add_global_memory_writes<offset_t>(1);
+  state.add_global_memory_writes<offset_t>(2);
 
-  std::size_t temp_size{};
-  auto dispatch = [&](void* temp_storage, cudaStream_t stream) {
-    return cub::detail::three_way_partition::dispatch(
-      temp_storage,
-      temp_size,
+  caching_allocator_t alloc;
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    auto env = cub_bench_env(
+      alloc,
+      launch
+#if !TUNE_BASE
+      ,
+      cuda::execution::tune(policy_selector<T>{})
+#endif // !TUNE_BASE
+    );
+    _CCCL_TRY_CUDA_API(
+      cub::DevicePartition::If,
+      "If three-way failed",
       d_in,
       d_out_1,
       d_out_2,
       d_out_3,
       d_num_selected,
+      static_cast<offset_t>(elements),
       select_op_1,
       select_op_2,
-      static_cast<offset_t>(elements),
-      stream
-#if !TUNE_BASE
-      ,
-      policy_selector_t{}
-#endif // !TUNE_BASE
-    );
-  };
-
-  dispatch(nullptr, nullptr);
-
-  thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
-  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
-
-  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch(temp_storage, launch.get_stream());
+      env);
   });
 }