From 1cd8c3d5524605af4c5e222c0674e807d093efcb Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 22:40:10 -0700
Subject: [PATCH 01/11] add env DeviceTopK without temp_storage args

---
 cub/cub/device/device_topk.cuh              | 868 +++++++++++++++++++-
 cub/test/catch2_test_device_topk_api.cu     |  90 ++
 cub/test/catch2_test_device_topk_env_api.cu | 332 ++++++++
 3 files changed, 1257 insertions(+), 33 deletions(-)
 create mode 100644 cub/test/catch2_test_device_topk_env_api.cu

diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh
index 0567b76149d..1deca561e8d 100644
--- a/cub/cub/device/device_topk.cuh
+++ b/cub/cub/device/device_topk.cuh
@@ -18,6 +18,7 @@
 #endif // no system header
 
 #include <cub/detail/choose_offset.cuh>
+#include <cub/detail/env_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_topk.cuh>
 
 #include <cuda/__execution/determinism.h>
@@ -293,6 +294,115 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs.
+  //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-max-pairs-env
+  //!     :end-before: example-end topk-max-pairs-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam ValueInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input values @iterator
+  //!
+  //! @tparam ValueOutputIteratorT
+  //!   **[inferred]** Random-access input iterator type for writing output values @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Random-access iterator to the input sequence containing the keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
+  //!
+  //! @param[in] d_values_in
+  //!   Random-access iterator to the input sequence containing the values associated to each key
+  //!
+  //! @param[out] d_values_out
+  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
+  //!   written to
+  //!
+  //! @param[in] num_items
+  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
+  //!
+  //! @param[in] k
+  //!   The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of
+  //!   `num_items`.
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename ValueInputIteratorT,
+    typename ValueOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    ::cuda::std::enable_if_t<!detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, EnvT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    ValueInputIteratorT d_values_in,
+    ValueOutputIteratorT d_values_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::max>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        d_values_in,
+        d_values_out,
+        num_items,
+        k,
+        detail::identity_decomposer_t{},
+        env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -433,6 +543,119 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs,
+  //! using a decomposer to interpret user-defined key types.
+  //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-max-pairs-decomposer-env
+  //!     :end-before: example-end topk-max-pairs-decomposer-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam ValueInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input values @iterator
+  //!
+  //! @tparam ValueOutputIteratorT
+  //!   **[inferred]** Random-access input iterator type for writing output values @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam DecomposerT
+  //!   **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its
+  //!   constituent arithmetic types.
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] d_values_in
+  //!   Pointer to the input values
+  //!
+  //! @param[out] d_values_out
+  //!   Pointer to the K output values
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] decomposer
+  //!   Decomposer object for interpreting user-defined key types
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <typename KeyInputIteratorT,
+            typename KeyOutputIteratorT,
+            typename ValueInputIteratorT,
+            typename ValueOutputIteratorT,
+            typename NumItemsT,
+            typename NumOutItemsT,
+            typename DecomposerT,
+            typename EnvT = ::cuda::std::execution::env<>>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+    ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
+                             cudaError_t>
+    MaxPairs(KeyInputIteratorT d_keys_in,
+             KeyOutputIteratorT d_keys_out,
+             ValueInputIteratorT d_values_in,
+             ValueOutputIteratorT d_values_out,
+             NumItemsT num_items,
+             NumOutItemsT k,
+             DecomposerT decomposer,
+             EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs");
+    using key_t = detail::it_value_t<KeyInputIteratorT>;
+
+    static_assert(!detail::radix::can_twiddle<key_t>,
+                  "Custom decomposers are not supported for fundamental types; "
+                  "use the non-decomposer API overload instead");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::max>(
+        storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -545,6 +768,109 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-min-pairs-env
+  //!     :end-before: example-end topk-min-pairs-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam ValueInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input values @iterator
+  //!
+  //! @tparam ValueOutputIteratorT
+  //!   **[inferred]** Random-access input iterator type for writing output values @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] d_values_in
+  //!   Pointer to the input values
+  //!
+  //! @param[out] d_values_out
+  //!   Pointer to the K output values
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename ValueInputIteratorT,
+    typename ValueOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    ::cuda::std::enable_if_t<!detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, EnvT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    ValueInputIteratorT d_values_in,
+    ValueOutputIteratorT d_values_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::min>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        d_values_in,
+        d_values_out,
+        num_items,
+        k,
+        detail::identity_decomposer_t{},
+        env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -608,40 +934,157 @@ struct DeviceTopK
   //!   **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its
   //!   constituent arithmetic types.
   //!
-  //! @param[in] d_temp_storage
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
-  //!   required allocation size is written to `temp_storage_bytes` and no work is done.
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] d_keys_in
+  //!   Random-access iterator to the input sequence containing the keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
+  //!
+  //! @param[in] d_values_in
+  //!   Random-access iterator to the input sequence containing the values associated to each key
+  //!
+  //! @param[out] d_values_out
+  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
+  //!   written to
+  //!
+  //! @param[in] num_items
+  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
+  //!
+  //! @param[in] k
+  //!   The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of
+  //!   `num_items`.
+  //!
+  //! @param decomposer
+  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
+  //!   types.
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+  //!   @endrst
+  template <typename KeyInputIteratorT,
+            typename KeyOutputIteratorT,
+            typename ValueInputIteratorT,
+            typename ValueOutputIteratorT,
+            typename NumItemsT,
+            typename NumOutItemsT,
+            typename DecomposerT,
+            typename EnvT = ::cuda::std::execution::env<>>
+  CUB_RUNTIME_FUNCTION static //
+    ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
+                             cudaError_t>
+    MinPairs(void* d_temp_storage,
+             size_t& temp_storage_bytes,
+             KeyInputIteratorT d_keys_in,
+             KeyOutputIteratorT d_keys_out,
+             ValueInputIteratorT d_values_in,
+             ValueOutputIteratorT d_values_out,
+             NumItemsT num_items,
+             NumOutItemsT k,
+             DecomposerT decomposer,
+             EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MinPairs");
+    using key_t = detail::it_value_t<KeyInputIteratorT>;
+
+    static_assert(!detail::radix::can_twiddle<key_t>,
+                  "Custom decomposers are not supported for fundamental types; "
+                  "use the non-decomposer API overload instead");
+
+    return detail::dispatch_topk<detail::topk::select::min>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_keys_out,
+      d_values_in,
+      d_values_out,
+      num_items,
+      k,
+      decomposer,
+      ::cuda::std::move(env));
+  }
+
+  //! @rst
+  //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs,
+  //! using a decomposer to interpret user-defined key types.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-min-pairs-decomposer-env
+  //!     :end-before: example-end topk-min-pairs-decomposer-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam ValueInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input values @iterator
+  //!
+  //! @tparam ValueOutputIteratorT
+  //!   **[inferred]** Random-access input iterator type for writing output values @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam DecomposerT
+  //!   **[inferred]** Type of decomposer
   //!
-  //! @param[in,out] temp_storage_bytes
-  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Random-access iterator to the input sequence containing the keys
+  //!   Pointer to the input keys
   //!
   //! @param[out] d_keys_out
-  //!   Random-access iterator to the output sequence of keys, where K values will be written to
+  //!   Pointer to the K output keys
   //!
   //! @param[in] d_values_in
-  //!   Random-access iterator to the input sequence containing the values associated to each key
+  //!   Pointer to the input values
   //!
   //! @param[out] d_values_out
-  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
-  //!   written to
+  //!   Pointer to the K output values
   //!
   //! @param[in] num_items
-  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
+  //!   Number of input items
   //!
   //! @param[in] k
-  //!   The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of
-  //!   `num_items`.
+  //!   The K value
   //!
-  //! @param decomposer
-  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
-  //!   types.
+  //! @param[in] decomposer
+  //!   Decomposer object for interpreting user-defined key types
   //!
   //! @param[in] env
   //!   @rst
-  //!   **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
   template <typename KeyInputIteratorT,
             typename KeyOutputIteratorT,
@@ -651,12 +1094,10 @@ struct DeviceTopK
             typename NumOutItemsT,
             typename DecomposerT,
             typename EnvT = ::cuda::std::execution::env<>>
-  CUB_RUNTIME_FUNCTION static //
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
     ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
                              cudaError_t>
-    MinPairs(void* d_temp_storage,
-             size_t& temp_storage_bytes,
-             KeyInputIteratorT d_keys_in,
+    MinPairs(KeyInputIteratorT d_keys_in,
              KeyOutputIteratorT d_keys_out,
              ValueInputIteratorT d_values_in,
              ValueOutputIteratorT d_values_out,
@@ -665,24 +1106,17 @@ struct DeviceTopK
              DecomposerT decomposer,
              EnvT env = {})
   {
-    _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MinPairs");
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs");
     using key_t = detail::it_value_t<KeyInputIteratorT>;
 
     static_assert(!detail::radix::can_twiddle<key_t>,
                   "Custom decomposers are not supported for fundamental types; "
                   "use the non-decomposer API overload instead");
 
-    return detail::dispatch_topk<detail::topk::select::min>(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_keys_in,
-      d_keys_out,
-      d_values_in,
-      d_values_out,
-      num_items,
-      k,
-      decomposer,
-      ::cuda::std::move(env));
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::min>(
+        storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
+    });
   }
 
   //! @rst
@@ -780,6 +1214,89 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the largest K keys from an unordered input sequence.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-max-keys-env
+  //!     :end-before: example-end topk-max-keys-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    ::cuda::std::enable_if_t<!detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, EnvT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys(
+    KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::max>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        static_cast<NullType*>(nullptr),
+        static_cast<NullType*>(nullptr),
+        num_items,
+        k,
+        detail::identity_decomposer_t{},
+        env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -903,6 +1420,107 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the largest K keys from an unordered input sequence,
+  //! using a decomposer to interpret user-defined key types.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-max-keys-decomposer-env
+  //!     :end-before: example-end topk-max-keys-decomposer-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam DecomposerT
+  //!   **[inferred]** Type of decomposer
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] decomposer
+  //!   Decomposer object for interpreting user-defined key types
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <typename KeyInputIteratorT,
+            typename KeyOutputIteratorT,
+            typename NumItemsT,
+            typename NumOutItemsT,
+            typename DecomposerT,
+            typename EnvT = ::cuda::std::execution::env<>>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+    ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
+                             cudaError_t>
+    MaxKeys(KeyInputIteratorT d_keys_in,
+            KeyOutputIteratorT d_keys_out,
+            NumItemsT num_items,
+            NumOutItemsT k,
+            DecomposerT decomposer,
+            EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys");
+    using key_t = detail::it_value_t<KeyInputIteratorT>;
+
+    static_assert(!detail::radix::can_twiddle<key_t>,
+                  "Custom decomposers are not supported for fundamental types; "
+                  "use the non-decomposer API overload instead");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::max>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        static_cast<NullType*>(nullptr),
+        static_cast<NullType*>(nullptr),
+        num_items,
+        k,
+        decomposer,
+        env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -998,6 +1616,89 @@ struct DeviceTopK
       ::cuda::std::move(env));
   }
 
+  //! @rst
+  //! Finds the smallest K keys from an unordered input sequence.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-min-keys-env
+  //!     :end-before: example-end topk-min-keys-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    ::cuda::std::enable_if_t<!detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, EnvT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys(
+    KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::min>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        static_cast<NullType*>(nullptr),
+        static_cast<NullType*>(nullptr),
+        num_items,
+        k,
+        detail::identity_decomposer_t{},
+        env);
+    });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -1120,6 +1821,107 @@ struct DeviceTopK
       decomposer,
       ::cuda::std::move(env));
   }
+
+  //! @rst
+  //! Finds the smallest K keys from an unordered input sequence,
+  //! using a decomposer to interpret user-defined key types.
+  //!
+  //! .. versionadded:: 3.5.0
+  //!    First appears in CUDA Toolkit 13.5.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Unlike the temp-storage overload, this overload allocates and manages the required temporary
+  //! storage internally using the memory resource queried from the environment.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin topk-min-keys-decomposer-env
+  //!     :end-before: example-end topk-min-keys-decomposer-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyInputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input keys @iterator
+  //!
+  //! @tparam KeyOutputIteratorT
+  //!   **[inferred]** Random-access output iterator type for writing output keys @iterator
+  //!
+  //! @tparam NumItemsT
+  //!  The integral type of variable num_items
+  //!
+  //! @tparam NumOutItemsT
+  //!  The integral type of variable k
+  //!
+  //! @tparam DecomposerT
+  //!   **[inferred]** Type of decomposer
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
+  //! @param[in] d_keys_in
+  //!   Pointer to the input keys
+  //!
+  //! @param[out] d_keys_out
+  //!   Pointer to the K output keys
+  //!
+  //! @param[in] num_items
+  //!   Number of input items
+  //!
+  //! @param[in] k
+  //!   The K value
+  //!
+  //! @param[in] decomposer
+  //!   Decomposer object for interpreting user-defined key types
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <typename KeyInputIteratorT,
+            typename KeyOutputIteratorT,
+            typename NumItemsT,
+            typename NumOutItemsT,
+            typename DecomposerT,
+            typename EnvT = ::cuda::std::execution::env<>>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+    ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
+                             cudaError_t>
+    MinKeys(KeyInputIteratorT d_keys_in,
+            KeyOutputIteratorT d_keys_out,
+            NumItemsT num_items,
+            NumOutItemsT k,
+            DecomposerT decomposer,
+            EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys");
+    using key_t = detail::it_value_t<KeyInputIteratorT>;
+
+    static_assert(!detail::radix::can_twiddle<key_t>,
+                  "Custom decomposers are not supported for fundamental types; "
+                  "use the non-decomposer API overload instead");
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+      return detail::dispatch_topk<detail::topk::select::min>(
+        storage,
+        bytes,
+        d_keys_in,
+        d_keys_out,
+        static_cast<NullType*>(nullptr),
+        static_cast<NullType*>(nullptr),
+        num_items,
+        k,
+        decomposer,
+        env);
+    });
+  }
 };
 
 CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu
index 8b7d7a52a70..a988e2ca7ff 100644
--- a/cub/test/catch2_test_device_topk_api.cu
+++ b/cub/test/catch2_test_device_topk_api.cu
@@ -495,3 +495,93 @@ C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]")
     REQUIRE(expected_vals == vals_out);
   }
 }
+
+// Guard tests: each public DeviceTopK method's legacy temp-storage form must resolve
+// unambiguously when called in its minimal form (no explicit stream, all defaults
+// implicit), even though the env-alloc overloads (which drop the temp-storage args)
+// also live in scope. If the env-overload SFINAE drifts, these become "ambiguous
+// overload" compile errors.
+
+C2H_TEST("DeviceTopK::MaxKeys legacy size-query is unambiguous", "[topk][device]")
+{
+  void* d_temp_storage      = nullptr;
+  size_t temp_storage_bytes = 0;
+  int* d_keys_in            = nullptr;
+  int* d_keys_out           = nullptr;
+  int num_items             = 0;
+  int k                     = 0;
+
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MaxKeys(
+            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements));
+}
+
+C2H_TEST("DeviceTopK::MinKeys legacy size-query is unambiguous", "[topk][device]")
+{
+  void* d_temp_storage      = nullptr;
+  size_t temp_storage_bytes = 0;
+  int* d_keys_in            = nullptr;
+  int* d_keys_out           = nullptr;
+  int num_items             = 0;
+  int k                     = 0;
+
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MinKeys(
+            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements));
+}
+
+C2H_TEST("DeviceTopK::MaxPairs legacy size-query is unambiguous", "[topk][device]")
+{
+  void* d_temp_storage      = nullptr;
+  size_t temp_storage_bytes = 0;
+  int* d_keys_in            = nullptr;
+  int* d_keys_out           = nullptr;
+  int* d_values_in          = nullptr;
+  int* d_values_out         = nullptr;
+  int num_items             = 0;
+  int k                     = 0;
+
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MaxPairs(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            num_items,
+            k,
+            requirements));
+}
+
+C2H_TEST("DeviceTopK::MinPairs legacy size-query is unambiguous", "[topk][device]")
+{
+  void* d_temp_storage      = nullptr;
+  size_t temp_storage_bytes = 0;
+  int* d_keys_in            = nullptr;
+  int* d_keys_out           = nullptr;
+  int* d_values_in          = nullptr;
+  int* d_values_out         = nullptr;
+  int num_items             = 0;
+  int k                     = 0;
+
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MinPairs(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            num_items,
+            k,
+            requirements));
+}
diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu
new file mode 100644
index 00000000000..6d5964a5f2b
--- /dev/null
+++ b/cub/test/catch2_test_device_topk_env_api.cu
@@ -0,0 +1,332 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_topk.cuh>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include <cuda/__execution/determinism.h>
+#include <cuda/__execution/output_ordering.h>
+#include <cuda/__execution/require.h>
+#include <cuda/devices>
+#include <cuda/std/__execution/env.h>
+#include <cuda/std/functional>
+#include <cuda/stream>
+
+#include <algorithm>
+#include <iostream>
+
+#include <c2h/catch2_test_helper.h>
+
+// Simple user-defined key type for the decomposer-based examples.
+struct topk_custom_t
+{
+  int rank;
+  int payload;
+};
+
+struct topk_custom_decomposer_t
+{
+  __host__ __device__ ::cuda::std::tuple<int&> operator()(topk_custom_t& key) const
+  {
+    return {key.rank};
+  }
+  __host__ __device__ ::cuda::std::tuple<const int&> operator()(const topk_custom_t& key) const
+  {
+    return {key.rank};
+  }
+};
+
+C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-max-keys-env
+  auto d_in  = thrust::device_vector<int>{8, 6, 7, 5, 3, 0, 9, 1, 4, 2};
+  auto d_out = thrust::device_vector<int>(3);
+  int k      = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n';
+  }
+  // example-end topk-max-keys-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  // Result order is unspecified for TopK; sort before compare.
+  thrust::sort(d_out.begin(), d_out.end(), cuda::std::greater<int>{});
+  thrust::device_vector<int> expected{9, 8, 7};
+  REQUIRE(d_out == expected);
+}
+
+C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-min-keys-env
+  auto d_in  = thrust::device_vector<int>{8, 6, 7, 5, 3, 0, 9, 1, 4, 2};
+  auto d_out = thrust::device_vector<int>(3);
+  int k      = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MinKeys(d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n';
+  }
+  // example-end topk-min-keys-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::sort(d_out.begin(), d_out.end());
+  thrust::device_vector<int> expected{0, 1, 2};
+  REQUIRE(d_out == expected);
+}
+
+C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-max-pairs-env
+  auto d_keys_in    = thrust::device_vector<int>{8, 6, 7, 5, 3, 0, 9, 1, 4, 2};
+  auto d_values_in  = thrust::device_vector<int>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  auto d_keys_out   = thrust::device_vector<int>(3);
+  auto d_values_out = thrust::device_vector<int>(3);
+  int k             = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MaxPairs(
+    d_keys_in.begin(),
+    d_keys_out.begin(),
+    d_values_in.begin(),
+    d_values_out.begin(),
+    static_cast<int>(d_keys_in.size()),
+    k,
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n';
+  }
+  // example-end topk-max-pairs-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::sort(d_keys_out.begin(), d_keys_out.end(), cuda::std::greater<int>{});
+  thrust::device_vector<int> expected_keys{9, 8, 7};
+  REQUIRE(d_keys_out == expected_keys);
+}
+
+C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-min-pairs-env
+  auto d_keys_in    = thrust::device_vector<int>{8, 6, 7, 5, 3, 0, 9, 1, 4, 2};
+  auto d_values_in  = thrust::device_vector<int>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  auto d_keys_out   = thrust::device_vector<int>(3);
+  auto d_values_out = thrust::device_vector<int>(3);
+  int k             = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MinPairs(
+    d_keys_in.begin(),
+    d_keys_out.begin(),
+    d_values_in.begin(),
+    d_values_out.begin(),
+    static_cast<int>(d_keys_in.size()),
+    k,
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n';
+  }
+  // example-end topk-min-pairs-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::sort(d_keys_out.begin(), d_keys_out.end());
+  thrust::device_vector<int> expected_keys{0, 1, 2};
+  REQUIRE(d_keys_out == expected_keys);
+}
+
+C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-max-keys-decomposer-env
+  thrust::host_vector<topk_custom_t> h_in{
+    {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}};
+  thrust::device_vector<topk_custom_t> d_in = h_in;
+  thrust::device_vector<topk_custom_t> d_out(3);
+  int k = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MaxKeys(
+    d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, topk_custom_decomposer_t{}, env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n';
+  }
+  // example-end topk-max-keys-decomposer-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::host_vector<topk_custom_t> h_out = d_out;
+  std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
+    return a.rank > b.rank;
+  });
+  REQUIRE(h_out[0].rank == 9);
+  REQUIRE(h_out[1].rank == 8);
+  REQUIRE(h_out[2].rank == 7);
+}
+
+C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-min-keys-decomposer-env
+  thrust::host_vector<topk_custom_t> h_in{
+    {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}};
+  thrust::device_vector<topk_custom_t> d_in = h_in;
+  thrust::device_vector<topk_custom_t> d_out(3);
+  int k = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MinKeys(
+    d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, topk_custom_decomposer_t{}, env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n';
+  }
+  // example-end topk-min-keys-decomposer-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::host_vector<topk_custom_t> h_out = d_out;
+  std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
+    return a.rank < b.rank;
+  });
+  REQUIRE(h_out[0].rank == 0);
+  REQUIRE(h_out[1].rank == 1);
+  REQUIRE(h_out[2].rank == 2);
+}
+
+C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-max-pairs-decomposer-env
+  thrust::host_vector<topk_custom_t> h_keys_in{
+    {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}};
+  thrust::device_vector<topk_custom_t> d_keys_in = h_keys_in;
+  thrust::device_vector<int> d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  thrust::device_vector<topk_custom_t> d_keys_out(3);
+  thrust::device_vector<int> d_values_out(3);
+  int k = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MaxPairs(
+    d_keys_in.begin(),
+    d_keys_out.begin(),
+    d_values_in.begin(),
+    d_values_out.begin(),
+    static_cast<int>(d_keys_in.size()),
+    k,
+    topk_custom_decomposer_t{},
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n';
+  }
+  // example-end topk-max-pairs-decomposer-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::host_vector<topk_custom_t> h_keys_out = d_keys_out;
+  std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
+    return a.rank > b.rank;
+  });
+  REQUIRE(h_keys_out[0].rank == 9);
+  REQUIRE(h_keys_out[1].rank == 8);
+  REQUIRE(h_keys_out[2].rank == 7);
+}
+
+C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref", "[topk][env]")
+{
+  // example-begin topk-min-pairs-decomposer-env
+  thrust::host_vector<topk_custom_t> h_keys_in{
+    {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}};
+  thrust::device_vector<topk_custom_t> d_keys_in = h_keys_in;
+  thrust::device_vector<int> d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  thrust::device_vector<topk_custom_t> d_keys_out(3);
+  thrust::device_vector<int> d_values_out(3);
+  int k = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    stream_ref};
+
+  auto error = cub::DeviceTopK::MinPairs(
+    d_keys_in.begin(),
+    d_keys_out.begin(),
+    d_values_in.begin(),
+    d_values_out.begin(),
+    static_cast<int>(d_keys_in.size()),
+    k,
+    topk_custom_decomposer_t{},
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n';
+  }
+  // example-end topk-min-pairs-decomposer-env
+
+  stream.sync();
+  REQUIRE(error == cudaSuccess);
+
+  thrust::host_vector<topk_custom_t> h_keys_out = d_keys_out;
+  std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
+    return a.rank < b.rank;
+  });
+  REQUIRE(h_keys_out[0].rank == 0);
+  REQUIRE(h_keys_out[1].rank == 1);
+  REQUIRE(h_keys_out[2].rank == 2);
+}

From a4e3f4682a686860849282ca9b7d65939a7890c7 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 22:58:48 -0700
Subject: [PATCH 02/11] remove non-env api tests

---
 cub/test/catch2_test_device_topk_api.cu | 587 ------------------------
 1 file changed, 587 deletions(-)
 delete mode 100644 cub/test/catch2_test_device_topk_api.cu

diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu
deleted file mode 100644
index a988e2ca7ff..00000000000
--- a/cub/test/catch2_test_device_topk_api.cu
+++ /dev/null
@@ -1,587 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include <cub/device/device_topk.cuh>
-
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include <thrust/sort.h>
-
-#include <cuda/__execution/determinism.h>
-#include <cuda/__execution/output_ordering.h>
-#include <cuda/__execution/require.h>
-#include <cuda/iterator>
-#include <cuda/std/functional>
-#include <cuda/stream>
-
-#include <c2h/catch2_test_helper.h>
-
-C2H_TEST("DeviceTopK::MinKeys API example for non-deterministic, unsorted results", "[device][device_transform]")
-{
-  // example-begin topk-min-keys-non-deterministic-unsorted
-  const int k = 4;
-  auto input  = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
-  auto output = thrust::device_vector<int>(k, thrust::no_init);
-
-  // Specify that we do not require a specific output order and do not require deterministic results
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-  // Prepare CUDA stream
-  cudaStream_t stream = nullptr;
-  cudaStreamCreate(&stream);
-  cuda::stream_ref stream_ref{stream};
-
-  // Create the environment with the stream and requirements
-  auto env = cuda::std::execution::env{stream_ref, requirements};
-
-  // Query temporary storage requirements
-  size_t temp_storage_bytes{};
-  cub::DeviceTopK::MinKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env);
-
-  // Allocate temporary storage
-  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
-
-  cub::DeviceTopK::MinKeys(
-    thrust::raw_pointer_cast(temp_storage.data()),
-    temp_storage_bytes,
-    input.begin(),
-    output.begin(),
-    input.size(),
-    k,
-    env);
-
-  // Get the top-k results into sorted order for easy comparison
-  thrust::sort(output.begin(), output.end());
-  thrust::host_vector<int> expected{-3, 1, 2, 4};
-  // example-end topk-min-keys-non-deterministic-unsorted
-
-  REQUIRE(output == expected);
-}
-
-C2H_TEST("DeviceTopK::MaxKeys API example for non-deterministic, unsorted results", "[device][device_transform]")
-{
-  // example-begin topk-max-keys-non-deterministic-unsorted
-  const int k = 4;
-  auto input  = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
-  auto output = thrust::device_vector<int>(k, thrust::no_init);
-
-  // Specify that we do not require a specific output order and do not require deterministic results
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-  // Prepare CUDA stream
-  cudaStream_t stream = nullptr;
-  cudaStreamCreate(&stream);
-  cuda::stream_ref stream_ref{stream};
-
-  // Create the environment with the stream and requirements
-  auto env = cuda::std::execution::env{stream_ref, requirements};
-
-  // Query temporary storage requirements
-  size_t temp_storage_bytes{};
-  cub::DeviceTopK::MaxKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env);
-
-  // Allocate temporary storage
-  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
-
-  cub::DeviceTopK::MaxKeys(
-    thrust::raw_pointer_cast(temp_storage.data()),
-    temp_storage_bytes,
-    input.begin(),
-    output.begin(),
-    input.size(),
-    k,
-    env);
-
-  // Get the top-k results into sorted order for easy comparison
-  thrust::sort(output.begin(), output.end(), cuda::std::greater{});
-  thrust::host_vector<int> expected{8, 7, 6, 5};
-  // example-end topk-max-keys-non-deterministic-unsorted
-
-  REQUIRE(output == expected);
-}
-
-C2H_TEST("DeviceTopK::MinPairs API example for non-deterministic, unsorted results", "[device][device_transform]")
-{
-  // example-begin topk-min-pairs-non-deterministic-unsorted
-  const int k     = 4;
-  auto keys       = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
-  auto values     = cuda::make_counting_iterator<int>(0);
-  auto keys_out   = thrust::device_vector<int>(k, thrust::no_init);
-  auto values_out = thrust::device_vector<int>(k, thrust::no_init);
-
-  // Specify that we do not require a specific output order and do not require deterministic results
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-  // Prepare CUDA stream
-  cudaStream_t stream = nullptr;
-  cudaStreamCreate(&stream);
-  cuda::stream_ref stream_ref{stream};
-
-  // Create the environment with the stream and requirements
-  auto env = cuda::std::execution::env{stream_ref, requirements};
-
-  // Query temporary storage requirements
-  size_t temp_storage_bytes{};
-  cub::DeviceTopK::MinPairs(
-    nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env);
-
-  // Allocate temporary storage
-  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
-
-  cub::DeviceTopK::MinPairs(
-    thrust::raw_pointer_cast(temp_storage.data()),
-    temp_storage_bytes,
-    keys.begin(),
-    keys_out.begin(),
-    values,
-    values_out.begin(),
-    keys.size(),
-    k,
-    env);
-
-  // Get the top-k results into sorted order for easy comparison
-  thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin());
-  thrust::host_vector<int> expected_keys{-3, 1, 2, 4};
-  thrust::host_vector<int> expected_values{1, 2, 5, 6};
-  // example-end topk-min-pairs-non-deterministic-unsorted
-
-  REQUIRE(keys_out == expected_keys);
-  REQUIRE(values_out == expected_values);
-}
-
-C2H_TEST("DeviceTopK::MaxPairs API example for non-deterministic, unsorted results", "[device][device_transform]")
-{
-  // example-begin topk-max-pairs-non-deterministic-unsorted
-  const int k     = 4;
-  auto keys       = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
-  auto values     = cuda::make_counting_iterator<int>(0);
-  auto keys_out   = thrust::device_vector<int>(k, thrust::no_init);
-  auto values_out = thrust::device_vector<int>(k, thrust::no_init);
-
-  // Specify that we do not require a specific output order and do not require deterministic results
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-  // Prepare CUDA stream
-  cudaStream_t stream = nullptr;
-  cudaStreamCreate(&stream);
-  cuda::stream_ref stream_ref{stream};
-
-  // Create the environment with the stream and requirements
-  auto env = cuda::std::execution::env{stream_ref, requirements};
-
-  // Query temporary storage requirements
-  size_t temp_storage_bytes{};
-  cub::DeviceTopK::MaxPairs(
-    nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env);
-
-  // Allocate temporary storage
-  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
-
-  cub::DeviceTopK::MaxPairs(
-    thrust::raw_pointer_cast(temp_storage.data()),
-    temp_storage_bytes,
-    keys.begin(),
-    keys_out.begin(),
-    values,
-    values_out.begin(),
-    keys.size(),
-    k,
-    env);
-
-  // Get the top-k results into sorted order for easy comparison
-  thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin(), cuda::std::greater<>{});
-  thrust::host_vector<int> expected_keys{8, 7, 6, 5};
-  thrust::host_vector<int> expected_values{4, 3, 7, 0};
-  // example-end topk-max-pairs-non-deterministic-unsorted
-
-  REQUIRE(keys_out == expected_keys);
-  REQUIRE(values_out == expected_values);
-}
-
-// example-begin topk-custom-type
-struct custom_t
-{
-  float f;
-  int unused;
-  long long int lli;
-
-  custom_t() = default;
-  custom_t(float f, long long int lli)
-      : f(f)
-      , unused(42)
-      , lli(lli)
-  {}
-};
-
-struct decomposer_t
-{
-  __host__ __device__ cuda::std::tuple<float&, long long int&> operator()(custom_t& key) const
-  {
-    return {key.f, key.lli};
-  }
-};
-// example-end topk-custom-type
-
-static __host__ std::ostream& operator<<(std::ostream& os, const custom_t& self)
-{
-  return os << "{ " << self.f << ", " << self.lli << " }";
-}
-
-static __host__ __device__ bool operator==(const custom_t& lhs, const custom_t& rhs)
-{
-  return lhs.f == rhs.f && lhs.lli == rhs.lli;
-}
-
-static __host__ __device__ bool operator<(const custom_t& lhs, const custom_t& rhs)
-{
-  return lhs.lli == rhs.lli ? lhs.f < rhs.f : lhs.lli < rhs.lli;
-}
-
-static __host__ __device__ bool operator>(const custom_t& lhs, const custom_t& rhs)
-{
-  return rhs < lhs;
-}
-
-C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]")
-{
-  SECTION("MaxKeys")
-  {
-    // example-begin topk-max-keys-custom-type
-    constexpr int num_items = 6;
-    constexpr int k         = 3;
-
-    thrust::device_vector<custom_t> in = {
-      {+2.5f, 4}, //
-      {-2.5f, 0}, //
-      {+1.1f, 3}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2}, //
-      {+3.7f, 5} //
-    };
-
-    thrust::device_vector<custom_t> out(k);
-
-    const custom_t* d_in = thrust::raw_pointer_cast(in.data());
-    custom_t* d_out      = thrust::raw_pointer_cast(out.data());
-
-    auto requirements = cuda::execution::require(
-      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-    // 1) Get temp storage size
-    std::uint8_t* d_temp_storage{};
-    std::size_t temp_storage_bytes{};
-
-    cub::DeviceTopK::MaxKeys(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
-
-    // 2) Allocate temp storage
-    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
-    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
-
-    // 3) Find the top-k largest keys
-    cub::DeviceTopK::MaxKeys(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
-
-    // Sort output for comparison (output order is not guaranteed)
-    thrust::sort(out.begin(), out.end(), cuda::std::greater<>{});
-    thrust::device_vector<custom_t> expected = {
-      {+3.7f, 5}, //
-      {+2.5f, 4}, //
-      {+1.1f, 3} //
-    };
-    // example-end topk-max-keys-custom-type
-
-    REQUIRE(expected == out);
-  }
-
-  SECTION("MinKeys")
-  {
-    // example-begin topk-min-keys-custom-type
-    constexpr int num_items = 6;
-    constexpr int k         = 3;
-
-    thrust::device_vector<custom_t> in = {
-      {+2.5f, 4}, //
-      {-2.5f, 0}, //
-      {+1.1f, 3}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2}, //
-      {+3.7f, 5} //
-    };
-
-    thrust::device_vector<custom_t> out(k);
-
-    const custom_t* d_in = thrust::raw_pointer_cast(in.data());
-    custom_t* d_out      = thrust::raw_pointer_cast(out.data());
-
-    auto requirements = cuda::execution::require(
-      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-    std::uint8_t* d_temp_storage{};
-    std::size_t temp_storage_bytes{};
-
-    cub::DeviceTopK::MinKeys(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
-
-    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
-    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
-
-    cub::DeviceTopK::MinKeys(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
-
-    // Sort output for comparison (output order is not guaranteed)
-    thrust::sort(out.begin(), out.end());
-    thrust::device_vector<custom_t> expected = {
-      {-2.5f, 0}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2} //
-    };
-    // example-end topk-min-keys-custom-type
-
-    REQUIRE(expected == out);
-  }
-
-  SECTION("MaxPairs")
-  {
-    // example-begin topk-max-pairs-custom-type
-    constexpr int num_items = 6;
-    constexpr int k         = 3;
-
-    thrust::device_vector<custom_t> keys_in = {
-      {+2.5f, 4}, //
-      {-2.5f, 0}, //
-      {+1.1f, 3}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2}, //
-      {+3.7f, 5} //
-    };
-
-    thrust::device_vector<custom_t> keys_out(k);
-
-    const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data());
-    custom_t* d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
-
-    thrust::device_vector<int> vals_in = {0, 1, 2, 3, 4, 5};
-    thrust::device_vector<int> vals_out(k);
-
-    const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data());
-    int* d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
-
-    auto requirements = cuda::execution::require(
-      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-    std::uint8_t* d_temp_storage{};
-    std::size_t temp_storage_bytes{};
-
-    cub::DeviceTopK::MaxPairs(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_keys_in,
-      d_keys_out,
-      d_vals_in,
-      d_vals_out,
-      num_items,
-      k,
-      decomposer_t{},
-      requirements);
-
-    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
-    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
-
-    cub::DeviceTopK::MaxPairs(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_keys_in,
-      d_keys_out,
-      d_vals_in,
-      d_vals_out,
-      num_items,
-      k,
-      decomposer_t{},
-      requirements);
-
-    // Sort by key for comparison (output order is not guaranteed)
-    thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin(), cuda::std::greater<>{});
-
-    thrust::device_vector<custom_t> expected_keys = {
-      {+3.7f, 5}, //
-      {+2.5f, 4}, //
-      {+1.1f, 3} //
-    };
-
-    thrust::device_vector<int> expected_vals = {5, 0, 2};
-    // example-end topk-max-pairs-custom-type
-
-    REQUIRE(expected_keys == keys_out);
-    REQUIRE(expected_vals == vals_out);
-  }
-
-  SECTION("MinPairs")
-  {
-    // example-begin topk-min-pairs-custom-type
-    constexpr int num_items = 6;
-    constexpr int k         = 3;
-
-    thrust::device_vector<custom_t> keys_in = {
-      {+2.5f, 4}, //
-      {-2.5f, 0}, //
-      {+1.1f, 3}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2}, //
-      {+3.7f, 5} //
-    };
-
-    thrust::device_vector<custom_t> keys_out(k);
-
-    const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data());
-    custom_t* d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
-
-    thrust::device_vector<int> vals_in = {0, 1, 2, 3, 4, 5};
-    thrust::device_vector<int> vals_out(k);
-
-    const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data());
-    int* d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
-
-    auto requirements = cuda::execution::require(
-      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-
-    std::uint8_t* d_temp_storage{};
-    std::size_t temp_storage_bytes{};
-
-    cub::DeviceTopK::MinPairs(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_keys_in,
-      d_keys_out,
-      d_vals_in,
-      d_vals_out,
-      num_items,
-      k,
-      decomposer_t{},
-      requirements);
-
-    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
-    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
-
-    cub::DeviceTopK::MinPairs(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_keys_in,
-      d_keys_out,
-      d_vals_in,
-      d_vals_out,
-      num_items,
-      k,
-      decomposer_t{},
-      requirements);
-
-    // Sort by key for comparison (output order is not guaranteed)
-    thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin());
-
-    thrust::device_vector<custom_t> expected_keys = {
-      {-2.5f, 0}, //
-      {+0.0f, 1}, //
-      {-0.0f, 2} //
-    };
-
-    thrust::device_vector<int> expected_vals = {1, 3, 4};
-    // example-end topk-min-pairs-custom-type
-
-    REQUIRE(expected_keys == keys_out);
-    REQUIRE(expected_vals == vals_out);
-  }
-}
-
-// Guard tests: each public DeviceTopK method's legacy temp-storage form must resolve
-// unambiguously when called in its minimal form (no explicit stream, all defaults
-// implicit), even though the env-alloc overloads (which drop the temp-storage args)
-// also live in scope. If the env-overload SFINAE drifts, these become "ambiguous
-// overload" compile errors.
-
-C2H_TEST("DeviceTopK::MaxKeys legacy size-query is unambiguous", "[topk][device]")
-{
-  void* d_temp_storage      = nullptr;
-  size_t temp_storage_bytes = 0;
-  int* d_keys_in            = nullptr;
-  int* d_keys_out           = nullptr;
-  int num_items             = 0;
-  int k                     = 0;
-
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-  REQUIRE(cudaSuccess
-          == cub::DeviceTopK::MaxKeys(
-            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements));
-}
-
-C2H_TEST("DeviceTopK::MinKeys legacy size-query is unambiguous", "[topk][device]")
-{
-  void* d_temp_storage      = nullptr;
-  size_t temp_storage_bytes = 0;
-  int* d_keys_in            = nullptr;
-  int* d_keys_out           = nullptr;
-  int num_items             = 0;
-  int k                     = 0;
-
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-  REQUIRE(cudaSuccess
-          == cub::DeviceTopK::MinKeys(
-            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements));
-}
-
-C2H_TEST("DeviceTopK::MaxPairs legacy size-query is unambiguous", "[topk][device]")
-{
-  void* d_temp_storage      = nullptr;
-  size_t temp_storage_bytes = 0;
-  int* d_keys_in            = nullptr;
-  int* d_keys_out           = nullptr;
-  int* d_values_in          = nullptr;
-  int* d_values_out         = nullptr;
-  int num_items             = 0;
-  int k                     = 0;
-
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-  REQUIRE(cudaSuccess
-          == cub::DeviceTopK::MaxPairs(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_keys_out,
-            d_values_in,
-            d_values_out,
-            num_items,
-            k,
-            requirements));
-}
-
-C2H_TEST("DeviceTopK::MinPairs legacy size-query is unambiguous", "[topk][device]")
-{
-  void* d_temp_storage      = nullptr;
-  size_t temp_storage_bytes = 0;
-  int* d_keys_in            = nullptr;
-  int* d_keys_out           = nullptr;
-  int* d_values_in          = nullptr;
-  int* d_values_out         = nullptr;
-  int num_items             = 0;
-  int k                     = 0;
-
-  auto requirements =
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
-  REQUIRE(cudaSuccess
-          == cub::DeviceTopK::MinPairs(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_keys_out,
-            d_values_in,
-            d_values_out,
-            num_items,
-            k,
-            requirements));
-}

From e199d20c21d739c0ff7a923bb251a220e1d24022 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 23:02:15 -0700
Subject: [PATCH 03/11] revert device_topk_api

---
 cub/test/catch2_test_device_topk_api.cu | 497 ++++++++++++++++++++++++
 1 file changed, 497 insertions(+)
 create mode 100644 cub/test/catch2_test_device_topk_api.cu

diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu
new file mode 100644
index 00000000000..8b7d7a52a70
--- /dev/null
+++ b/cub/test/catch2_test_device_topk_api.cu
@@ -0,0 +1,497 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cub/device/device_topk.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include <cuda/__execution/determinism.h>
+#include <cuda/__execution/output_ordering.h>
+#include <cuda/__execution/require.h>
+#include <cuda/iterator>
+#include <cuda/std/functional>
+#include <cuda/stream>
+
+#include <c2h/catch2_test_helper.h>
+
+C2H_TEST("DeviceTopK::MinKeys API example for non-deterministic, unsorted results", "[device][device_transform]")
+{
+  // example-begin topk-min-keys-non-deterministic-unsorted
+  const int k = 4;
+  auto input  = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
+  auto output = thrust::device_vector<int>(k, thrust::no_init);
+
+  // Specify that we do not require a specific output order and do not require deterministic results
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+  // Prepare CUDA stream
+  cudaStream_t stream = nullptr;
+  cudaStreamCreate(&stream);
+  cuda::stream_ref stream_ref{stream};
+
+  // Create the environment with the stream and requirements
+  auto env = cuda::std::execution::env{stream_ref, requirements};
+
+  // Query temporary storage requirements
+  size_t temp_storage_bytes{};
+  cub::DeviceTopK::MinKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env);
+
+  // Allocate temporary storage
+  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
+
+  cub::DeviceTopK::MinKeys(
+    thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    input.begin(),
+    output.begin(),
+    input.size(),
+    k,
+    env);
+
+  // Get the top-k results into sorted order for easy comparison
+  thrust::sort(output.begin(), output.end());
+  thrust::host_vector<int> expected{-3, 1, 2, 4};
+  // example-end topk-min-keys-non-deterministic-unsorted
+
+  REQUIRE(output == expected);
+}
+
+C2H_TEST("DeviceTopK::MaxKeys API example for non-deterministic, unsorted results", "[device][device_transform]")
+{
+  // example-begin topk-max-keys-non-deterministic-unsorted
+  const int k = 4;
+  auto input  = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
+  auto output = thrust::device_vector<int>(k, thrust::no_init);
+
+  // Specify that we do not require a specific output order and do not require deterministic results
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+  // Prepare CUDA stream
+  cudaStream_t stream = nullptr;
+  cudaStreamCreate(&stream);
+  cuda::stream_ref stream_ref{stream};
+
+  // Create the environment with the stream and requirements
+  auto env = cuda::std::execution::env{stream_ref, requirements};
+
+  // Query temporary storage requirements
+  size_t temp_storage_bytes{};
+  cub::DeviceTopK::MaxKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env);
+
+  // Allocate temporary storage
+  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
+
+  cub::DeviceTopK::MaxKeys(
+    thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    input.begin(),
+    output.begin(),
+    input.size(),
+    k,
+    env);
+
+  // Get the top-k results into sorted order for easy comparison
+  thrust::sort(output.begin(), output.end(), cuda::std::greater{});
+  thrust::host_vector<int> expected{8, 7, 6, 5};
+  // example-end topk-max-keys-non-deterministic-unsorted
+
+  REQUIRE(output == expected);
+}
+
+C2H_TEST("DeviceTopK::MinPairs API example for non-deterministic, unsorted results", "[device][device_transform]")
+{
+  // example-begin topk-min-pairs-non-deterministic-unsorted
+  const int k     = 4;
+  auto keys       = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
+  auto values     = cuda::make_counting_iterator<int>(0);
+  auto keys_out   = thrust::device_vector<int>(k, thrust::no_init);
+  auto values_out = thrust::device_vector<int>(k, thrust::no_init);
+
+  // Specify that we do not require a specific output order and do not require deterministic results
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+  // Prepare CUDA stream
+  cudaStream_t stream = nullptr;
+  cudaStreamCreate(&stream);
+  cuda::stream_ref stream_ref{stream};
+
+  // Create the environment with the stream and requirements
+  auto env = cuda::std::execution::env{stream_ref, requirements};
+
+  // Query temporary storage requirements
+  size_t temp_storage_bytes{};
+  cub::DeviceTopK::MinPairs(
+    nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env);
+
+  // Allocate temporary storage
+  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
+
+  cub::DeviceTopK::MinPairs(
+    thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    keys.begin(),
+    keys_out.begin(),
+    values,
+    values_out.begin(),
+    keys.size(),
+    k,
+    env);
+
+  // Get the top-k results into sorted order for easy comparison
+  thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin());
+  thrust::host_vector<int> expected_keys{-3, 1, 2, 4};
+  thrust::host_vector<int> expected_values{1, 2, 5, 6};
+  // example-end topk-min-pairs-non-deterministic-unsorted
+
+  REQUIRE(keys_out == expected_keys);
+  REQUIRE(values_out == expected_values);
+}
+
+C2H_TEST("DeviceTopK::MaxPairs API example for non-deterministic, unsorted results", "[device][device_transform]")
+{
+  // example-begin topk-max-pairs-non-deterministic-unsorted
+  const int k     = 4;
+  auto keys       = thrust::device_vector<int>{5, -3, 1, 7, 8, 2, 4, 6};
+  auto values     = cuda::make_counting_iterator<int>(0);
+  auto keys_out   = thrust::device_vector<int>(k, thrust::no_init);
+  auto values_out = thrust::device_vector<int>(k, thrust::no_init);
+
+  // Specify that we do not require a specific output order and do not require deterministic results
+  auto requirements =
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+  // Prepare CUDA stream
+  cudaStream_t stream = nullptr;
+  cudaStreamCreate(&stream);
+  cuda::stream_ref stream_ref{stream};
+
+  // Create the environment with the stream and requirements
+  auto env = cuda::std::execution::env{stream_ref, requirements};
+
+  // Query temporary storage requirements
+  size_t temp_storage_bytes{};
+  cub::DeviceTopK::MaxPairs(
+    nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env);
+
+  // Allocate temporary storage
+  thrust::device_vector<char> temp_storage(temp_storage_bytes, thrust::no_init);
+
+  cub::DeviceTopK::MaxPairs(
+    thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    keys.begin(),
+    keys_out.begin(),
+    values,
+    values_out.begin(),
+    keys.size(),
+    k,
+    env);
+
+  // Get the top-k results into sorted order for easy comparison
+  thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin(), cuda::std::greater<>{});
+  thrust::host_vector<int> expected_keys{8, 7, 6, 5};
+  thrust::host_vector<int> expected_values{4, 3, 7, 0};
+  // example-end topk-max-pairs-non-deterministic-unsorted
+
+  REQUIRE(keys_out == expected_keys);
+  REQUIRE(values_out == expected_values);
+}
+
+// example-begin topk-custom-type
+struct custom_t
+{
+  float f;
+  int unused;
+  long long int lli;
+
+  custom_t() = default;
+  custom_t(float f, long long int lli)
+      : f(f)
+      , unused(42)
+      , lli(lli)
+  {}
+};
+
+struct decomposer_t
+{
+  __host__ __device__ cuda::std::tuple<float&, long long int&> operator()(custom_t& key) const
+  {
+    return {key.f, key.lli};
+  }
+};
+// example-end topk-custom-type
+
+static __host__ std::ostream& operator<<(std::ostream& os, const custom_t& self)
+{
+  return os << "{ " << self.f << ", " << self.lli << " }";
+}
+
+static __host__ __device__ bool operator==(const custom_t& lhs, const custom_t& rhs)
+{
+  return lhs.f == rhs.f && lhs.lli == rhs.lli;
+}
+
+static __host__ __device__ bool operator<(const custom_t& lhs, const custom_t& rhs)
+{
+  return lhs.lli == rhs.lli ? lhs.f < rhs.f : lhs.lli < rhs.lli;
+}
+
+static __host__ __device__ bool operator>(const custom_t& lhs, const custom_t& rhs)
+{
+  return rhs < lhs;
+}
+
+C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]")
+{
+  SECTION("MaxKeys")
+  {
+    // example-begin topk-max-keys-custom-type
+    constexpr int num_items = 6;
+    constexpr int k         = 3;
+
+    thrust::device_vector<custom_t> in = {
+      {+2.5f, 4}, //
+      {-2.5f, 0}, //
+      {+1.1f, 3}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2}, //
+      {+3.7f, 5} //
+    };
+
+    thrust::device_vector<custom_t> out(k);
+
+    const custom_t* d_in = thrust::raw_pointer_cast(in.data());
+    custom_t* d_out      = thrust::raw_pointer_cast(out.data());
+
+    auto requirements = cuda::execution::require(
+      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+    // 1) Get temp storage size
+    std::uint8_t* d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceTopK::MaxKeys(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Find the top-k largest keys
+    cub::DeviceTopK::MaxKeys(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
+
+    // Sort output for comparison (output order is not guaranteed)
+    thrust::sort(out.begin(), out.end(), cuda::std::greater<>{});
+    thrust::device_vector<custom_t> expected = {
+      {+3.7f, 5}, //
+      {+2.5f, 4}, //
+      {+1.1f, 3} //
+    };
+    // example-end topk-max-keys-custom-type
+
+    REQUIRE(expected == out);
+  }
+
+  SECTION("MinKeys")
+  {
+    // example-begin topk-min-keys-custom-type
+    constexpr int num_items = 6;
+    constexpr int k         = 3;
+
+    thrust::device_vector<custom_t> in = {
+      {+2.5f, 4}, //
+      {-2.5f, 0}, //
+      {+1.1f, 3}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2}, //
+      {+3.7f, 5} //
+    };
+
+    thrust::device_vector<custom_t> out(k);
+
+    const custom_t* d_in = thrust::raw_pointer_cast(in.data());
+    custom_t* d_out      = thrust::raw_pointer_cast(out.data());
+
+    auto requirements = cuda::execution::require(
+      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+    std::uint8_t* d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceTopK::MinKeys(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceTopK::MinKeys(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements);
+
+    // Sort output for comparison (output order is not guaranteed)
+    thrust::sort(out.begin(), out.end());
+    thrust::device_vector<custom_t> expected = {
+      {-2.5f, 0}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2} //
+    };
+    // example-end topk-min-keys-custom-type
+
+    REQUIRE(expected == out);
+  }
+
+  SECTION("MaxPairs")
+  {
+    // example-begin topk-max-pairs-custom-type
+    constexpr int num_items = 6;
+    constexpr int k         = 3;
+
+    thrust::device_vector<custom_t> keys_in = {
+      {+2.5f, 4}, //
+      {-2.5f, 0}, //
+      {+1.1f, 3}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2}, //
+      {+3.7f, 5} //
+    };
+
+    thrust::device_vector<custom_t> keys_out(k);
+
+    const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t* d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+
+    thrust::device_vector<int> vals_in = {0, 1, 2, 3, 4, 5};
+    thrust::device_vector<int> vals_out(k);
+
+    const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data());
+    int* d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
+
+    auto requirements = cuda::execution::require(
+      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+    std::uint8_t* d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceTopK::MaxPairs(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_keys_out,
+      d_vals_in,
+      d_vals_out,
+      num_items,
+      k,
+      decomposer_t{},
+      requirements);
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceTopK::MaxPairs(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_keys_out,
+      d_vals_in,
+      d_vals_out,
+      num_items,
+      k,
+      decomposer_t{},
+      requirements);
+
+    // Sort by key for comparison (output order is not guaranteed)
+    thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin(), cuda::std::greater<>{});
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {+3.7f, 5}, //
+      {+2.5f, 4}, //
+      {+1.1f, 3} //
+    };
+
+    thrust::device_vector<int> expected_vals = {5, 0, 2};
+    // example-end topk-max-pairs-custom-type
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+
+  SECTION("MinPairs")
+  {
+    // example-begin topk-min-pairs-custom-type
+    constexpr int num_items = 6;
+    constexpr int k         = 3;
+
+    thrust::device_vector<custom_t> keys_in = {
+      {+2.5f, 4}, //
+      {-2.5f, 0}, //
+      {+1.1f, 3}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2}, //
+      {+3.7f, 5} //
+    };
+
+    thrust::device_vector<custom_t> keys_out(k);
+
+    const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t* d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+
+    thrust::device_vector<int> vals_in = {0, 1, 2, 3, 4, 5};
+    thrust::device_vector<int> vals_out(k);
+
+    const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data());
+    int* d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
+
+    auto requirements = cuda::execution::require(
+      cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);
+
+    std::uint8_t* d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceTopK::MinPairs(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_keys_out,
+      d_vals_in,
+      d_vals_out,
+      num_items,
+      k,
+      decomposer_t{},
+      requirements);
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceTopK::MinPairs(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_keys_out,
+      d_vals_in,
+      d_vals_out,
+      num_items,
+      k,
+      decomposer_t{},
+      requirements);
+
+    // Sort by key for comparison (output order is not guaranteed)
+    thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin());
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {-2.5f, 0}, //
+      {+0.0f, 1}, //
+      {-0.0f, 2} //
+    };
+
+    thrust::device_vector<int> expected_vals = {1, 3, 4};
+    // example-end topk-min-pairs-custom-type
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+}

From 59755df01832a7d0839d60199d674038e276ee79 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 23:05:22 -0700
Subject: [PATCH 04/11] add unit test

---
 cub/test/catch2_test_device_topk_env.cu | 142 ++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu
index 296eb603908..3b9fd3b31bd 100644
--- a/cub/test/catch2_test_device_topk_env.cu
+++ b/cub/test/catch2_test_device_topk_env.cu
@@ -11,6 +11,7 @@ struct stream_registry_factory_t;
 
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 #include <thrust/sort.h>
 
 #include <cuda/__execution/determinism.h>
@@ -20,6 +21,8 @@ struct stream_registry_factory_t;
 #include <cuda/iterator>
 #include <cuda/std/functional>
 
+#include <algorithm>
+
 #include "catch2_test_env_launch_helper.h"
 
 // %PARAM% TEST_LAUNCH lid 0:2
@@ -184,4 +187,143 @@ C2H_TEST("DeviceTopK::MinPairs can be tuned", "[topk][device]", block_sizes)
   REQUIRE(d_block_size[0] == target_block_size);
 }
 
+namespace
+{
+template <typename T>
+thrust::host_vector<T> sorted_top_k(const thrust::host_vector<T>& h_in, int k, bool largest)
+{
+  auto sorted = h_in;
+  if (largest)
+  {
+    std::sort(sorted.begin(), sorted.end(), cuda::std::greater<T>{});
+  }
+  else
+  {
+    std::sort(sorted.begin(), sorted.end());
+  }
+  sorted.resize(static_cast<size_t>(k));
+  return sorted;
+}
+} // namespace
+
+using topk_element_types = c2h::type_list<int8_t, int16_t, int32_t, uint32_t, int64_t, float, double>;
+
+C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]", topk_element_types)
+{
+  using T = c2h::get<0, TestType>;
+
+  const int num_items = 256;
+  thrust::host_vector<T> h_in(num_items);
+  for (int i = 0; i < num_items; ++i)
+  {
+    h_in[i] = static_cast<T>((i * 1664525 + 1013904223) % 251);
+  }
+  thrust::device_vector<T> d_in    = h_in;
+  thrust::device_vector<T> d_out_k = thrust::device_vector<T>(8);
+
+  auto env = topk_requirements();
+  REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env));
+
+  thrust::host_vector<T> h_out = d_out_k;
+  std::sort(h_out.begin(), h_out.end(), cuda::std::greater<T>{});
+
+  auto expected = sorted_top_k(h_in, 8, /*largest*/ true);
+  REQUIRE(h_out == expected);
+}
+
+C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]", topk_element_types)
+{
+  using T = c2h::get<0, TestType>;
+
+  const int num_items = 256;
+  thrust::host_vector<T> h_in(num_items);
+  for (int i = 0; i < num_items; ++i)
+  {
+    h_in[i] = static_cast<T>((i * 1664525 + 1013904223) % 251);
+  }
+  thrust::device_vector<T> d_in    = h_in;
+  thrust::device_vector<T> d_out_k = thrust::device_vector<T>(8);
+
+  auto env = topk_requirements();
+  REQUIRE(cudaSuccess == cub::DeviceTopK::MinKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env));
+
+  thrust::host_vector<T> h_out = d_out_k;
+  std::sort(h_out.begin(), h_out.end());
+
+  auto expected = sorted_top_k(h_in, 8, /*largest*/ false);
+  REQUIRE(h_out == expected);
+}
+
+C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", topk_element_types)
+{
+  using KeyT = c2h::get<0, TestType>;
+
+  const int num_items = 256;
+  thrust::host_vector<KeyT> h_keys_in(num_items);
+  thrust::host_vector<int> h_values_in(num_items);
+  for (int i = 0; i < num_items; ++i)
+  {
+    h_keys_in[i]   = static_cast<KeyT>((i * 1664525 + 1013904223) % 251);
+    h_values_in[i] = i;
+  }
+  thrust::device_vector<KeyT> d_keys_in   = h_keys_in;
+  thrust::device_vector<int> d_values_in  = h_values_in;
+  thrust::device_vector<KeyT> d_keys_out  = thrust::device_vector<KeyT>(8);
+  thrust::device_vector<int> d_values_out = thrust::device_vector<int>(8);
+
+  auto env = topk_requirements();
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MaxPairs(
+            d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
+
+  thrust::host_vector<KeyT> h_keys_out = d_keys_out;
+  std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater<KeyT>{});
+
+  auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true);
+  REQUIRE(h_keys_out == expected);
+}
+
+C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]", topk_element_types)
+{
+  using KeyT = c2h::get<0, TestType>;
+
+  const int num_items = 256;
+  thrust::host_vector<KeyT> h_keys_in(num_items);
+  thrust::host_vector<int> h_values_in(num_items);
+  for (int i = 0; i < num_items; ++i)
+  {
+    h_keys_in[i]   = static_cast<KeyT>((i * 1664525 + 1013904223) % 251);
+    h_values_in[i] = i;
+  }
+  thrust::device_vector<KeyT> d_keys_in   = h_keys_in;
+  thrust::device_vector<int> d_values_in  = h_values_in;
+  thrust::device_vector<KeyT> d_keys_out  = thrust::device_vector<KeyT>(8);
+  thrust::device_vector<int> d_values_out = thrust::device_vector<int>(8);
+
+  auto env = topk_requirements();
+  REQUIRE(cudaSuccess
+          == cub::DeviceTopK::MinPairs(
+            d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
+
+  thrust::host_vector<KeyT> h_keys_out = d_keys_out;
+  std::sort(h_keys_out.begin(), h_keys_out.end());
+
+  auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ false);
+  REQUIRE(h_keys_out == expected);
+}
+
+C2H_TEST("DeviceTopK::MaxKeys env-alloc handles K equal to num_items", "[topk][env]")
+{
+  thrust::device_vector<int> d_in{5, 2, 9, 1, 7};
+  thrust::device_vector<int> d_out(d_in.size());
+
+  auto env = topk_requirements();
+  REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), 5, 5, env));
+
+  thrust::host_vector<int> h_out = d_out;
+  std::sort(h_out.begin(), h_out.end(), cuda::std::greater<int>{});
+  thrust::host_vector<int> expected{9, 7, 5, 2, 1};
+  REQUIRE(h_out == expected);
+}
+
 #endif // TEST_LAUNCH != 1

From 1637e52d0e6c604d6c83fba3f0599ff4ba498cd4 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 23:06:23 -0700
Subject: [PATCH 05/11] pre commit

---
 cub/cub/device/device_topk.cuh | 180 +++++++++++++++++----------------
 1 file changed, 94 insertions(+), 86 deletions(-)

diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh
index 1deca561e8d..39476b584af 100644
--- a/cub/cub/device/device_topk.cuh
+++ b/cub/cub/device/device_topk.cuh
@@ -388,19 +388,20 @@ struct DeviceTopK
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::max>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        d_values_in,
-        d_values_out,
-        num_items,
-        k,
-        detail::identity_decomposer_t{},
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::max>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          d_values_in,
+          d_values_out,
+          num_items,
+          k,
+          detail::identity_decomposer_t{},
+          env);
+      });
   }
 
   //! @rst
@@ -650,10 +651,11 @@ struct DeviceTopK
                   "Custom decomposers are not supported for fundamental types; "
                   "use the non-decomposer API overload instead");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::max>(
-        storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::max>(
+          storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
+      });
   }
 
   //! @rst
@@ -856,19 +858,20 @@ struct DeviceTopK
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::min>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        d_values_in,
-        d_values_out,
-        num_items,
-        k,
-        detail::identity_decomposer_t{},
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::min>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          d_values_in,
+          d_values_out,
+          num_items,
+          k,
+          detail::identity_decomposer_t{},
+          env);
+      });
   }
 
   //! @rst
@@ -1113,10 +1116,11 @@ struct DeviceTopK
                   "Custom decomposers are not supported for fundamental types; "
                   "use the non-decomposer API overload instead");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::min>(
-        storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::min>(
+          storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env);
+      });
   }
 
   //! @rst
@@ -1282,19 +1286,20 @@ struct DeviceTopK
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::max>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        static_cast<NullType*>(nullptr),
-        static_cast<NullType*>(nullptr),
-        num_items,
-        k,
-        detail::identity_decomposer_t{},
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::max>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          static_cast<NullType*>(nullptr),
+          static_cast<NullType*>(nullptr),
+          num_items,
+          k,
+          detail::identity_decomposer_t{},
+          env);
+      });
   }
 
   //! @rst
@@ -1506,19 +1511,20 @@ struct DeviceTopK
                   "Custom decomposers are not supported for fundamental types; "
                   "use the non-decomposer API overload instead");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::max>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        static_cast<NullType*>(nullptr),
-        static_cast<NullType*>(nullptr),
-        num_items,
-        k,
-        decomposer,
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::max>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          static_cast<NullType*>(nullptr),
+          static_cast<NullType*>(nullptr),
+          num_items,
+          k,
+          decomposer,
+          env);
+      });
   }
 
   //! @rst
@@ -1684,19 +1690,20 @@ struct DeviceTopK
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::min>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        static_cast<NullType*>(nullptr),
-        static_cast<NullType*>(nullptr),
-        num_items,
-        k,
-        detail::identity_decomposer_t{},
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::min>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          static_cast<NullType*>(nullptr),
+          static_cast<NullType*>(nullptr),
+          num_items,
+          k,
+          detail::identity_decomposer_t{},
+          env);
+      });
   }
 
   //! @rst
@@ -1908,19 +1915,20 @@ struct DeviceTopK
                   "Custom decomposers are not supported for fundamental types; "
                   "use the non-decomposer API overload instead");
 
-    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
-      return detail::dispatch_topk<detail::topk::select::min>(
-        storage,
-        bytes,
-        d_keys_in,
-        d_keys_out,
-        static_cast<NullType*>(nullptr),
-        static_cast<NullType*>(nullptr),
-        num_items,
-        k,
-        decomposer,
-        env);
-    });
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) {
+        return detail::dispatch_topk<detail::topk::select::min>(
+          storage,
+          bytes,
+          d_keys_in,
+          d_keys_out,
+          static_cast<NullType*>(nullptr),
+          static_cast<NullType*>(nullptr),
+          num_items,
+          k,
+          decomposer,
+          env);
+      });
   }
 };
 

From 776655d635d09f80b54bf599298f613e52c5de91 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 13 May 2026 23:19:03 -0700
Subject: [PATCH 06/11] add tests for key-value pairing

---
 cub/test/catch2_test_device_topk_env.cu | 26 +++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu
index 3b9fd3b31bd..8cf410d716e 100644
--- a/cub/test/catch2_test_device_topk_env.cu
+++ b/cub/test/catch2_test_device_topk_env.cu
@@ -276,7 +276,18 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]",
           == cub::DeviceTopK::MaxPairs(
             d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
 
-  thrust::host_vector<KeyT> h_keys_out = d_keys_out;
+  thrust::host_vector<KeyT> h_keys_out  = d_keys_out;
+  thrust::host_vector<int> h_values_out = d_values_out;
+
+  // Verify pair association: each returned value indexes back to the corresponding key
+  // (recall h_values_in[i] = i, so value-out is the original input position of the key-out)
+  for (size_t i = 0; i < h_keys_out.size(); ++i)
+  {
+    REQUIRE(h_values_out[i] >= 0);
+    REQUIRE(h_values_out[i] < num_items);
+    REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]);
+  }
+
   std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater<KeyT>{});
 
   auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true);
@@ -305,7 +316,18 @@ C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]
           == cub::DeviceTopK::MinPairs(
             d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
 
-  thrust::host_vector<KeyT> h_keys_out = d_keys_out;
+  thrust::host_vector<KeyT> h_keys_out  = d_keys_out;
+  thrust::host_vector<int> h_values_out = d_values_out;
+
+  // Verify pair association: each returned value indexes back to the corresponding key
+  // (recall h_values_in[i] = i, so value-out is the original input position of the key-out)
+  for (size_t i = 0; i < h_keys_out.size(); ++i)
+  {
+    REQUIRE(h_values_out[i] >= 0);
+    REQUIRE(h_values_out[i] < num_items);
+    REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]);
+  }
+
   std::sort(h_keys_out.begin(), h_keys_out.end());
 
   auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ false);

From b0a851953a4f087fb3eba9d4f80a315c7d6c99a6 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Thu, 14 May 2026 17:06:03 -0700
Subject: [PATCH 07/11] match docs

---
 cub/cub/device/device_topk.cuh | 123 ++++++++++++++++++++++-----------
 1 file changed, 82 insertions(+), 41 deletions(-)

diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh
index 39476b584af..7cd59efea17 100644
--- a/cub/cub/device/device_topk.cuh
+++ b/cub/cub/device/device_topk.cuh
@@ -600,25 +600,28 @@ struct DeviceTopK
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] d_values_in
-  //!   Pointer to the input values
+  //!   Random-access iterator to the input sequence containing the values associated to each key
   //!
   //! @param[out] d_values_out
-  //!   Pointer to the K output values
+  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
+  //!   written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] decomposer
-  //!   Decomposer object for interpreting user-defined key types
+  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
+  //!   types.
   //!
   //! @param[in] env
   //!   @rst
@@ -773,6 +776,10 @@ struct DeviceTopK
   //! @rst
   //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -817,22 +824,24 @@ struct DeviceTopK
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] d_values_in
-  //!   Pointer to the input values
+  //!   Random-access iterator to the input sequence containing the values associated to each key
   //!
   //! @param[out] d_values_out
-  //!   Pointer to the K output values
+  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
+  //!   written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] env
   //!   @rst
@@ -1018,6 +1027,10 @@ struct DeviceTopK
   //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs,
   //! using a decomposer to interpret user-defined key types.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -1059,31 +1072,35 @@ struct DeviceTopK
   //!  The integral type of variable k
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of decomposer
+  //!   **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its
+  //!   constituent arithmetic types.
   //!
   //! @tparam EnvT
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] d_values_in
-  //!   Pointer to the input values
+  //!   Random-access iterator to the input sequence containing the values associated to each key
   //!
   //! @param[out] d_values_out
-  //!   Pointer to the K output values
+  //!   Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be
+  //!   written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in` and `d_values_in` each
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] decomposer
-  //!   Decomposer object for interpreting user-defined key types
+  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
+  //!   types.
   //!
   //! @param[in] env
   //!   @rst
@@ -1221,6 +1238,10 @@ struct DeviceTopK
   //! @rst
   //! Finds the largest K keys from an unordered input sequence.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -1259,16 +1280,17 @@ struct DeviceTopK
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in`
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] env
   //!   @rst
@@ -1429,6 +1451,10 @@ struct DeviceTopK
   //! Finds the largest K keys from an unordered input sequence,
   //! using a decomposer to interpret user-defined key types.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -1464,25 +1490,28 @@ struct DeviceTopK
   //!  The integral type of variable k
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of decomposer
+  //!   **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its
+  //!   constituent arithmetic types.
   //!
   //! @tparam EnvT
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in`
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] decomposer
-  //!   Decomposer object for interpreting user-defined key types
+  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
+  //!   types.
   //!
   //! @param[in] env
   //!   @rst
@@ -1625,6 +1654,10 @@ struct DeviceTopK
   //! @rst
   //! Finds the smallest K keys from an unordered input sequence.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -1663,16 +1696,17 @@ struct DeviceTopK
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in`
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] env
   //!   @rst
@@ -1833,6 +1867,10 @@ struct DeviceTopK
   //! Finds the smallest K keys from an unordered input sequence,
   //! using a decomposer to interpret user-defined key types.
   //!
+  //! .. note::
+  //!
+  //!    The behavior is undefined if the input and output ranges overlap in any way.
+  //!
   //! .. versionadded:: 3.5.0
   //!    First appears in CUDA Toolkit 13.5.
   //!
@@ -1868,25 +1906,28 @@ struct DeviceTopK
   //!  The integral type of variable k
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of decomposer
+  //!   **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its
+  //!   constituent arithmetic types.
   //!
   //! @tparam EnvT
   //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
   //!
   //! @param[in] d_keys_in
-  //!   Pointer to the input keys
+  //!   Random-access iterator to the input sequence containing the keys
   //!
   //! @param[out] d_keys_out
-  //!   Pointer to the K output keys
+  //!   Random-access iterator to the output sequence of keys, where K values will be written to
   //!
   //! @param[in] num_items
-  //!   Number of input items
+  //!   Number of items to be read and processed from `d_keys_in`
   //!
   //! @param[in] k
-  //!   The K value
+  //!   The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of
+  //!   `num_items`.
   //!
   //! @param[in] decomposer
-  //!   Decomposer object for interpreting user-defined key types
+  //!   Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic
+  //!   types.
   //!
   //! @param[in] env
   //!   @rst

From e3aeaf0f1b576100a51e7063c27bba6d1af79928 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Mon, 18 May 2026 17:52:53 -0700
Subject: [PATCH 08/11] move enable_ifs from return into the template head

---
 cub/cub/device/device_topk.cuh | 132 +++++++++++++++++----------------
 1 file changed, 68 insertions(+), 64 deletions(-)

diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh
index 7cd59efea17..6cefad73a27 100644
--- a/cub/cub/device/device_topk.cuh
+++ b/cub/cub/device/device_topk.cuh
@@ -627,25 +627,26 @@ struct DeviceTopK
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
-  template <typename KeyInputIteratorT,
-            typename KeyOutputIteratorT,
-            typename ValueInputIteratorT,
-            typename ValueOutputIteratorT,
-            typename NumItemsT,
-            typename NumOutItemsT,
-            typename DecomposerT,
-            typename EnvT = ::cuda::std::execution::env<>>
-  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename ValueInputIteratorT,
+    typename ValueOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename DecomposerT,
+    typename EnvT                 = ::cuda::std::execution::env<>,
     ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
-                             cudaError_t>
-    MaxPairs(KeyInputIteratorT d_keys_in,
-             KeyOutputIteratorT d_keys_out,
-             ValueInputIteratorT d_values_in,
-             ValueOutputIteratorT d_values_out,
-             NumItemsT num_items,
-             NumOutItemsT k,
-             DecomposerT decomposer,
-             EnvT env = {})
+                             int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    ValueInputIteratorT d_values_in,
+    ValueOutputIteratorT d_values_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    DecomposerT decomposer,
+    EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs");
     using key_t = detail::it_value_t<KeyInputIteratorT>;
@@ -1106,25 +1107,26 @@ struct DeviceTopK
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
-  template <typename KeyInputIteratorT,
-            typename KeyOutputIteratorT,
-            typename ValueInputIteratorT,
-            typename ValueOutputIteratorT,
-            typename NumItemsT,
-            typename NumOutItemsT,
-            typename DecomposerT,
-            typename EnvT = ::cuda::std::execution::env<>>
-  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename ValueInputIteratorT,
+    typename ValueOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename DecomposerT,
+    typename EnvT                 = ::cuda::std::execution::env<>,
     ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
-                             cudaError_t>
-    MinPairs(KeyInputIteratorT d_keys_in,
-             KeyOutputIteratorT d_keys_out,
-             ValueInputIteratorT d_values_in,
-             ValueOutputIteratorT d_values_out,
-             NumItemsT num_items,
-             NumOutItemsT k,
-             DecomposerT decomposer,
-             EnvT env = {})
+                             int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    ValueInputIteratorT d_values_in,
+    ValueOutputIteratorT d_values_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    DecomposerT decomposer,
+    EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs");
     using key_t = detail::it_value_t<KeyInputIteratorT>;
@@ -1517,21 +1519,22 @@ struct DeviceTopK
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
-  template <typename KeyInputIteratorT,
-            typename KeyOutputIteratorT,
-            typename NumItemsT,
-            typename NumOutItemsT,
-            typename DecomposerT,
-            typename EnvT = ::cuda::std::execution::env<>>
-  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename DecomposerT,
+    typename EnvT                 = ::cuda::std::execution::env<>,
     ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
-                             cudaError_t>
-    MaxKeys(KeyInputIteratorT d_keys_in,
-            KeyOutputIteratorT d_keys_out,
-            NumItemsT num_items,
-            NumOutItemsT k,
-            DecomposerT decomposer,
-            EnvT env = {})
+                             int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    DecomposerT decomposer,
+    EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys");
     using key_t = detail::it_value_t<KeyInputIteratorT>;
@@ -1933,21 +1936,22 @@ struct DeviceTopK
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
-  template <typename KeyInputIteratorT,
-            typename KeyOutputIteratorT,
-            typename NumItemsT,
-            typename NumOutItemsT,
-            typename DecomposerT,
-            typename EnvT = ::cuda::std::execution::env<>>
-  [[nodiscard]] CUB_RUNTIME_FUNCTION static //
+  template <
+    typename KeyInputIteratorT,
+    typename KeyOutputIteratorT,
+    typename NumItemsT,
+    typename NumOutItemsT,
+    typename DecomposerT,
+    typename EnvT                 = ::cuda::std::execution::env<>,
     ::cuda::std::enable_if_t<detail::radix::is_valid_decomposer<detail::it_value_t<KeyInputIteratorT>, DecomposerT>,
-                             cudaError_t>
-    MinKeys(KeyInputIteratorT d_keys_in,
-            KeyOutputIteratorT d_keys_out,
-            NumItemsT num_items,
-            NumOutItemsT k,
-            DecomposerT decomposer,
-            EnvT env = {})
+                             int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys(
+    KeyInputIteratorT d_keys_in,
+    KeyOutputIteratorT d_keys_out,
+    NumItemsT num_items,
+    NumOutItemsT k,
+    DecomposerT decomposer,
+    EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys");
     using key_t = detail::it_value_t<KeyInputIteratorT>;

From 1ddd14a99f2298cf8b79373d62a13f249d27fa60 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Mon, 18 May 2026 17:54:06 -0700
Subject: [PATCH 09/11] move result vectors into visible block

---
 cub/test/catch2_test_device_topk_env_api.cu | 32 ++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu
index 6d5964a5f2b..d1b67a7db4f 100644
--- a/cub/test/catch2_test_device_topk_env_api.cu
+++ b/cub/test/catch2_test_device_topk_env_api.cu
@@ -60,6 +60,7 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]")
   {
     std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n';
   }
+  thrust::device_vector<int> expected{9, 8, 7}; // possibly in different order
   // example-end topk-max-keys-env
 
   stream.sync();
@@ -67,7 +68,6 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]")
 
   // Result order is unspecified for TopK; sort before compare.
   thrust::sort(d_out.begin(), d_out.end(), cuda::std::greater<int>{});
-  thrust::device_vector<int> expected{9, 8, 7};
   REQUIRE(d_out == expected);
 }
 
@@ -89,13 +89,13 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]")
   {
     std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n';
   }
+  thrust::device_vector<int> expected{0, 1, 2}; // possibly in different order
   // example-end topk-min-keys-env
 
   stream.sync();
   REQUIRE(error == cudaSuccess);
 
   thrust::sort(d_out.begin(), d_out.end());
-  thrust::device_vector<int> expected{0, 1, 2};
   REQUIRE(d_out == expected);
 }
 
@@ -126,13 +126,13 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]"
   {
     std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n';
   }
+  thrust::device_vector<int> expected_keys{9, 8, 7}; // possibly in different order
   // example-end topk-max-pairs-env
 
   stream.sync();
   REQUIRE(error == cudaSuccess);
 
   thrust::sort(d_keys_out.begin(), d_keys_out.end(), cuda::std::greater<int>{});
-  thrust::device_vector<int> expected_keys{9, 8, 7};
   REQUIRE(d_keys_out == expected_keys);
 }
 
@@ -163,13 +163,13 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]"
   {
     std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n';
   }
+  thrust::device_vector<int> expected_keys{0, 1, 2}; // possibly in different order
   // example-end topk-min-pairs-env
 
   stream.sync();
   REQUIRE(error == cudaSuccess);
 
   thrust::sort(d_keys_out.begin(), d_keys_out.end());
-  thrust::device_vector<int> expected_keys{0, 1, 2};
   REQUIRE(d_keys_out == expected_keys);
 }
 
@@ -194,6 +194,7 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref"
   {
     std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n';
   }
+  thrust::host_vector<int> expected_ranks{9, 8, 7}; // possibly in different order
   // example-end topk-max-keys-decomposer-env
 
   stream.sync();
@@ -203,9 +204,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref"
   std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
     return a.rank > b.rank;
   });
-  REQUIRE(h_out[0].rank == 9);
-  REQUIRE(h_out[1].rank == 8);
-  REQUIRE(h_out[2].rank == 7);
+  thrust::host_vector<int> actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank};
+  REQUIRE(actual_ranks == expected_ranks);
 }
 
 C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref", "[topk][env]")
@@ -229,6 +229,7 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref"
   {
     std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n';
   }
+  thrust::host_vector<int> expected_ranks{0, 1, 2}; // possibly in different order
   // example-end topk-min-keys-decomposer-env
 
   stream.sync();
@@ -238,9 +239,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref"
   std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
     return a.rank < b.rank;
   });
-  REQUIRE(h_out[0].rank == 0);
-  REQUIRE(h_out[1].rank == 1);
-  REQUIRE(h_out[2].rank == 2);
+  thrust::host_vector<int> actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank};
+  REQUIRE(actual_ranks == expected_ranks);
 }
 
 C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref", "[topk][env]")
@@ -273,6 +273,7 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref
   {
     std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n';
   }
+  thrust::host_vector<int> expected_ranks{9, 8, 7}; // possibly in different order
   // example-end topk-max-pairs-decomposer-env
 
   stream.sync();
@@ -282,9 +283,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref
   std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
     return a.rank > b.rank;
   });
-  REQUIRE(h_keys_out[0].rank == 9);
-  REQUIRE(h_keys_out[1].rank == 8);
-  REQUIRE(h_keys_out[2].rank == 7);
+  thrust::host_vector<int> actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank};
+  REQUIRE(actual_ranks == expected_ranks);
 }
 
 C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref", "[topk][env]")
@@ -317,6 +317,7 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref
   {
     std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n';
   }
+  thrust::host_vector<int> expected_ranks{0, 1, 2}; // possibly in different order
   // example-end topk-min-pairs-decomposer-env
 
   stream.sync();
@@ -326,7 +327,6 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref
   std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) {
     return a.rank < b.rank;
   });
-  REQUIRE(h_keys_out[0].rank == 0);
-  REQUIRE(h_keys_out[1].rank == 1);
-  REQUIRE(h_keys_out[2].rank == 2);
+  thrust::host_vector<int> actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank};
+  REQUIRE(actual_ranks == expected_ranks);
 }

From 9d192aa6d139a27ffd15f0444bcaa410c8826a30 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Mon, 18 May 2026 17:55:34 -0700
Subject: [PATCH 10/11] use c2h vectors in unit test, not thrust::vectors

---
 cub/test/catch2_test_device_topk_env.cu | 92 +++++++++++--------------
 1 file changed, 40 insertions(+), 52 deletions(-)

diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu
index 8cf410d716e..ef97714d796 100644
--- a/cub/test/catch2_test_device_topk_env.cu
+++ b/cub/test/catch2_test_device_topk_env.cu
@@ -11,7 +11,6 @@ struct stream_registry_factory_t;
 
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
 #include <thrust/sort.h>
 
 #include <cuda/__execution/determinism.h>
@@ -189,13 +188,12 @@ C2H_TEST("DeviceTopK::MinPairs can be tuned", "[topk][device]", block_sizes)
 
 namespace
 {
-template <typename T>
-thrust::host_vector<T> sorted_top_k(const thrust::host_vector<T>& h_in, int k, bool largest)
+c2h::host_vector<int> sorted_top_k(const c2h::host_vector<int>& h_in, int k, bool largest)
 {
   auto sorted = h_in;
   if (largest)
   {
-    std::sort(sorted.begin(), sorted.end(), cuda::std::greater<T>{});
+    std::sort(sorted.begin(), sorted.end(), cuda::std::greater<int>{});
   }
   else
   {
@@ -206,78 +204,70 @@ thrust::host_vector<T> sorted_top_k(const thrust::host_vector<T>& h_in, int k, b
 }
 } // namespace
 
-using topk_element_types = c2h::type_list<int8_t, int16_t, int32_t, uint32_t, int64_t, float, double>;
-
-C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]", topk_element_types)
+C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]")
 {
-  using T = c2h::get<0, TestType>;
-
   const int num_items = 256;
-  thrust::host_vector<T> h_in(num_items);
+  c2h::host_vector<int> h_in(num_items);
   for (int i = 0; i < num_items; ++i)
   {
-    h_in[i] = static_cast<T>((i * 1664525 + 1013904223) % 251);
+    h_in[i] = static_cast<int>((i * 1664525 + 1013904223) % 251);
   }
-  thrust::device_vector<T> d_in    = h_in;
-  thrust::device_vector<T> d_out_k = thrust::device_vector<T>(8);
+  c2h::device_vector<int> d_in    = h_in;
+  c2h::device_vector<int> d_out_k = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
   REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env));
 
-  thrust::host_vector<T> h_out = d_out_k;
-  std::sort(h_out.begin(), h_out.end(), cuda::std::greater<T>{});
+  c2h::host_vector<int> h_out = d_out_k;
+  std::sort(h_out.begin(), h_out.end(), cuda::std::greater<int>{});
 
   auto expected = sorted_top_k(h_in, 8, /*largest*/ true);
   REQUIRE(h_out == expected);
 }
 
-C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]", topk_element_types)
+C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]")
 {
-  using T = c2h::get<0, TestType>;
-
   const int num_items = 256;
-  thrust::host_vector<T> h_in(num_items);
+  c2h::host_vector<int> h_in(num_items);
   for (int i = 0; i < num_items; ++i)
   {
-    h_in[i] = static_cast<T>((i * 1664525 + 1013904223) % 251);
+    h_in[i] = static_cast<int>((i * 1664525 + 1013904223) % 251);
   }
-  thrust::device_vector<T> d_in    = h_in;
-  thrust::device_vector<T> d_out_k = thrust::device_vector<T>(8);
+  c2h::device_vector<int> d_in    = h_in;
+  c2h::device_vector<int> d_out_k = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
   REQUIRE(cudaSuccess == cub::DeviceTopK::MinKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env));
 
-  thrust::host_vector<T> h_out = d_out_k;
+  c2h::host_vector<int> h_out = d_out_k;
   std::sort(h_out.begin(), h_out.end());
 
   auto expected = sorted_top_k(h_in, 8, /*largest*/ false);
   REQUIRE(h_out == expected);
 }
 
-C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", topk_element_types)
+C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]")
 {
-  using KeyT = c2h::get<0, TestType>;
-
   const int num_items = 256;
-  thrust::host_vector<KeyT> h_keys_in(num_items);
-  thrust::host_vector<int> h_values_in(num_items);
+  c2h::host_vector<int> h_keys_in(num_items);
+  c2h::host_vector<int> h_values_in(num_items);
   for (int i = 0; i < num_items; ++i)
   {
-    h_keys_in[i]   = static_cast<KeyT>((i * 1664525 + 1013904223) % 251);
+    h_keys_in[i]   = static_cast<int>((i * 1664525 + 1013904223) % 251);
     h_values_in[i] = i;
   }
-  thrust::device_vector<KeyT> d_keys_in   = h_keys_in;
-  thrust::device_vector<int> d_values_in  = h_values_in;
-  thrust::device_vector<KeyT> d_keys_out  = thrust::device_vector<KeyT>(8);
-  thrust::device_vector<int> d_values_out = thrust::device_vector<int>(8);
+  c2h::device_vector<int> d_keys_in    = h_keys_in;
+  c2h::device_vector<int> d_values_in  = h_values_in;
+  c2h::device_vector<int> d_keys_out   = c2h::device_vector<int>(8);
+  c2h::device_vector<int> d_values_out = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
   REQUIRE(cudaSuccess
           == cub::DeviceTopK::MaxPairs(
             d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
 
-  thrust::host_vector<KeyT> h_keys_out  = d_keys_out;
-  thrust::host_vector<int> h_values_out = d_values_out;
+  c2h::host_vector<int> h_keys_out   = d_keys_out;
+  c2h::host_vector<int> h_values_out = d_values_out;
 
   // Verify pair association: each returned value indexes back to the corresponding key
   // (recall h_values_in[i] = i, so value-out is the original input position of the key-out)
@@ -288,36 +278,34 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]",
     REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]);
   }
 
-  std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater<KeyT>{});
+  std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater<int>{});
 
   auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true);
   REQUIRE(h_keys_out == expected);
 }
 
-C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]", topk_element_types)
+C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]")
 {
-  using KeyT = c2h::get<0, TestType>;
-
   const int num_items = 256;
-  thrust::host_vector<KeyT> h_keys_in(num_items);
-  thrust::host_vector<int> h_values_in(num_items);
+  c2h::host_vector<int> h_keys_in(num_items);
+  c2h::host_vector<int> h_values_in(num_items);
   for (int i = 0; i < num_items; ++i)
   {
-    h_keys_in[i]   = static_cast<KeyT>((i * 1664525 + 1013904223) % 251);
+    h_keys_in[i]   = static_cast<int>((i * 1664525 + 1013904223) % 251);
     h_values_in[i] = i;
   }
-  thrust::device_vector<KeyT> d_keys_in   = h_keys_in;
-  thrust::device_vector<int> d_values_in  = h_values_in;
-  thrust::device_vector<KeyT> d_keys_out  = thrust::device_vector<KeyT>(8);
-  thrust::device_vector<int> d_values_out = thrust::device_vector<int>(8);
+  c2h::device_vector<int> d_keys_in    = h_keys_in;
+  c2h::device_vector<int> d_values_in  = h_values_in;
+  c2h::device_vector<int> d_keys_out   = c2h::device_vector<int>(8);
+  c2h::device_vector<int> d_values_out = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
   REQUIRE(cudaSuccess
           == cub::DeviceTopK::MinPairs(
             d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env));
 
-  thrust::host_vector<KeyT> h_keys_out  = d_keys_out;
-  thrust::host_vector<int> h_values_out = d_values_out;
+  c2h::host_vector<int> h_keys_out   = d_keys_out;
+  c2h::host_vector<int> h_values_out = d_values_out;
 
   // Verify pair association: each returned value indexes back to the corresponding key
   // (recall h_values_in[i] = i, so value-out is the original input position of the key-out)
@@ -336,15 +324,15 @@ C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]
 
 C2H_TEST("DeviceTopK::MaxKeys env-alloc handles K equal to num_items", "[topk][env]")
 {
-  thrust::device_vector<int> d_in{5, 2, 9, 1, 7};
-  thrust::device_vector<int> d_out(d_in.size());
+  c2h::device_vector<int> d_in{5, 2, 9, 1, 7};
+  c2h::device_vector<int> d_out(d_in.size());
 
   auto env = topk_requirements();
   REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), 5, 5, env));
 
-  thrust::host_vector<int> h_out = d_out;
+  c2h::host_vector<int> h_out = d_out;
   std::sort(h_out.begin(), h_out.end(), cuda::std::greater<int>{});
-  thrust::host_vector<int> expected{9, 7, 5, 2, 1};
+  c2h::host_vector<int> expected{9, 7, 5, 2, 1};
   REQUIRE(h_out == expected);
 }
 

From 5ec92826899652fc88f1f478a94eba7afbd24b5c Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Tue, 19 May 2026 13:56:28 -0700
Subject: [PATCH 11/11] final reviews

---
 cub/test/catch2_test_device_topk_env.cu     | 45 ++++++++-------------
 cub/test/catch2_test_device_topk_env_api.cu | 24 +++++++----
 2 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu
index ef97714d796..53aad4f7b30 100644
--- a/cub/test/catch2_test_device_topk_env.cu
+++ b/cub/test/catch2_test_device_topk_env.cu
@@ -21,6 +21,7 @@ struct stream_registry_factory_t;
 #include <cuda/std/functional>
 
 #include <algorithm>
+#include <numeric>
 
 #include "catch2_test_env_launch_helper.h"
 
@@ -193,11 +194,11 @@ c2h::host_vector<int> sorted_top_k(const c2h::host_vector<int>& h_in, int k, boo
   auto sorted = h_in;
   if (largest)
   {
-    std::sort(sorted.begin(), sorted.end(), cuda::std::greater<int>{});
+    std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end(), cuda::std::greater<int>{});
   }
   else
   {
-    std::sort(sorted.begin(), sorted.end());
+    std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end());
   }
   sorted.resize(static_cast<size_t>(k));
   return sorted;
@@ -207,12 +208,9 @@ c2h::host_vector<int> sorted_top_k(const c2h::host_vector<int>& h_in, int k, boo
 C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]")
 {
   const int num_items = 256;
-  c2h::host_vector<int> h_in(num_items);
-  for (int i = 0; i < num_items; ++i)
-  {
-    h_in[i] = static_cast<int>((i * 1664525 + 1013904223) % 251);
-  }
-  c2h::device_vector<int> d_in    = h_in;
+  c2h::device_vector<int> d_in(num_items);
+  c2h::gen(C2H_SEED(1), d_in);
+  c2h::host_vector<int> h_in      = d_in;
   c2h::device_vector<int> d_out_k = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
@@ -228,12 +226,9 @@ C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]")
 C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]")
 {
   const int num_items = 256;
-  c2h::host_vector<int> h_in(num_items);
-  for (int i = 0; i < num_items; ++i)
-  {
-    h_in[i] = static_cast<int>((i * 1664525 + 1013904223) % 251);
-  }
-  c2h::device_vector<int> d_in    = h_in;
+  c2h::device_vector<int> d_in(num_items);
+  c2h::gen(C2H_SEED(1), d_in);
+  c2h::host_vector<int> h_in      = d_in;
   c2h::device_vector<int> d_out_k = c2h::device_vector<int>(8);
 
   auto env = topk_requirements();
@@ -249,14 +244,11 @@ C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]"
 C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]")
 {
   const int num_items = 256;
-  c2h::host_vector<int> h_keys_in(num_items);
+  c2h::device_vector<int> d_keys_in(num_items);
+  c2h::gen(C2H_SEED(1), d_keys_in);
+  c2h::host_vector<int> h_keys_in = d_keys_in;
   c2h::host_vector<int> h_values_in(num_items);
-  for (int i = 0; i < num_items; ++i)
-  {
-    h_keys_in[i]   = static_cast<int>((i * 1664525 + 1013904223) % 251);
-    h_values_in[i] = i;
-  }
-  c2h::device_vector<int> d_keys_in    = h_keys_in;
+  std::iota(h_values_in.begin(), h_values_in.end(), 0);
   c2h::device_vector<int> d_values_in  = h_values_in;
   c2h::device_vector<int> d_keys_out   = c2h::device_vector<int>(8);
   c2h::device_vector<int> d_values_out = c2h::device_vector<int>(8);
@@ -287,14 +279,11 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]")
 C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]")
 {
   const int num_items = 256;
-  c2h::host_vector<int> h_keys_in(num_items);
+  c2h::device_vector<int> d_keys_in(num_items);
+  c2h::gen(C2H_SEED(1), d_keys_in);
+  c2h::host_vector<int> h_keys_in = d_keys_in;
   c2h::host_vector<int> h_values_in(num_items);
-  for (int i = 0; i < num_items; ++i)
-  {
-    h_keys_in[i]   = static_cast<int>((i * 1664525 + 1013904223) % 251);
-    h_values_in[i] = i;
-  }
-  c2h::device_vector<int> d_keys_in    = h_keys_in;
+  std::iota(h_values_in.begin(), h_values_in.end(), 0);
   c2h::device_vector<int> d_values_in  = h_values_in;
   c2h::device_vector<int> d_keys_out   = c2h::device_vector<int>(8);
   c2h::device_vector<int> d_values_out = c2h::device_vector<int>(8);
diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu
index d1b67a7db4f..39d163d3f88 100644
--- a/cub/test/catch2_test_device_topk_env_api.cu
+++ b/cub/test/catch2_test_device_topk_env_api.cu
@@ -52,7 +52,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]")
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, env);
@@ -81,7 +82,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]")
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MinKeys(d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), k, env);
@@ -111,7 +113,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]"
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MaxPairs(
@@ -148,7 +151,8 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]"
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MinPairs(
@@ -185,7 +189,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref"
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MaxKeys(
@@ -220,7 +225,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref"
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MinKeys(
@@ -257,7 +263,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MaxPairs(
@@ -301,7 +308,8 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref
   cuda::stream stream{cuda::devices[0]};
   cuda::stream_ref stream_ref{stream};
   auto env = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted),
+    cuda::execution::require(cuda::execution::determinism::not_guaranteed, //
+                             cuda::execution::output_ordering::unsorted),
     stream_ref};
 
   auto error = cub::DeviceTopK::MinPairs(