From 1cd8c3d5524605af4c5e222c0674e807d093efcb Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 22:40:10 -0700 Subject: [PATCH 01/11] add env DeviceTopK without temp_storage args --- cub/cub/device/device_topk.cuh | 868 +++++++++++++++++++- cub/test/catch2_test_device_topk_api.cu | 90 ++ cub/test/catch2_test_device_topk_env_api.cu | 332 ++++++++ 3 files changed, 1257 insertions(+), 33 deletions(-) create mode 100644 cub/test/catch2_test_device_topk_env_api.cu diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh index 0567b76149d..1deca561e8d 100644 --- a/cub/cub/device/device_topk.cuh +++ b/cub/cub/device/device_topk.cuh @@ -18,6 +18,7 @@ #endif // no system header #include +#include #include #include @@ -293,6 +294,115 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-pairs-env + //! :end-before: example-end topk-max-pairs-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -433,6 +543,119 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs, + //! using a decomposer to interpret user-defined key types. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-pairs-decomposer-env + //! :end-before: example-end topk-max-pairs-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] d_values_in + //! Pointer to the input values + //! + //! @param[out] d_values_out + //! Pointer to the K output values + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] decomposer + //! Decomposer object for interpreting user-defined key types + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static // + ::cuda::std::enable_if_t, DecomposerT>, + cudaError_t> + MaxPairs(KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -545,6 +768,109 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-pairs-env + //! :end-before: example-end topk-min-pairs-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] d_values_in + //! Pointer to the input values + //! + //! @param[out] d_values_out + //! Pointer to the K output values + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -608,40 +934,157 @@ struct DeviceTopK //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its //! constituent arithmetic types. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param decomposer + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`. + //! @endrst + template > + CUB_RUNTIME_FUNCTION static // + ::cuda::std::enable_if_t, DecomposerT>, + cudaError_t> + MinPairs(void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MinPairs"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_topk( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + decomposer, + ::cuda::std::move(env)); + } + + //! @rst + //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs, + //! using a decomposer to interpret user-defined key types. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-pairs-decomposer-env + //! :end-before: example-end topk-min-pairs-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of decomposer //! - //! @param[in,out] temp_storage_bytes - //! Reference to size in bytes of `d_temp_storage` allocation + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Random-access iterator to the input sequence containing the keys + //! Pointer to the input keys //! //! @param[out] d_keys_out - //! Random-access iterator to the output sequence of keys, where K values will be written to + //! Pointer to the K output keys //! //! @param[in] d_values_in - //! Random-access iterator to the input sequence containing the values associated to each key + //! Pointer to the input values //! //! @param[out] d_values_out - //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be - //! written to + //! Pointer to the K output values //! //! @param[in] num_items - //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! Number of input items //! //! @param[in] k - //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of - //! `num_items`. + //! The K value //! - //! @param decomposer - //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic - //! types. + //! @param[in] decomposer + //! Decomposer object for interpreting user-defined key types //! //! @param[in] env //! @rst - //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst template > - CUB_RUNTIME_FUNCTION static // + [[nodiscard]] CUB_RUNTIME_FUNCTION static // ::cuda::std::enable_if_t, DecomposerT>, cudaError_t> - MinPairs(void* d_temp_storage, - size_t& temp_storage_bytes, - KeyInputIteratorT d_keys_in, + MinPairs(KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, ValueInputIteratorT d_values_in, ValueOutputIteratorT d_values_out, @@ -665,24 +1106,17 @@ struct DeviceTopK DecomposerT decomposer, EnvT env = {}) { - _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MinPairs"); + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); using key_t = detail::it_value_t; static_assert(!detail::radix::can_twiddle, "Custom decomposers are not supported for fundamental types; " "use the non-decomposer API overload instead"); - return detail::dispatch_topk( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - k, - decomposer, - ::cuda::std::move(env)); + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); } //! @rst @@ -780,6 +1214,89 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys from an unordered input sequence. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-keys-env + //! :end-before: example-end topk-max-keys-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( + KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -903,6 +1420,107 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys from an unordered input sequence, + //! using a decomposer to interpret user-defined key types. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-keys-decomposer-env + //! :end-before: example-end topk-max-keys-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of decomposer + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] decomposer + //! Decomposer object for interpreting user-defined key types + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static // + ::cuda::std::enable_if_t, DecomposerT>, + cudaError_t> + MaxKeys(KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -998,6 +1616,89 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the smallest K keys from an unordered input sequence. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-keys-env + //! :end-before: example-end topk-min-keys-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys( + KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -1120,6 +1821,107 @@ struct DeviceTopK decomposer, ::cuda::std::move(env)); } + + //! @rst + //! Finds the smallest K keys from an unordered input sequence, + //! using a decomposer to interpret user-defined key types. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-keys-decomposer-env + //! :end-before: example-end topk-min-keys-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of decomposer + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Pointer to the input keys + //! + //! @param[out] d_keys_out + //! Pointer to the K output keys + //! + //! @param[in] num_items + //! Number of input items + //! + //! @param[in] k + //! The K value + //! + //! @param[in] decomposer + //! Decomposer object for interpreting user-defined key types + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static // + ::cuda::std::enable_if_t, DecomposerT>, + cudaError_t> + MinKeys(KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); + } }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu index 8b7d7a52a70..a988e2ca7ff 100644 --- a/cub/test/catch2_test_device_topk_api.cu +++ b/cub/test/catch2_test_device_topk_api.cu @@ -495,3 +495,93 @@ C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]") REQUIRE(expected_vals == vals_out); } } + +// Guard tests: each public DeviceTopK method's legacy temp-storage form must resolve +// unambiguously when called in its minimal form (no explicit stream, all defaults +// implicit), even though the env-alloc overloads (which drop the temp-storage args) +// also live in scope. If the env-overload SFINAE drifts, these become "ambiguous +// overload" compile errors. + +C2H_TEST("DeviceTopK::MaxKeys legacy size-query is unambiguous", "[topk][device]") +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + int* d_keys_in = nullptr; + int* d_keys_out = nullptr; + int num_items = 0; + int k = 0; + + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MaxKeys( + d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements)); +} + +C2H_TEST("DeviceTopK::MinKeys legacy size-query is unambiguous", "[topk][device]") +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + int* d_keys_in = nullptr; + int* d_keys_out = nullptr; + int num_items = 0; + int k = 0; + + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MinKeys( + d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements)); +} + +C2H_TEST("DeviceTopK::MaxPairs legacy size-query is unambiguous", "[topk][device]") +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + int* d_keys_in = nullptr; + int* d_keys_out = nullptr; + int* d_values_in = nullptr; + int* d_values_out = nullptr; + int num_items = 0; + int k = 0; + + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MaxPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + requirements)); +} + +C2H_TEST("DeviceTopK::MinPairs legacy size-query is unambiguous", "[topk][device]") +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + int* d_keys_in = nullptr; + int* d_keys_out = nullptr; + int* d_values_in = nullptr; + int* d_values_out = nullptr; + int num_items = 0; + int k = 0; + + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MinPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + requirements)); +} diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu new file mode 100644 index 00000000000..6d5964a5f2b --- /dev/null +++ b/cub/test/catch2_test_device_topk_env_api.cu @@ -0,0 +1,332 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// Simple user-defined key type for the decomposer-based examples. +struct topk_custom_t +{ + int rank; + int payload; +}; + +struct topk_custom_decomposer_t +{ + __host__ __device__ ::cuda::std::tuple operator()(topk_custom_t& key) const + { + return {key.rank}; + } + __host__ __device__ ::cuda::std::tuple operator()(const topk_custom_t& key) const + { + return {key.rank}; + } +}; + +C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-keys-env + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; + } + // example-end topk-max-keys-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + // Result order is unspecified for TopK; sort before compare. + thrust::sort(d_out.begin(), d_out.end(), cuda::std::greater{}); + thrust::device_vector expected{9, 8, 7}; + REQUIRE(d_out == expected); +} + +C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-keys-env + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; + } + // example-end topk-min-keys-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_out.begin(), d_out.end()); + thrust::device_vector expected{0, 1, 2}; + REQUIRE(d_out == expected); +} + +C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-pairs-env + auto d_keys_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_values_in = thrust::device_vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto d_keys_out = thrust::device_vector(3); + auto d_values_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; + } + // example-end topk-max-pairs-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_keys_out.begin(), d_keys_out.end(), cuda::std::greater{}); + thrust::device_vector expected_keys{9, 8, 7}; + REQUIRE(d_keys_out == expected_keys); +} + +C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-pairs-env + auto d_keys_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_values_in = thrust::device_vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto d_keys_out = thrust::device_vector(3); + auto d_values_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; + } + // example-end topk-min-pairs-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_keys_out.begin(), d_keys_out.end()); + thrust::device_vector expected_keys{0, 1, 2}; + REQUIRE(d_keys_out == expected_keys); +} + +C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-keys-decomposer-env + thrust::host_vector h_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_in = h_in; + thrust::device_vector d_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxKeys( + d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, topk_custom_decomposer_t{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; + } + // example-end topk-max-keys-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank > b.rank; + }); + REQUIRE(h_out[0].rank == 9); + REQUIRE(h_out[1].rank == 8); + REQUIRE(h_out[2].rank == 7); +} + +C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-keys-decomposer-env + thrust::host_vector h_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_in = h_in; + thrust::device_vector d_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinKeys( + d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, topk_custom_decomposer_t{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; + } + // example-end topk-min-keys-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank < b.rank; + }); + REQUIRE(h_out[0].rank == 0); + REQUIRE(h_out[1].rank == 1); + REQUIRE(h_out[2].rank == 2); +} + +C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-pairs-decomposer-env + thrust::host_vector h_keys_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + thrust::device_vector d_keys_out(3); + thrust::device_vector d_values_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + topk_custom_decomposer_t{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; + } + // example-end topk-max-pairs-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank > b.rank; + }); + REQUIRE(h_keys_out[0].rank == 9); + REQUIRE(h_keys_out[1].rank == 8); + REQUIRE(h_keys_out[2].rank == 7); +} + +C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-pairs-decomposer-env + thrust::host_vector h_keys_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + thrust::device_vector d_keys_out(3); + thrust::device_vector d_values_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + topk_custom_decomposer_t{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; + } + // example-end topk-min-pairs-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank < b.rank; + }); + REQUIRE(h_keys_out[0].rank == 0); + REQUIRE(h_keys_out[1].rank == 1); + REQUIRE(h_keys_out[2].rank == 2); +} From a4e3f4682a686860849282ca9b7d65939a7890c7 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 22:58:48 -0700 Subject: [PATCH 02/11] remove non-env api tests --- cub/test/catch2_test_device_topk_api.cu | 587 ------------------------ 1 file changed, 587 deletions(-) delete mode 100644 cub/test/catch2_test_device_topk_api.cu diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu deleted file mode 100644 index a988e2ca7ff..00000000000 --- a/cub/test/catch2_test_device_topk_api.cu +++ /dev/null @@ -1,587 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -C2H_TEST("DeviceTopK::MinKeys API example for non-deterministic, unsorted results", "[device][device_transform]") -{ - // example-begin topk-min-keys-non-deterministic-unsorted - const int k = 4; - auto input = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; - auto output = thrust::device_vector(k, thrust::no_init); - - // Specify that we do not require a specific output order and do not require deterministic results - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - // Prepare CUDA stream - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - cuda::stream_ref stream_ref{stream}; - - // Create the environment with the stream and requirements - auto env = cuda::std::execution::env{stream_ref, requirements}; - - // Query temporary storage requirements - size_t temp_storage_bytes{}; - cub::DeviceTopK::MinKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env); - - // Allocate temporary storage - thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); - - cub::DeviceTopK::MinKeys( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - input.begin(), - output.begin(), - input.size(), - k, - env); - - // Get the top-k results into sorted order for easy comparison - thrust::sort(output.begin(), output.end()); - thrust::host_vector expected{-3, 1, 2, 4}; - // example-end topk-min-keys-non-deterministic-unsorted - - REQUIRE(output == expected); -} - -C2H_TEST("DeviceTopK::MaxKeys API example for non-deterministic, unsorted results", "[device][device_transform]") -{ - // example-begin topk-max-keys-non-deterministic-unsorted - const int k = 4; - auto input = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; - auto output = thrust::device_vector(k, thrust::no_init); - - // Specify that we do not require a specific output order and do not require deterministic results - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - // Prepare CUDA stream - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - cuda::stream_ref stream_ref{stream}; - - // Create the environment with the stream and requirements - auto env = cuda::std::execution::env{stream_ref, requirements}; - - // Query temporary storage requirements - size_t temp_storage_bytes{}; - cub::DeviceTopK::MaxKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env); - - // Allocate temporary storage - thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); - - cub::DeviceTopK::MaxKeys( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - input.begin(), - output.begin(), - input.size(), - k, - env); - - // Get the top-k results into sorted order for easy comparison - thrust::sort(output.begin(), output.end(), cuda::std::greater{}); - thrust::host_vector expected{8, 7, 6, 5}; - // example-end topk-max-keys-non-deterministic-unsorted - - REQUIRE(output == expected); -} - -C2H_TEST("DeviceTopK::MinPairs API example for non-deterministic, unsorted results", "[device][device_transform]") -{ - // example-begin topk-min-pairs-non-deterministic-unsorted - const int k = 4; - auto keys = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; - auto values = cuda::make_counting_iterator(0); - auto keys_out = thrust::device_vector(k, thrust::no_init); - auto values_out = thrust::device_vector(k, thrust::no_init); - - // Specify that we do not require a specific output order and do not require deterministic results - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - // Prepare CUDA stream - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - cuda::stream_ref stream_ref{stream}; - - // Create the environment with the stream and requirements - auto env = cuda::std::execution::env{stream_ref, requirements}; - - // Query temporary storage requirements - size_t temp_storage_bytes{}; - cub::DeviceTopK::MinPairs( - nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env); - - // Allocate temporary storage - thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); - - cub::DeviceTopK::MinPairs( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - keys.begin(), - keys_out.begin(), - values, - values_out.begin(), - keys.size(), - k, - env); - - // Get the top-k results into sorted order for easy comparison - thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin()); - thrust::host_vector expected_keys{-3, 1, 2, 4}; - thrust::host_vector expected_values{1, 2, 5, 6}; - // example-end topk-min-pairs-non-deterministic-unsorted - - REQUIRE(keys_out == expected_keys); - REQUIRE(values_out == expected_values); -} - -C2H_TEST("DeviceTopK::MaxPairs API example for non-deterministic, unsorted results", "[device][device_transform]") -{ - // example-begin topk-max-pairs-non-deterministic-unsorted - const int k = 4; - auto keys = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; - auto values = cuda::make_counting_iterator(0); - auto keys_out = thrust::device_vector(k, thrust::no_init); - auto values_out = thrust::device_vector(k, thrust::no_init); - - // Specify that we do not require a specific output order and do not require deterministic results - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - // Prepare CUDA stream - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - cuda::stream_ref stream_ref{stream}; - - // Create the environment with the stream and requirements - auto env = cuda::std::execution::env{stream_ref, requirements}; - - // Query temporary storage requirements - size_t temp_storage_bytes{}; - cub::DeviceTopK::MaxPairs( - nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env); - - // Allocate temporary storage - thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); - - cub::DeviceTopK::MaxPairs( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - keys.begin(), - keys_out.begin(), - values, - values_out.begin(), - keys.size(), - k, - env); - - // Get the top-k results into sorted order for easy comparison - thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin(), cuda::std::greater<>{}); - thrust::host_vector expected_keys{8, 7, 6, 5}; - thrust::host_vector expected_values{4, 3, 7, 0}; - // example-end topk-max-pairs-non-deterministic-unsorted - - REQUIRE(keys_out == expected_keys); - REQUIRE(values_out == expected_values); -} - -// example-begin topk-custom-type -struct custom_t -{ - float f; - int unused; - long long int lli; - - custom_t() = default; - custom_t(float f, long long int lli) - : f(f) - , unused(42) - , lli(lli) - {} -}; - -struct decomposer_t -{ - __host__ __device__ cuda::std::tuple operator()(custom_t& key) const - { - return {key.f, key.lli}; - } -}; -// example-end topk-custom-type - -static __host__ std::ostream& operator<<(std::ostream& os, const custom_t& self) -{ - return os << "{ " << self.f << ", " << self.lli << " }"; -} - -static __host__ __device__ bool operator==(const custom_t& lhs, const custom_t& rhs) -{ - return lhs.f == rhs.f && lhs.lli == rhs.lli; -} - -static __host__ __device__ bool operator<(const custom_t& lhs, const custom_t& rhs) -{ - return lhs.lli == rhs.lli ? lhs.f < rhs.f : lhs.lli < rhs.lli; -} - -static __host__ __device__ bool operator>(const custom_t& lhs, const custom_t& rhs) -{ - return rhs < lhs; -} - -C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]") -{ - SECTION("MaxKeys") - { - // example-begin topk-max-keys-custom-type - constexpr int num_items = 6; - constexpr int k = 3; - - thrust::device_vector in = { - {+2.5f, 4}, // - {-2.5f, 0}, // - {+1.1f, 3}, // - {+0.0f, 1}, // - {-0.0f, 2}, // - {+3.7f, 5} // - }; - - thrust::device_vector out(k); - - const custom_t* d_in = thrust::raw_pointer_cast(in.data()); - custom_t* d_out = thrust::raw_pointer_cast(out.data()); - - auto requirements = cuda::execution::require( - cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - // 1) Get temp storage size - std::uint8_t* d_temp_storage{}; - std::size_t temp_storage_bytes{}; - - cub::DeviceTopK::MaxKeys( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); - - // 2) Allocate temp storage - thrust::device_vector temp_storage(temp_storage_bytes); - d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - - // 3) Find the top-k largest keys - cub::DeviceTopK::MaxKeys( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); - - // Sort output for comparison (output order is not guaranteed) - thrust::sort(out.begin(), out.end(), cuda::std::greater<>{}); - thrust::device_vector expected = { - {+3.7f, 5}, // - {+2.5f, 4}, // - {+1.1f, 3} // - }; - // example-end topk-max-keys-custom-type - - REQUIRE(expected == out); - } - - SECTION("MinKeys") - { - // example-begin topk-min-keys-custom-type - constexpr int num_items = 6; - constexpr int k = 3; - - thrust::device_vector in = { - {+2.5f, 4}, // - {-2.5f, 0}, // - {+1.1f, 3}, // - {+0.0f, 1}, // - {-0.0f, 2}, // - {+3.7f, 5} // - }; - - thrust::device_vector out(k); - - const custom_t* d_in = thrust::raw_pointer_cast(in.data()); - custom_t* d_out = thrust::raw_pointer_cast(out.data()); - - auto requirements = cuda::execution::require( - cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - std::uint8_t* d_temp_storage{}; - std::size_t temp_storage_bytes{}; - - cub::DeviceTopK::MinKeys( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); - - thrust::device_vector temp_storage(temp_storage_bytes); - d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - - cub::DeviceTopK::MinKeys( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); - - // Sort output for comparison (output order is not guaranteed) - thrust::sort(out.begin(), out.end()); - thrust::device_vector expected = { - {-2.5f, 0}, // - {+0.0f, 1}, // - {-0.0f, 2} // - }; - // example-end topk-min-keys-custom-type - - REQUIRE(expected == out); - } - - SECTION("MaxPairs") - { - // example-begin topk-max-pairs-custom-type - constexpr int num_items = 6; - constexpr int k = 3; - - thrust::device_vector keys_in = { - {+2.5f, 4}, // - {-2.5f, 0}, // - {+1.1f, 3}, // - {+0.0f, 1}, // - {-0.0f, 2}, // - {+3.7f, 5} // - }; - - thrust::device_vector keys_out(k); - - const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data()); - custom_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data()); - - thrust::device_vector vals_in = {0, 1, 2, 3, 4, 5}; - thrust::device_vector vals_out(k); - - const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data()); - int* d_vals_out = thrust::raw_pointer_cast(vals_out.data()); - - auto requirements = cuda::execution::require( - cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - std::uint8_t* d_temp_storage{}; - std::size_t temp_storage_bytes{}; - - cub::DeviceTopK::MaxPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_vals_in, - d_vals_out, - num_items, - k, - decomposer_t{}, - requirements); - - thrust::device_vector temp_storage(temp_storage_bytes); - d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - - cub::DeviceTopK::MaxPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_vals_in, - d_vals_out, - num_items, - k, - decomposer_t{}, - requirements); - - // Sort by key for comparison (output order is not guaranteed) - thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin(), cuda::std::greater<>{}); - - thrust::device_vector expected_keys = { - {+3.7f, 5}, // - {+2.5f, 4}, // - {+1.1f, 3} // - }; - - thrust::device_vector expected_vals = {5, 0, 2}; - // example-end topk-max-pairs-custom-type - - REQUIRE(expected_keys == keys_out); - REQUIRE(expected_vals == vals_out); - } - - SECTION("MinPairs") - { - // example-begin topk-min-pairs-custom-type - constexpr int num_items = 6; - constexpr int k = 3; - - thrust::device_vector keys_in = { - {+2.5f, 4}, // - {-2.5f, 0}, // - {+1.1f, 3}, // - {+0.0f, 1}, // - {-0.0f, 2}, // - {+3.7f, 5} // - }; - - thrust::device_vector keys_out(k); - - const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data()); - custom_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data()); - - thrust::device_vector vals_in = {0, 1, 2, 3, 4, 5}; - thrust::device_vector vals_out(k); - - const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data()); - int* d_vals_out = thrust::raw_pointer_cast(vals_out.data()); - - auto requirements = cuda::execution::require( - cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - - std::uint8_t* d_temp_storage{}; - std::size_t temp_storage_bytes{}; - - cub::DeviceTopK::MinPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_vals_in, - d_vals_out, - num_items, - k, - decomposer_t{}, - requirements); - - thrust::device_vector temp_storage(temp_storage_bytes); - d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - - cub::DeviceTopK::MinPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_vals_in, - d_vals_out, - num_items, - k, - decomposer_t{}, - requirements); - - // Sort by key for comparison (output order is not guaranteed) - thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin()); - - thrust::device_vector expected_keys = { - {-2.5f, 0}, // - {+0.0f, 1}, // - {-0.0f, 2} // - }; - - thrust::device_vector expected_vals = {1, 3, 4}; - // example-end topk-min-pairs-custom-type - - REQUIRE(expected_keys == keys_out); - REQUIRE(expected_vals == vals_out); - } -} - -// Guard tests: each public DeviceTopK method's legacy temp-storage form must resolve -// unambiguously when called in its minimal form (no explicit stream, all defaults -// implicit), even though the env-alloc overloads (which drop the temp-storage args) -// also live in scope. If the env-overload SFINAE drifts, these become "ambiguous -// overload" compile errors. - -C2H_TEST("DeviceTopK::MaxKeys legacy size-query is unambiguous", "[topk][device]") -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - int* d_keys_in = nullptr; - int* d_keys_out = nullptr; - int num_items = 0; - int k = 0; - - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - REQUIRE(cudaSuccess - == cub::DeviceTopK::MaxKeys( - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements)); -} - -C2H_TEST("DeviceTopK::MinKeys legacy size-query is unambiguous", "[topk][device]") -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - int* d_keys_in = nullptr; - int* d_keys_out = nullptr; - int num_items = 0; - int k = 0; - - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - REQUIRE(cudaSuccess - == cub::DeviceTopK::MinKeys( - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, k, requirements)); -} - -C2H_TEST("DeviceTopK::MaxPairs legacy size-query is unambiguous", "[topk][device]") -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - int* d_keys_in = nullptr; - int* d_keys_out = nullptr; - int* d_values_in = nullptr; - int* d_values_out = nullptr; - int num_items = 0; - int k = 0; - - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - REQUIRE(cudaSuccess - == cub::DeviceTopK::MaxPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - k, - requirements)); -} - -C2H_TEST("DeviceTopK::MinPairs legacy size-query is unambiguous", "[topk][device]") -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - int* d_keys_in = nullptr; - int* d_keys_out = nullptr; - int* d_values_in = nullptr; - int* d_values_out = nullptr; - int num_items = 0; - int k = 0; - - auto requirements = - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); - REQUIRE(cudaSuccess - == cub::DeviceTopK::MinPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - k, - requirements)); -} From e199d20c21d739c0ff7a923bb251a220e1d24022 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 23:02:15 -0700 Subject: [PATCH 03/11] revert device_topk_api --- cub/test/catch2_test_device_topk_api.cu | 497 ++++++++++++++++++++++++ 1 file changed, 497 insertions(+) create mode 100644 cub/test/catch2_test_device_topk_api.cu diff --git a/cub/test/catch2_test_device_topk_api.cu b/cub/test/catch2_test_device_topk_api.cu new file mode 100644 index 00000000000..8b7d7a52a70 --- /dev/null +++ b/cub/test/catch2_test_device_topk_api.cu @@ -0,0 +1,497 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +C2H_TEST("DeviceTopK::MinKeys API example for non-deterministic, unsorted results", "[device][device_transform]") +{ + // example-begin topk-min-keys-non-deterministic-unsorted + const int k = 4; + auto input = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; + auto output = thrust::device_vector(k, thrust::no_init); + + // Specify that we do not require a specific output order and do not require deterministic results + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + // Prepare CUDA stream + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + cuda::stream_ref stream_ref{stream}; + + // Create the environment with the stream and requirements + auto env = cuda::std::execution::env{stream_ref, requirements}; + + // Query temporary storage requirements + size_t temp_storage_bytes{}; + cub::DeviceTopK::MinKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env); + + // Allocate temporary storage + thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); + + cub::DeviceTopK::MinKeys( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + input.begin(), + output.begin(), + input.size(), + k, + env); + + // Get the top-k results into sorted order for easy comparison + thrust::sort(output.begin(), output.end()); + thrust::host_vector expected{-3, 1, 2, 4}; + // example-end topk-min-keys-non-deterministic-unsorted + + REQUIRE(output == expected); +} + +C2H_TEST("DeviceTopK::MaxKeys API example for non-deterministic, unsorted results", "[device][device_transform]") +{ + // example-begin topk-max-keys-non-deterministic-unsorted + const int k = 4; + auto input = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; + auto output = thrust::device_vector(k, thrust::no_init); + + // Specify that we do not require a specific output order and do not require deterministic results + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + // Prepare CUDA stream + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + cuda::stream_ref stream_ref{stream}; + + // Create the environment with the stream and requirements + auto env = cuda::std::execution::env{stream_ref, requirements}; + + // Query temporary storage requirements + size_t temp_storage_bytes{}; + cub::DeviceTopK::MaxKeys(nullptr, temp_storage_bytes, input.begin(), output.begin(), input.size(), k, env); + + // Allocate temporary storage + thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); + + cub::DeviceTopK::MaxKeys( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + input.begin(), + output.begin(), + input.size(), + k, + env); + + // Get the top-k results into sorted order for easy comparison + thrust::sort(output.begin(), output.end(), cuda::std::greater{}); + thrust::host_vector expected{8, 7, 6, 5}; + // example-end topk-max-keys-non-deterministic-unsorted + + REQUIRE(output == expected); +} + +C2H_TEST("DeviceTopK::MinPairs API example for non-deterministic, unsorted results", "[device][device_transform]") +{ + // example-begin topk-min-pairs-non-deterministic-unsorted + const int k = 4; + auto keys = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; + auto values = cuda::make_counting_iterator(0); + auto keys_out = thrust::device_vector(k, thrust::no_init); + auto values_out = thrust::device_vector(k, thrust::no_init); + + // Specify that we do not require a specific output order and do not require deterministic results + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + // Prepare CUDA stream + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + cuda::stream_ref stream_ref{stream}; + + // Create the environment with the stream and requirements + auto env = cuda::std::execution::env{stream_ref, requirements}; + + // Query temporary storage requirements + size_t temp_storage_bytes{}; + cub::DeviceTopK::MinPairs( + nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env); + + // Allocate temporary storage + thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); + + cub::DeviceTopK::MinPairs( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + keys.begin(), + keys_out.begin(), + values, + values_out.begin(), + keys.size(), + k, + env); + + // Get the top-k results into sorted order for easy comparison + thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin()); + thrust::host_vector expected_keys{-3, 1, 2, 4}; + thrust::host_vector expected_values{1, 2, 5, 6}; + // example-end topk-min-pairs-non-deterministic-unsorted + + REQUIRE(keys_out == expected_keys); + REQUIRE(values_out == expected_values); +} + +C2H_TEST("DeviceTopK::MaxPairs API example for non-deterministic, unsorted results", "[device][device_transform]") +{ + // example-begin topk-max-pairs-non-deterministic-unsorted + const int k = 4; + auto keys = thrust::device_vector{5, -3, 1, 7, 8, 2, 4, 6}; + auto values = cuda::make_counting_iterator(0); + auto keys_out = thrust::device_vector(k, thrust::no_init); + auto values_out = thrust::device_vector(k, thrust::no_init); + + // Specify that we do not require a specific output order and do not require deterministic results + auto requirements = + cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + // Prepare CUDA stream + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + cuda::stream_ref stream_ref{stream}; + + // Create the environment with the stream and requirements + auto env = cuda::std::execution::env{stream_ref, requirements}; + + // Query temporary storage requirements + size_t temp_storage_bytes{}; + cub::DeviceTopK::MaxPairs( + nullptr, temp_storage_bytes, keys.begin(), keys_out.begin(), values, values_out.begin(), keys.size(), k, env); + + // Allocate temporary storage + thrust::device_vector temp_storage(temp_storage_bytes, thrust::no_init); + + cub::DeviceTopK::MaxPairs( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + keys.begin(), + keys_out.begin(), + values, + values_out.begin(), + keys.size(), + k, + env); + + // Get the top-k results into sorted order for easy comparison + thrust::sort_by_key(keys_out.begin(), keys_out.end(), values_out.begin(), cuda::std::greater<>{}); + thrust::host_vector expected_keys{8, 7, 6, 5}; + thrust::host_vector expected_values{4, 3, 7, 0}; + // example-end topk-max-pairs-non-deterministic-unsorted + + REQUIRE(keys_out == expected_keys); + REQUIRE(values_out == expected_values); +} + +// example-begin topk-custom-type +struct custom_t +{ + float f; + int unused; + long long int lli; + + custom_t() = default; + custom_t(float f, long long int lli) + : f(f) + , unused(42) + , lli(lli) + {} +}; + +struct decomposer_t +{ + __host__ __device__ cuda::std::tuple operator()(custom_t& key) const + { + return {key.f, key.lli}; + } +}; +// example-end topk-custom-type + +static __host__ std::ostream& operator<<(std::ostream& os, const custom_t& self) +{ + return os << "{ " << self.f << ", " << self.lli << " }"; +} + +static __host__ __device__ bool operator==(const custom_t& lhs, const custom_t& rhs) +{ + return lhs.f == rhs.f && lhs.lli == rhs.lli; +} + +static __host__ __device__ bool operator<(const custom_t& lhs, const custom_t& rhs) +{ + return lhs.lli == rhs.lli ? lhs.f < rhs.f : lhs.lli < rhs.lli; +} + +static __host__ __device__ bool operator>(const custom_t& lhs, const custom_t& rhs) +{ + return rhs < lhs; +} + +C2H_TEST("DeviceTopK works with custom types and decomposer", "[device][topk]") +{ + SECTION("MaxKeys") + { + // example-begin topk-max-keys-custom-type + constexpr int num_items = 6; + constexpr int k = 3; + + thrust::device_vector in = { + {+2.5f, 4}, // + {-2.5f, 0}, // + {+1.1f, 3}, // + {+0.0f, 1}, // + {-0.0f, 2}, // + {+3.7f, 5} // + }; + + thrust::device_vector out(k); + + const custom_t* d_in = thrust::raw_pointer_cast(in.data()); + custom_t* d_out = thrust::raw_pointer_cast(out.data()); + + auto requirements = cuda::execution::require( + cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + // 1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + cub::DeviceTopK::MaxKeys( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Find the top-k largest keys + cub::DeviceTopK::MaxKeys( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); + + // Sort output for comparison (output order is not guaranteed) + thrust::sort(out.begin(), out.end(), cuda::std::greater<>{}); + thrust::device_vector expected = { + {+3.7f, 5}, // + {+2.5f, 4}, // + {+1.1f, 3} // + }; + // example-end topk-max-keys-custom-type + + REQUIRE(expected == out); + } + + SECTION("MinKeys") + { + // example-begin topk-min-keys-custom-type + constexpr int num_items = 6; + constexpr int k = 3; + + thrust::device_vector in = { + {+2.5f, 4}, // + {-2.5f, 0}, // + {+1.1f, 3}, // + {+0.0f, 1}, // + {-0.0f, 2}, // + {+3.7f, 5} // + }; + + thrust::device_vector out(k); + + const custom_t* d_in = thrust::raw_pointer_cast(in.data()); + custom_t* d_out = thrust::raw_pointer_cast(out.data()); + + auto requirements = cuda::execution::require( + cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + cub::DeviceTopK::MinKeys( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceTopK::MinKeys( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, k, decomposer_t{}, requirements); + + // Sort output for comparison (output order is not guaranteed) + thrust::sort(out.begin(), out.end()); + thrust::device_vector expected = { + {-2.5f, 0}, // + {+0.0f, 1}, // + {-0.0f, 2} // + }; + // example-end topk-min-keys-custom-type + + REQUIRE(expected == out); + } + + SECTION("MaxPairs") + { + // example-begin topk-max-pairs-custom-type + constexpr int num_items = 6; + constexpr int k = 3; + + thrust::device_vector keys_in = { + {+2.5f, 4}, // + {-2.5f, 0}, // + {+1.1f, 3}, // + {+0.0f, 1}, // + {-0.0f, 2}, // + {+3.7f, 5} // + }; + + thrust::device_vector keys_out(k); + + const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data()); + custom_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data()); + + thrust::device_vector vals_in = {0, 1, 2, 3, 4, 5}; + thrust::device_vector vals_out(k); + + const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data()); + int* d_vals_out = thrust::raw_pointer_cast(vals_out.data()); + + auto requirements = cuda::execution::require( + cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + cub::DeviceTopK::MaxPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_vals_in, + d_vals_out, + num_items, + k, + decomposer_t{}, + requirements); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceTopK::MaxPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_vals_in, + d_vals_out, + num_items, + k, + decomposer_t{}, + requirements); + + // Sort by key for comparison (output order is not guaranteed) + thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin(), cuda::std::greater<>{}); + + thrust::device_vector expected_keys = { + {+3.7f, 5}, // + {+2.5f, 4}, // + {+1.1f, 3} // + }; + + thrust::device_vector expected_vals = {5, 0, 2}; + // example-end topk-max-pairs-custom-type + + REQUIRE(expected_keys == keys_out); + REQUIRE(expected_vals == vals_out); + } + + SECTION("MinPairs") + { + // example-begin topk-min-pairs-custom-type + constexpr int num_items = 6; + constexpr int k = 3; + + thrust::device_vector keys_in = { + {+2.5f, 4}, // + {-2.5f, 0}, // + {+1.1f, 3}, // + {+0.0f, 1}, // + {-0.0f, 2}, // + {+3.7f, 5} // + }; + + thrust::device_vector keys_out(k); + + const custom_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data()); + custom_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data()); + + thrust::device_vector vals_in = {0, 1, 2, 3, 4, 5}; + thrust::device_vector vals_out(k); + + const int* d_vals_in = thrust::raw_pointer_cast(vals_in.data()); + int* d_vals_out = thrust::raw_pointer_cast(vals_out.data()); + + auto requirements = cuda::execution::require( + cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted); + + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + cub::DeviceTopK::MinPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_vals_in, + d_vals_out, + num_items, + k, + decomposer_t{}, + requirements); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceTopK::MinPairs( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_vals_in, + d_vals_out, + num_items, + k, + decomposer_t{}, + requirements); + + // Sort by key for comparison (output order is not guaranteed) + thrust::sort_by_key(keys_out.begin(), keys_out.end(), vals_out.begin()); + + thrust::device_vector expected_keys = { + {-2.5f, 0}, // + {+0.0f, 1}, // + {-0.0f, 2} // + }; + + thrust::device_vector expected_vals = {1, 3, 4}; + // example-end topk-min-pairs-custom-type + + REQUIRE(expected_keys == keys_out); + REQUIRE(expected_vals == vals_out); + } +} From 59755df01832a7d0839d60199d674038e276ee79 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 23:05:22 -0700 Subject: [PATCH 04/11] add unit test --- cub/test/catch2_test_device_topk_env.cu | 142 ++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu index 296eb603908..3b9fd3b31bd 100644 --- a/cub/test/catch2_test_device_topk_env.cu +++ b/cub/test/catch2_test_device_topk_env.cu @@ -11,6 +11,7 @@ struct stream_registry_factory_t; #include #include +#include #include #include @@ -20,6 +21,8 @@ struct stream_registry_factory_t; #include #include +#include + #include "catch2_test_env_launch_helper.h" // %PARAM% TEST_LAUNCH lid 0:2 @@ -184,4 +187,143 @@ C2H_TEST("DeviceTopK::MinPairs can be tuned", "[topk][device]", block_sizes) REQUIRE(d_block_size[0] == target_block_size); } +namespace +{ +template +thrust::host_vector sorted_top_k(const thrust::host_vector& h_in, int k, bool largest) +{ + auto sorted = h_in; + if (largest) + { + std::sort(sorted.begin(), sorted.end(), cuda::std::greater{}); + } + else + { + std::sort(sorted.begin(), sorted.end()); + } + sorted.resize(static_cast(k)); + return sorted; +} +} // namespace + +using topk_element_types = c2h::type_list; + +C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]", topk_element_types) +{ + using T = c2h::get<0, TestType>; + + const int num_items = 256; + thrust::host_vector h_in(num_items); + for (int i = 0; i < num_items; ++i) + { + h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + } + thrust::device_vector d_in = h_in; + thrust::device_vector d_out_k = thrust::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); + + thrust::host_vector h_out = d_out_k; + std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); + + auto expected = sorted_top_k(h_in, 8, /*largest*/ true); + REQUIRE(h_out == expected); +} + +C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]", topk_element_types) +{ + using T = c2h::get<0, TestType>; + + const int num_items = 256; + thrust::host_vector h_in(num_items); + for (int i = 0; i < num_items; ++i) + { + h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + } + thrust::device_vector d_in = h_in; + thrust::device_vector d_out_k = thrust::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MinKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); + + thrust::host_vector h_out = d_out_k; + std::sort(h_out.begin(), h_out.end()); + + auto expected = sorted_top_k(h_in, 8, /*largest*/ false); + REQUIRE(h_out == expected); +} + +C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", topk_element_types) +{ + using KeyT = c2h::get<0, TestType>; + + const int num_items = 256; + thrust::host_vector h_keys_in(num_items); + thrust::host_vector h_values_in(num_items); + for (int i = 0; i < num_items; ++i) + { + h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_values_in[i] = i; + } + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in = h_values_in; + thrust::device_vector d_keys_out = thrust::device_vector(8); + thrust::device_vector d_values_out = thrust::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater{}); + + auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true); + REQUIRE(h_keys_out == expected); +} + +C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]", topk_element_types) +{ + using KeyT = c2h::get<0, TestType>; + + const int num_items = 256; + thrust::host_vector h_keys_in(num_items); + thrust::host_vector h_values_in(num_items); + for (int i = 0; i < num_items; ++i) + { + h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_values_in[i] = i; + } + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in = h_values_in; + thrust::device_vector d_keys_out = thrust::device_vector(8); + thrust::device_vector d_values_out = thrust::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MinPairs( + d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end()); + + auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ false); + REQUIRE(h_keys_out == expected); +} + +C2H_TEST("DeviceTopK::MaxKeys env-alloc handles K equal to num_items", "[topk][env]") +{ + thrust::device_vector d_in{5, 2, 9, 1, 7}; + thrust::device_vector d_out(d_in.size()); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), 5, 5, env)); + + thrust::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); + thrust::host_vector expected{9, 7, 5, 2, 1}; + REQUIRE(h_out == expected); +} + #endif // TEST_LAUNCH != 1 From 1637e52d0e6c604d6c83fba3f0599ff4ba498cd4 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 23:06:23 -0700 Subject: [PATCH 05/11] pre commit --- cub/cub/device/device_topk.cuh | 180 +++++++++++++++++---------------- 1 file changed, 94 insertions(+), 86 deletions(-) diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh index 1deca561e8d..39476b584af 100644 --- a/cub/cub/device/device_topk.cuh +++ b/cub/cub/device/device_topk.cuh @@ -388,19 +388,20 @@ struct DeviceTopK { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - k, - detail::identity_decomposer_t{}, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); } //! @rst @@ -650,10 +651,11 @@ struct DeviceTopK "Custom decomposers are not supported for fundamental types; " "use the non-decomposer API overload instead"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); } //! @rst @@ -856,19 +858,20 @@ struct DeviceTopK { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - k, - detail::identity_decomposer_t{}, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); } //! @rst @@ -1113,10 +1116,11 @@ struct DeviceTopK "Custom decomposers are not supported for fundamental types; " "use the non-decomposer API overload instead"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); } //! @rst @@ -1282,19 +1286,20 @@ struct DeviceTopK { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - static_cast(nullptr), - static_cast(nullptr), - num_items, - k, - detail::identity_decomposer_t{}, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); } //! @rst @@ -1506,19 +1511,20 @@ struct DeviceTopK "Custom decomposers are not supported for fundamental types; " "use the non-decomposer API overload instead"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - static_cast(nullptr), - static_cast(nullptr), - num_items, - k, - decomposer, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); } //! @rst @@ -1684,19 +1690,20 @@ struct DeviceTopK { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - static_cast(nullptr), - static_cast(nullptr), - num_items, - k, - detail::identity_decomposer_t{}, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); } //! @rst @@ -1908,19 +1915,20 @@ struct DeviceTopK "Custom decomposers are not supported for fundamental types; " "use the non-decomposer API overload instead"); - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { - return detail::dispatch_topk( - storage, - bytes, - d_keys_in, - d_keys_out, - static_cast(nullptr), - static_cast(nullptr), - num_items, - k, - decomposer, - env); - }); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); } }; From 776655d635d09f80b54bf599298f613e52c5de91 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 13 May 2026 23:19:03 -0700 Subject: [PATCH 06/11] add tests for key-value pairing --- cub/test/catch2_test_device_topk_env.cu | 26 +++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu index 3b9fd3b31bd..8cf410d716e 100644 --- a/cub/test/catch2_test_device_topk_env.cu +++ b/cub/test/catch2_test_device_topk_env.cu @@ -276,7 +276,18 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", == cub::DeviceTopK::MaxPairs( d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); - thrust::host_vector h_keys_out = d_keys_out; + thrust::host_vector h_keys_out = d_keys_out; + thrust::host_vector h_values_out = d_values_out; + + // Verify pair association: each returned value indexes back to the corresponding key + // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) + for (size_t i = 0; i < h_keys_out.size(); ++i) + { + REQUIRE(h_values_out[i] >= 0); + REQUIRE(h_values_out[i] < num_items); + REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]); + } + std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater{}); auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true); @@ -305,7 +316,18 @@ C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env] == cub::DeviceTopK::MinPairs( d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); - thrust::host_vector h_keys_out = d_keys_out; + thrust::host_vector h_keys_out = d_keys_out; + thrust::host_vector h_values_out = d_values_out; + + // Verify pair association: each returned value indexes back to the corresponding key + // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) + for (size_t i = 0; i < h_keys_out.size(); ++i) + { + REQUIRE(h_values_out[i] >= 0); + REQUIRE(h_values_out[i] < num_items); + REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]); + } + std::sort(h_keys_out.begin(), h_keys_out.end()); auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ false); From b0a851953a4f087fb3eba9d4f80a315c7d6c99a6 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 14 May 2026 17:06:03 -0700 Subject: [PATCH 07/11] match docs --- cub/cub/device/device_topk.cuh | 123 ++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 41 deletions(-) diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh index 39476b584af..7cd59efea17 100644 --- a/cub/cub/device/device_topk.cuh +++ b/cub/cub/device/device_topk.cuh @@ -600,25 +600,28 @@ struct DeviceTopK //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] d_values_in - //! Pointer to the input values + //! Random-access iterator to the input sequence containing the values associated to each key //! //! @param[out] d_values_out - //! Pointer to the K output values + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each //! //! @param[in] k - //! The K value + //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. //! //! @param[in] decomposer - //! Decomposer object for interpreting user-defined key types + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. //! //! @param[in] env //! @rst @@ -773,6 +776,10 @@ struct DeviceTopK //! @rst //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -817,22 +824,24 @@ struct DeviceTopK //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] d_values_in - //! Pointer to the input values + //! Random-access iterator to the input sequence containing the values associated to each key //! //! @param[out] d_values_out - //! Pointer to the K output values + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each //! //! @param[in] k - //! The K value + //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. //! //! @param[in] env //! @rst @@ -1018,6 +1027,10 @@ struct DeviceTopK //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs, //! using a decomposer to interpret user-defined key types. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -1059,31 +1072,35 @@ struct DeviceTopK //! The integral type of variable k //! //! @tparam DecomposerT - //! **[inferred]** Type of decomposer + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. //! //! @tparam EnvT //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] d_values_in - //! Pointer to the input values + //! Random-access iterator to the input sequence containing the values associated to each key //! //! @param[out] d_values_out - //! Pointer to the K output values + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each //! //! @param[in] k - //! The K value + //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. //! //! @param[in] decomposer - //! Decomposer object for interpreting user-defined key types + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. //! //! @param[in] env //! @rst @@ -1221,6 +1238,10 @@ struct DeviceTopK //! @rst //! Finds the largest K keys from an unordered input sequence. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -1259,16 +1280,17 @@ struct DeviceTopK //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` //! //! @param[in] k - //! The K value + //! The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. //! //! @param[in] env //! @rst @@ -1429,6 +1451,10 @@ struct DeviceTopK //! Finds the largest K keys from an unordered input sequence, //! using a decomposer to interpret user-defined key types. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -1464,25 +1490,28 @@ struct DeviceTopK //! The integral type of variable k //! //! @tparam DecomposerT - //! **[inferred]** Type of decomposer + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. //! //! @tparam EnvT //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` //! //! @param[in] k - //! The K value + //! The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. //! //! @param[in] decomposer - //! Decomposer object for interpreting user-defined key types + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. //! //! @param[in] env //! @rst @@ -1625,6 +1654,10 @@ struct DeviceTopK //! @rst //! Finds the smallest K keys from an unordered input sequence. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -1663,16 +1696,17 @@ struct DeviceTopK //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` //! //! @param[in] k - //! The K value + //! The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. //! //! @param[in] env //! @rst @@ -1833,6 +1867,10 @@ struct DeviceTopK //! Finds the smallest K keys from an unordered input sequence, //! using a decomposer to interpret user-defined key types. //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! //! .. versionadded:: 3.5.0 //! First appears in CUDA Toolkit 13.5. //! @@ -1868,25 +1906,28 @@ struct DeviceTopK //! The integral type of variable k //! //! @tparam DecomposerT - //! **[inferred]** Type of decomposer + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. //! //! @tparam EnvT //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in - //! Pointer to the input keys + //! Random-access iterator to the input sequence containing the keys //! //! @param[out] d_keys_out - //! Pointer to the K output keys + //! Random-access iterator to the output sequence of keys, where K values will be written to //! //! @param[in] num_items - //! Number of input items + //! Number of items to be read and processed from `d_keys_in` //! //! @param[in] k - //! The K value + //! The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. //! //! @param[in] decomposer - //! Decomposer object for interpreting user-defined key types + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. //! //! @param[in] env //! @rst From e3aeaf0f1b576100a51e7063c27bba6d1af79928 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 18 May 2026 17:52:53 -0700 Subject: [PATCH 08/11] move enable_ifs from return into the template head --- cub/cub/device/device_topk.cuh | 132 +++++++++++++++++---------------- 1 file changed, 68 insertions(+), 64 deletions(-) diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh index 7cd59efea17..6cefad73a27 100644 --- a/cub/cub/device/device_topk.cuh +++ b/cub/cub/device/device_topk.cuh @@ -627,25 +627,26 @@ struct DeviceTopK //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template > - [[nodiscard]] CUB_RUNTIME_FUNCTION static // + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t, DecomposerT>, - cudaError_t> - MaxPairs(KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, - NumItemsT num_items, - NumOutItemsT k, - DecomposerT decomposer, - EnvT env = {}) + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); using key_t = detail::it_value_t; @@ -1106,25 +1107,26 @@ struct DeviceTopK //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template > - [[nodiscard]] CUB_RUNTIME_FUNCTION static // + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t, DecomposerT>, - cudaError_t> - MinPairs(KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, - NumItemsT num_items, - NumOutItemsT k, - DecomposerT decomposer, - EnvT env = {}) + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); using key_t = detail::it_value_t; @@ -1517,21 +1519,22 @@ struct DeviceTopK //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template > - [[nodiscard]] CUB_RUNTIME_FUNCTION static // + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t, DecomposerT>, - cudaError_t> - MaxKeys(KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - NumItemsT num_items, - NumOutItemsT k, - DecomposerT decomposer, - EnvT env = {}) + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); using key_t = detail::it_value_t; @@ -1933,21 +1936,22 @@ struct DeviceTopK //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template > - [[nodiscard]] CUB_RUNTIME_FUNCTION static // + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t, DecomposerT>, - cudaError_t> - MinKeys(KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - NumItemsT num_items, - NumOutItemsT k, - DecomposerT decomposer, - EnvT env = {}) + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); using key_t = detail::it_value_t; From 1ddd14a99f2298cf8b79373d62a13f249d27fa60 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 18 May 2026 17:54:06 -0700 Subject: [PATCH 09/11] move result vectors into visible block --- cub/test/catch2_test_device_topk_env_api.cu | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu index 6d5964a5f2b..d1b67a7db4f 100644 --- a/cub/test/catch2_test_device_topk_env_api.cu +++ b/cub/test/catch2_test_device_topk_env_api.cu @@ -60,6 +60,7 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]") { std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; } + thrust::device_vector expected{9, 8, 7}; // possibly in different order // example-end topk-max-keys-env stream.sync(); @@ -67,7 +68,6 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]") // Result order is unspecified for TopK; sort before compare. thrust::sort(d_out.begin(), d_out.end(), cuda::std::greater{}); - thrust::device_vector expected{9, 8, 7}; REQUIRE(d_out == expected); } @@ -89,13 +89,13 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]") { std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; } + thrust::device_vector expected{0, 1, 2}; // possibly in different order // example-end topk-min-keys-env stream.sync(); REQUIRE(error == cudaSuccess); thrust::sort(d_out.begin(), d_out.end()); - thrust::device_vector expected{0, 1, 2}; REQUIRE(d_out == expected); } @@ -126,13 +126,13 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]" { std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; } + thrust::device_vector expected_keys{9, 8, 7}; // possibly in different order // example-end topk-max-pairs-env stream.sync(); REQUIRE(error == cudaSuccess); thrust::sort(d_keys_out.begin(), d_keys_out.end(), cuda::std::greater{}); - thrust::device_vector expected_keys{9, 8, 7}; REQUIRE(d_keys_out == expected_keys); } @@ -163,13 +163,13 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]" { std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; } + thrust::device_vector expected_keys{0, 1, 2}; // possibly in different order // example-end topk-min-pairs-env stream.sync(); REQUIRE(error == cudaSuccess); thrust::sort(d_keys_out.begin(), d_keys_out.end()); - thrust::device_vector expected_keys{0, 1, 2}; REQUIRE(d_keys_out == expected_keys); } @@ -194,6 +194,7 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref" { std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; } + thrust::host_vector expected_ranks{9, 8, 7}; // possibly in different order // example-end topk-max-keys-decomposer-env stream.sync(); @@ -203,9 +204,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref" std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { return a.rank > b.rank; }); - REQUIRE(h_out[0].rank == 9); - REQUIRE(h_out[1].rank == 8); - REQUIRE(h_out[2].rank == 7); + thrust::host_vector actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); } C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref", "[topk][env]") @@ -229,6 +229,7 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref" { std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; } + thrust::host_vector expected_ranks{0, 1, 2}; // possibly in different order // example-end topk-min-keys-decomposer-env stream.sync(); @@ -238,9 +239,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref" std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { return a.rank < b.rank; }); - REQUIRE(h_out[0].rank == 0); - REQUIRE(h_out[1].rank == 1); - REQUIRE(h_out[2].rank == 2); + thrust::host_vector actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); } C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") @@ -273,6 +273,7 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref { std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; } + thrust::host_vector expected_ranks{9, 8, 7}; // possibly in different order // example-end topk-max-pairs-decomposer-env stream.sync(); @@ -282,9 +283,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { return a.rank > b.rank; }); - REQUIRE(h_keys_out[0].rank == 9); - REQUIRE(h_keys_out[1].rank == 8); - REQUIRE(h_keys_out[2].rank == 7); + thrust::host_vector actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); } C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") @@ -317,6 +317,7 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref { std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; } + thrust::host_vector expected_ranks{0, 1, 2}; // possibly in different order // example-end topk-min-pairs-decomposer-env stream.sync(); @@ -326,7 +327,6 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { return a.rank < b.rank; }); - REQUIRE(h_keys_out[0].rank == 0); - REQUIRE(h_keys_out[1].rank == 1); - REQUIRE(h_keys_out[2].rank == 2); + thrust::host_vector actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); } From 9d192aa6d139a27ffd15f0444bcaa410c8826a30 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 18 May 2026 17:55:34 -0700 Subject: [PATCH 10/11] use c2h vectors in unit test, not thrust::vectors --- cub/test/catch2_test_device_topk_env.cu | 92 +++++++++++-------------- 1 file changed, 40 insertions(+), 52 deletions(-) diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu index 8cf410d716e..ef97714d796 100644 --- a/cub/test/catch2_test_device_topk_env.cu +++ b/cub/test/catch2_test_device_topk_env.cu @@ -11,7 +11,6 @@ struct stream_registry_factory_t; #include #include -#include #include #include @@ -189,13 +188,12 @@ C2H_TEST("DeviceTopK::MinPairs can be tuned", "[topk][device]", block_sizes) namespace { -template -thrust::host_vector sorted_top_k(const thrust::host_vector& h_in, int k, bool largest) +c2h::host_vector sorted_top_k(const c2h::host_vector& h_in, int k, bool largest) { auto sorted = h_in; if (largest) { - std::sort(sorted.begin(), sorted.end(), cuda::std::greater{}); + std::sort(sorted.begin(), sorted.end(), cuda::std::greater{}); } else { @@ -206,78 +204,70 @@ thrust::host_vector sorted_top_k(const thrust::host_vector& h_in, int k, b } } // namespace -using topk_element_types = c2h::type_list; - -C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]", topk_element_types) +C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]") { - using T = c2h::get<0, TestType>; - const int num_items = 256; - thrust::host_vector h_in(num_items); + c2h::host_vector h_in(num_items); for (int i = 0; i < num_items; ++i) { - h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); } - thrust::device_vector d_in = h_in; - thrust::device_vector d_out_k = thrust::device_vector(8); + c2h::device_vector d_in = h_in; + c2h::device_vector d_out_k = c2h::device_vector(8); auto env = topk_requirements(); REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); - thrust::host_vector h_out = d_out_k; - std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); + c2h::host_vector h_out = d_out_k; + std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); auto expected = sorted_top_k(h_in, 8, /*largest*/ true); REQUIRE(h_out == expected); } -C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]", topk_element_types) +C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]") { - using T = c2h::get<0, TestType>; - const int num_items = 256; - thrust::host_vector h_in(num_items); + c2h::host_vector h_in(num_items); for (int i = 0; i < num_items; ++i) { - h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); } - thrust::device_vector d_in = h_in; - thrust::device_vector d_out_k = thrust::device_vector(8); + c2h::device_vector d_in = h_in; + c2h::device_vector d_out_k = c2h::device_vector(8); auto env = topk_requirements(); REQUIRE(cudaSuccess == cub::DeviceTopK::MinKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); - thrust::host_vector h_out = d_out_k; + c2h::host_vector h_out = d_out_k; std::sort(h_out.begin(), h_out.end()); auto expected = sorted_top_k(h_in, 8, /*largest*/ false); REQUIRE(h_out == expected); } -C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", topk_element_types) +C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]") { - using KeyT = c2h::get<0, TestType>; - const int num_items = 256; - thrust::host_vector h_keys_in(num_items); - thrust::host_vector h_values_in(num_items); + c2h::host_vector h_keys_in(num_items); + c2h::host_vector h_values_in(num_items); for (int i = 0; i < num_items; ++i) { - h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); h_values_in[i] = i; } - thrust::device_vector d_keys_in = h_keys_in; - thrust::device_vector d_values_in = h_values_in; - thrust::device_vector d_keys_out = thrust::device_vector(8); - thrust::device_vector d_values_out = thrust::device_vector(8); + c2h::device_vector d_keys_in = h_keys_in; + c2h::device_vector d_values_in = h_values_in; + c2h::device_vector d_keys_out = c2h::device_vector(8); + c2h::device_vector d_values_out = c2h::device_vector(8); auto env = topk_requirements(); REQUIRE(cudaSuccess == cub::DeviceTopK::MaxPairs( d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); - thrust::host_vector h_keys_out = d_keys_out; - thrust::host_vector h_values_out = d_values_out; + c2h::host_vector h_keys_out = d_keys_out; + c2h::host_vector h_values_out = d_values_out; // Verify pair association: each returned value indexes back to the corresponding key // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) @@ -288,36 +278,34 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]", REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]); } - std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater{}); + std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater{}); auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true); REQUIRE(h_keys_out == expected); } -C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]", topk_element_types) +C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]") { - using KeyT = c2h::get<0, TestType>; - const int num_items = 256; - thrust::host_vector h_keys_in(num_items); - thrust::host_vector h_values_in(num_items); + c2h::host_vector h_keys_in(num_items); + c2h::host_vector h_values_in(num_items); for (int i = 0; i < num_items; ++i) { - h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); + h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); h_values_in[i] = i; } - thrust::device_vector d_keys_in = h_keys_in; - thrust::device_vector d_values_in = h_values_in; - thrust::device_vector d_keys_out = thrust::device_vector(8); - thrust::device_vector d_values_out = thrust::device_vector(8); + c2h::device_vector d_keys_in = h_keys_in; + c2h::device_vector d_values_in = h_values_in; + c2h::device_vector d_keys_out = c2h::device_vector(8); + c2h::device_vector d_values_out = c2h::device_vector(8); auto env = topk_requirements(); REQUIRE(cudaSuccess == cub::DeviceTopK::MinPairs( d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); - thrust::host_vector h_keys_out = d_keys_out; - thrust::host_vector h_values_out = d_values_out; + c2h::host_vector h_keys_out = d_keys_out; + c2h::host_vector h_values_out = d_values_out; // Verify pair association: each returned value indexes back to the corresponding key // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) @@ -336,15 +324,15 @@ C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env] C2H_TEST("DeviceTopK::MaxKeys env-alloc handles K equal to num_items", "[topk][env]") { - thrust::device_vector d_in{5, 2, 9, 1, 7}; - thrust::device_vector d_out(d_in.size()); + c2h::device_vector d_in{5, 2, 9, 1, 7}; + c2h::device_vector d_out(d_in.size()); auto env = topk_requirements(); REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), 5, 5, env)); - thrust::host_vector h_out = d_out; + c2h::host_vector h_out = d_out; std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); - thrust::host_vector expected{9, 7, 5, 2, 1}; + c2h::host_vector expected{9, 7, 5, 2, 1}; REQUIRE(h_out == expected); } From 5ec92826899652fc88f1f478a94eba7afbd24b5c Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 19 May 2026 13:56:28 -0700 Subject: [PATCH 11/11] final reviews --- cub/test/catch2_test_device_topk_env.cu | 45 ++++++++------------- cub/test/catch2_test_device_topk_env_api.cu | 24 +++++++---- 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu index ef97714d796..53aad4f7b30 100644 --- a/cub/test/catch2_test_device_topk_env.cu +++ b/cub/test/catch2_test_device_topk_env.cu @@ -21,6 +21,7 @@ struct stream_registry_factory_t; #include #include +#include #include "catch2_test_env_launch_helper.h" @@ -193,11 +194,11 @@ c2h::host_vector sorted_top_k(const c2h::host_vector& h_in, int k, boo auto sorted = h_in; if (largest) { - std::sort(sorted.begin(), sorted.end(), cuda::std::greater{}); + std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end(), cuda::std::greater{}); } else { - std::sort(sorted.begin(), sorted.end()); + std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end()); } sorted.resize(static_cast(k)); return sorted; @@ -207,12 +208,9 @@ c2h::host_vector sorted_top_k(const c2h::host_vector& h_in, int k, boo C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]") { const int num_items = 256; - c2h::host_vector h_in(num_items); - for (int i = 0; i < num_items; ++i) - { - h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); - } - c2h::device_vector d_in = h_in; + c2h::device_vector d_in(num_items); + c2h::gen(C2H_SEED(1), d_in); + c2h::host_vector h_in = d_in; c2h::device_vector d_out_k = c2h::device_vector(8); auto env = topk_requirements(); @@ -228,12 +226,9 @@ C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]") C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]") { const int num_items = 256; - c2h::host_vector h_in(num_items); - for (int i = 0; i < num_items; ++i) - { - h_in[i] = static_cast((i * 1664525 + 1013904223) % 251); - } - c2h::device_vector d_in = h_in; + c2h::device_vector d_in(num_items); + c2h::gen(C2H_SEED(1), d_in); + c2h::host_vector h_in = d_in; c2h::device_vector d_out_k = c2h::device_vector(8); auto env = topk_requirements(); @@ -249,14 +244,11 @@ C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]" C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]") { const int num_items = 256; - c2h::host_vector h_keys_in(num_items); + c2h::device_vector d_keys_in(num_items); + c2h::gen(C2H_SEED(1), d_keys_in); + c2h::host_vector h_keys_in = d_keys_in; c2h::host_vector h_values_in(num_items); - for (int i = 0; i < num_items; ++i) - { - h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); - h_values_in[i] = i; - } - c2h::device_vector d_keys_in = h_keys_in; + std::iota(h_values_in.begin(), h_values_in.end(), 0); c2h::device_vector d_values_in = h_values_in; c2h::device_vector d_keys_out = c2h::device_vector(8); c2h::device_vector d_values_out = c2h::device_vector(8); @@ -287,14 +279,11 @@ C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]") C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]") { const int num_items = 256; - c2h::host_vector h_keys_in(num_items); + c2h::device_vector d_keys_in(num_items); + c2h::gen(C2H_SEED(1), d_keys_in); + c2h::host_vector h_keys_in = d_keys_in; c2h::host_vector h_values_in(num_items); - for (int i = 0; i < num_items; ++i) - { - h_keys_in[i] = static_cast((i * 1664525 + 1013904223) % 251); - h_values_in[i] = i; - } - c2h::device_vector d_keys_in = h_keys_in; + std::iota(h_values_in.begin(), h_values_in.end(), 0); c2h::device_vector d_values_in = h_values_in; c2h::device_vector d_keys_out = c2h::device_vector(8); c2h::device_vector d_values_out = c2h::device_vector(8); diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu index d1b67a7db4f..39d163d3f88 100644 --- a/cub/test/catch2_test_device_topk_env_api.cu +++ b/cub/test/catch2_test_device_topk_env_api.cu @@ -52,7 +52,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]") cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); @@ -81,7 +82,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]") cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MinKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); @@ -111,7 +113,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]" cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MaxPairs( @@ -148,7 +151,8 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]" cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MinPairs( @@ -185,7 +189,8 @@ C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref" cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MaxKeys( @@ -220,7 +225,8 @@ C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref" cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MinKeys( @@ -257,7 +263,8 @@ C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MaxPairs( @@ -301,7 +308,8 @@ C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref cuda::stream stream{cuda::devices[0]}; cuda::stream_ref stream_ref{stream}; auto env = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted), + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), stream_ref}; auto error = cub::DeviceTopK::MinPairs(