diff --git a/cub/cub/device/device_topk.cuh b/cub/cub/device/device_topk.cuh index 0567b76149d..6cefad73a27 100644 --- a/cub/cub/device/device_topk.cuh +++ b/cub/cub/device/device_topk.cuh @@ -18,6 +18,7 @@ #endif // no system header #include +#include #include #include @@ -293,6 +294,116 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-pairs-env + //! :end-before: example-end topk-max-pairs-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -433,6 +544,124 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys and their corresponding values from an unordered input sequence of key-value pairs, + //! using a decomposer to interpret user-defined key types. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-pairs-decomposer-env + //! :end-before: example-end topk-max-pairs-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] decomposer + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, DecomposerT>, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxPairs"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -545,6 +774,116 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-pairs-env + //! :end-before: example-end topk-min-pairs-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -677,39 +1016,253 @@ struct DeviceTopK temp_storage_bytes, d_keys_in, d_keys_out, - d_values_in, - d_values_out, + d_values_in, + d_values_out, + num_items, + k, + decomposer, + ::cuda::std::move(env)); + } + + //! @rst + //! Finds the smallest K keys and their corresponding values from an unordered input sequence of key-value pairs, + //! using a decomposer to interpret user-defined key types. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-pairs-decomposer-env + //! :end-before: example-end topk-min-pairs-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access input iterator type for writing output values @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] d_values_in + //! Random-access iterator to the input sequence containing the values associated to each key + //! + //! @param[out] d_values_out + //! Random-access iterator to the output sequence of values, corresponding to the top k keys, where k values will be + //! written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` and `d_values_in` each + //! + //! @param[in] k + //! The value of K, which is the number of lowest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] decomposer + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename ValueInputIteratorT, + typename ValueOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, DecomposerT>, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinPairs( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinPairs"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, k, decomposer, env); + }); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Finds the largest K keys from an unordered input sequence of keys. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! - @devicestorage + //! + //! .. versionadded:: 3.3.0 + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use the `cub::DeviceTopK::MinKeys` function to find the largest K + //! items: + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-keys-non-deterministic-unsorted + //! :end-before: example-end topk-max-keys-non-deterministic-unsorted + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` + //! + //! @param[in] k + //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MaxKeys"); + + return detail::dispatch_topk( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), num_items, k, - decomposer, + detail::identity_decomposer_t{}, ::cuda::std::move(env)); } //! @rst - //! Overview - //! +++++++++++++++++++++++++++++++++++++++++++++ - //! - //! Finds the largest K keys from an unordered input sequence of keys. + //! Finds the largest K keys from an unordered input sequence. //! //! .. note:: //! //! The behavior is undefined if the input and output ranges overlap in any way. //! - //! - @devicestorage + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. //! - //! .. versionadded:: 3.3.0 + //! This is an environment-based API that allows customization of: //! - //! A Simple Example - //! +++++++++++++++++++++++++++++++++++++++++++++ + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` //! - //! The following code snippet demonstrates how to use the `cub::DeviceTopK::MinKeys` function to find the largest K - //! items: + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. //! - //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_api.cu + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu //! :language: c++ //! :dedent: - //! :start-after: example-begin topk-max-keys-non-deterministic-unsorted - //! :end-before: example-end topk-max-keys-non-deterministic-unsorted + //! :start-after: example-begin topk-max-keys-env + //! :end-before: example-end topk-max-keys-env //! //! @endrst //! @@ -725,12 +1278,8 @@ struct DeviceTopK //! @tparam NumOutItemsT //! The integral type of variable k //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work is done. - //! - //! @param[in,out] temp_storage_bytes - //! Reference to size in bytes of `d_temp_storage` allocation + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. //! //! @param[in] d_keys_in //! Random-access iterator to the input sequence containing the keys @@ -742,12 +1291,12 @@ struct DeviceTopK //! Number of items to be read and processed from `d_keys_in` //! //! @param[in] k - //! The value of K, which is the number of largest pairs to find from `num_items` pairs. Capped to a maximum of + //! The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of //! `num_items`. //! //! @param[in] env //! @rst - //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst template < typename KeyInputIteratorT, @@ -756,28 +1305,25 @@ struct DeviceTopK typename NumOutItemsT, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t, EnvT>, int> = 0> - CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - NumItemsT num_items, - NumOutItemsT k, - EnvT env = {}) + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( + KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {}) { - _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceTopK::MaxKeys"); + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); - return detail::dispatch_topk( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - static_cast(nullptr), - static_cast(nullptr), - num_items, - k, - detail::identity_decomposer_t{}, - ::cuda::std::move(env)); + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); } //! @rst @@ -903,6 +1449,116 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the largest K keys from an unordered input sequence, + //! using a decomposer to interpret user-defined key types. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-max-keys-decomposer-env + //! :end-before: example-end topk-max-keys-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` + //! + //! @param[in] k + //! The value of K, which is the number of largest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] decomposer + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, DecomposerT>, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MaxKeys( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MaxKeys"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -998,6 +1654,95 @@ struct DeviceTopK ::cuda::std::move(env)); } + //! @rst + //! Finds the smallest K keys from an unordered input sequence. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-keys-env + //! :end-before: example-end topk-min-keys-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` + //! + //! @param[in] k + //! The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, EnvT>, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys( + KeyInputIteratorT d_keys_in, KeyOutputIteratorT d_keys_out, NumItemsT num_items, NumOutItemsT k, EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + detail::identity_decomposer_t{}, + env); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -1120,6 +1865,116 @@ struct DeviceTopK decomposer, ::cuda::std::move(env)); } + + //! @rst + //! Finds the smallest K keys from an unordered input sequence, + //! using a decomposer to interpret user-defined key types. + //! + //! .. note:: + //! + //! The behavior is undefined if the input and output ranges overlap in any way. + //! + //! .. versionadded:: 3.5.0 + //! First appears in CUDA Toolkit 13.5. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Unlike the temp-storage overload, this overload allocates and manages the required temporary + //! storage internally using the memory resource queried from the environment. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_topk_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin topk-min-keys-decomposer-env + //! :end-before: example-end topk-min-keys-decomposer-env + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output keys @iterator + //! + //! @tparam NumItemsT + //! The integral type of variable num_items + //! + //! @tparam NumOutItemsT + //! The integral type of variable k + //! + //! @tparam DecomposerT + //! **[inferred]** Type of a callable object responsible for decomposing a key into a tuple of references to its + //! constituent arithmetic types. + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! + //! @param[in] d_keys_in + //! Random-access iterator to the input sequence containing the keys + //! + //! @param[out] d_keys_out + //! Random-access iterator to the output sequence of keys, where K values will be written to + //! + //! @param[in] num_items + //! Number of items to be read and processed from `d_keys_in` + //! + //! @param[in] k + //! The value of K, which is the number of lowest keys to find from `num_items` keys. Capped to a maximum of + //! `num_items`. + //! + //! @param[in] decomposer + //! Callable object responsible for decomposing a key into a tuple of references to its constituent arithmetic + //! types. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template < + typename KeyInputIteratorT, + typename KeyOutputIteratorT, + typename NumItemsT, + typename NumOutItemsT, + typename DecomposerT, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t, DecomposerT>, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MinKeys( + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + NumItemsT num_items, + NumOutItemsT k, + DecomposerT decomposer, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTopK::MinKeys"); + using key_t = detail::it_value_t; + + static_assert(!detail::radix::can_twiddle, + "Custom decomposers are not supported for fundamental types; " + "use the non-decomposer API overload instead"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, [[maybe_unused]] auto stream) { + return detail::dispatch_topk( + storage, + bytes, + d_keys_in, + d_keys_out, + static_cast(nullptr), + static_cast(nullptr), + num_items, + k, + decomposer, + env); + }); + } }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_topk_env.cu b/cub/test/catch2_test_device_topk_env.cu index 296eb603908..53aad4f7b30 100644 --- a/cub/test/catch2_test_device_topk_env.cu +++ b/cub/test/catch2_test_device_topk_env.cu @@ -20,6 +20,9 @@ struct stream_registry_factory_t; #include #include +#include +#include + #include "catch2_test_env_launch_helper.h" // %PARAM% TEST_LAUNCH lid 0:2 @@ -184,4 +187,142 @@ C2H_TEST("DeviceTopK::MinPairs can be tuned", "[topk][device]", block_sizes) REQUIRE(d_block_size[0] == target_block_size); } +namespace +{ +c2h::host_vector sorted_top_k(const c2h::host_vector& h_in, int k, bool largest) +{ + auto sorted = h_in; + if (largest) + { + std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end(), cuda::std::greater{}); + } + else + { + std::partial_sort(sorted.begin(), sorted.begin() + k, sorted.end()); + } + sorted.resize(static_cast(k)); + return sorted; +} +} // namespace + +C2H_TEST("DeviceTopK::MaxKeys env-alloc returns correct top K", "[topk][env]") +{ + const int num_items = 256; + c2h::device_vector d_in(num_items); + c2h::gen(C2H_SEED(1), d_in); + c2h::host_vector h_in = d_in; + c2h::device_vector d_out_k = c2h::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); + + c2h::host_vector h_out = d_out_k; + std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); + + auto expected = sorted_top_k(h_in, 8, /*largest*/ true); + REQUIRE(h_out == expected); +} + +C2H_TEST("DeviceTopK::MinKeys env-alloc returns correct bottom K", "[topk][env]") +{ + const int num_items = 256; + c2h::device_vector d_in(num_items); + c2h::gen(C2H_SEED(1), d_in); + c2h::host_vector h_in = d_in; + c2h::device_vector d_out_k = c2h::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MinKeys(d_in.begin(), d_out_k.begin(), num_items, 8, env)); + + c2h::host_vector h_out = d_out_k; + std::sort(h_out.begin(), h_out.end()); + + auto expected = sorted_top_k(h_in, 8, /*largest*/ false); + REQUIRE(h_out == expected); +} + +C2H_TEST("DeviceTopK::MaxPairs env-alloc returns correct top K", "[topk][env]") +{ + const int num_items = 256; + c2h::device_vector d_keys_in(num_items); + c2h::gen(C2H_SEED(1), d_keys_in); + c2h::host_vector h_keys_in = d_keys_in; + c2h::host_vector h_values_in(num_items); + std::iota(h_values_in.begin(), h_values_in.end(), 0); + c2h::device_vector d_values_in = h_values_in; + c2h::device_vector d_keys_out = c2h::device_vector(8); + c2h::device_vector d_values_out = c2h::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); + + c2h::host_vector h_keys_out = d_keys_out; + c2h::host_vector h_values_out = d_values_out; + + // Verify pair association: each returned value indexes back to the corresponding key + // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) + for (size_t i = 0; i < h_keys_out.size(); ++i) + { + REQUIRE(h_values_out[i] >= 0); + REQUIRE(h_values_out[i] < num_items); + REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]); + } + + std::sort(h_keys_out.begin(), h_keys_out.end(), cuda::std::greater{}); + + auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ true); + REQUIRE(h_keys_out == expected); +} + +C2H_TEST("DeviceTopK::MinPairs env-alloc returns correct bottom K", "[topk][env]") +{ + const int num_items = 256; + c2h::device_vector d_keys_in(num_items); + c2h::gen(C2H_SEED(1), d_keys_in); + c2h::host_vector h_keys_in = d_keys_in; + c2h::host_vector h_values_in(num_items); + std::iota(h_values_in.begin(), h_values_in.end(), 0); + c2h::device_vector d_values_in = h_values_in; + c2h::device_vector d_keys_out = c2h::device_vector(8); + c2h::device_vector d_values_out = c2h::device_vector(8); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess + == cub::DeviceTopK::MinPairs( + d_keys_in.begin(), d_keys_out.begin(), d_values_in.begin(), d_values_out.begin(), num_items, 8, env)); + + c2h::host_vector h_keys_out = d_keys_out; + c2h::host_vector h_values_out = d_values_out; + + // Verify pair association: each returned value indexes back to the corresponding key + // (recall h_values_in[i] = i, so value-out is the original input position of the key-out) + for (size_t i = 0; i < h_keys_out.size(); ++i) + { + REQUIRE(h_values_out[i] >= 0); + REQUIRE(h_values_out[i] < num_items); + REQUIRE(h_keys_out[i] == h_keys_in[h_values_out[i]]); + } + + std::sort(h_keys_out.begin(), h_keys_out.end()); + + auto expected = sorted_top_k(h_keys_in, 8, /*largest*/ false); + REQUIRE(h_keys_out == expected); +} + +C2H_TEST("DeviceTopK::MaxKeys env-alloc handles K equal to num_items", "[topk][env]") +{ + c2h::device_vector d_in{5, 2, 9, 1, 7}; + c2h::device_vector d_out(d_in.size()); + + auto env = topk_requirements(); + REQUIRE(cudaSuccess == cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), 5, 5, env)); + + c2h::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), cuda::std::greater{}); + c2h::host_vector expected{9, 7, 5, 2, 1}; + REQUIRE(h_out == expected); +} + #endif // TEST_LAUNCH != 1 diff --git a/cub/test/catch2_test_device_topk_env_api.cu b/cub/test/catch2_test_device_topk_env_api.cu new file mode 100644 index 00000000000..39d163d3f88 --- /dev/null +++ b/cub/test/catch2_test_device_topk_env_api.cu @@ -0,0 +1,340 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// Simple user-defined key type for the decomposer-based examples. +struct topk_custom_t +{ + int rank; + int payload; +}; + +struct topk_custom_decomposer_t +{ + __host__ __device__ ::cuda::std::tuple operator()(topk_custom_t& key) const + { + return {key.rank}; + } + __host__ __device__ ::cuda::std::tuple operator()(const topk_custom_t& key) const + { + return {key.rank}; + } +}; + +C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-keys-env + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; + } + thrust::device_vector expected{9, 8, 7}; // possibly in different order + // example-end topk-max-keys-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + // Result order is unspecified for TopK; sort before compare. + thrust::sort(d_out.begin(), d_out.end(), cuda::std::greater{}); + REQUIRE(d_out == expected); +} + +C2H_TEST("cub::DeviceTopK::MinKeys env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-keys-env + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinKeys(d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; + } + thrust::device_vector expected{0, 1, 2}; // possibly in different order + // example-end topk-min-keys-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_out.begin(), d_out.end()); + REQUIRE(d_out == expected); +} + +C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-pairs-env + auto d_keys_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_values_in = thrust::device_vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto d_keys_out = thrust::device_vector(3); + auto d_values_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; + } + thrust::device_vector expected_keys{9, 8, 7}; // possibly in different order + // example-end topk-max-pairs-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_keys_out.begin(), d_keys_out.end(), cuda::std::greater{}); + REQUIRE(d_keys_out == expected_keys); +} + +C2H_TEST("cub::DeviceTopK::MinPairs env-alloc accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-pairs-env + auto d_keys_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9, 1, 4, 2}; + auto d_values_in = thrust::device_vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto d_keys_out = thrust::device_vector(3); + auto d_values_out = thrust::device_vector(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; + } + thrust::device_vector expected_keys{0, 1, 2}; // possibly in different order + // example-end topk-min-pairs-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::sort(d_keys_out.begin(), d_keys_out.end()); + REQUIRE(d_keys_out == expected_keys); +} + +C2H_TEST("cub::DeviceTopK::MaxKeys env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-keys-decomposer-env + thrust::host_vector h_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_in = h_in; + thrust::device_vector d_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxKeys( + d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, topk_custom_decomposer_t{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxKeys failed with status: " << error << '\n'; + } + thrust::host_vector expected_ranks{9, 8, 7}; // possibly in different order + // example-end topk-max-keys-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank > b.rank; + }); + thrust::host_vector actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); +} + +C2H_TEST("cub::DeviceTopK::MinKeys env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-keys-decomposer-env + thrust::host_vector h_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_in = h_in; + thrust::device_vector d_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinKeys( + d_in.begin(), d_out.begin(), static_cast(d_in.size()), k, topk_custom_decomposer_t{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinKeys failed with status: " << error << '\n'; + } + thrust::host_vector expected_ranks{0, 1, 2}; // possibly in different order + // example-end topk-min-keys-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_out = d_out; + std::sort(h_out.begin(), h_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank < b.rank; + }); + thrust::host_vector actual_ranks{h_out[0].rank, h_out[1].rank, h_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); +} + +C2H_TEST("cub::DeviceTopK::MaxPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-max-pairs-decomposer-env + thrust::host_vector h_keys_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + thrust::device_vector d_keys_out(3); + thrust::device_vector d_values_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MaxPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + topk_custom_decomposer_t{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MaxPairs failed with status: " << error << '\n'; + } + thrust::host_vector expected_ranks{9, 8, 7}; // possibly in different order + // example-end topk-max-pairs-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank > b.rank; + }); + thrust::host_vector actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); +} + +C2H_TEST("cub::DeviceTopK::MinPairs env-alloc with decomposer accepts stream_ref", "[topk][env]") +{ + // example-begin topk-min-pairs-decomposer-env + thrust::host_vector h_keys_in{ + {8, 0}, {6, 1}, {7, 2}, {5, 3}, {3, 4}, {0, 5}, {9, 6}, {1, 7}, {4, 8}, {2, 9}}; + thrust::device_vector d_keys_in = h_keys_in; + thrust::device_vector d_values_in{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + thrust::device_vector d_keys_out(3); + thrust::device_vector d_values_out(3); + int k = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{ + cuda::execution::require(cuda::execution::determinism::not_guaranteed, // + cuda::execution::output_ordering::unsorted), + stream_ref}; + + auto error = cub::DeviceTopK::MinPairs( + d_keys_in.begin(), + d_keys_out.begin(), + d_values_in.begin(), + d_values_out.begin(), + static_cast(d_keys_in.size()), + k, + topk_custom_decomposer_t{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceTopK::MinPairs failed with status: " << error << '\n'; + } + thrust::host_vector expected_ranks{0, 1, 2}; // possibly in different order + // example-end topk-min-pairs-decomposer-env + + stream.sync(); + REQUIRE(error == cudaSuccess); + + thrust::host_vector h_keys_out = d_keys_out; + std::sort(h_keys_out.begin(), h_keys_out.end(), [](const topk_custom_t& a, const topk_custom_t& b) { + return a.rank < b.rank; + }); + thrust::host_vector actual_ranks{h_keys_out[0].rank, h_keys_out[1].rank, h_keys_out[2].rank}; + REQUIRE(actual_ranks == expected_ranks); +}