Skip to content
98 changes: 41 additions & 57 deletions cub/cub/device/device_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <cuda/__functional/call_or.h>
#include <cuda/__stream/get_stream.h>
#include <cuda/std/__execution/env.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/cstdint>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -155,19 +157,10 @@ struct DeviceAdjacentDifference
//! @endrst
//!
//! @tparam InputIteratorT
//! @rst
//! is a model of `Input Iterator <https://en.cppreference.com/w/cpp/iterator/input_iterator>`_,
//! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then
//! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to
//! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type
//! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of
//! ``value_types``.
//! @endrst
//! **[inferred]** Random-access input iterator type for reading input elements @iterator
//!
//! @tparam OutputIteratorT
//! @rst
//! is a model of `Output Iterator <https://en.cppreference.com/w/cpp/iterator/output_iterator>`_.
//! @endrst
//! **[inferred]** Random-access output iterator type for writing output elements @iterator
//!
//! @tparam DifferenceOpT
//! Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`.
Expand Down Expand Up @@ -277,13 +270,7 @@ struct DeviceAdjacentDifference
//! @endrst
//!
//! @tparam RandomAccessIteratorT
//! @rst
//! is a model of `Random Access Iterator <https://en.cppreference.com/w/cpp/iterator/random_access_iterator>`_,
//! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of
//! ``RandomAccessIteratorT``'s ``value_type``, and ``x - y`` is defined, then the
//! return type of ``x - y`` should be convertible to a type in
//! ``RandomAccessIteratorT``'s set of ``value_types``.
//! @endrst
//! **[inferred]** Random-access iterator type for reading and writing elements @iterator
//!
//! @tparam DifferenceOpT
//! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
Expand Down Expand Up @@ -394,22 +381,13 @@ struct DeviceAdjacentDifference
//! @endrst
//!
//! @tparam InputIteratorT
//! @rst
//! is a model of `Input Iterator <https://en.cppreference.com/w/cpp/iterator/input_iterator>`_,
//! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then
//! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to
//! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type
//! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of
//! ``value_types``.
//! @endrst
//! **[inferred]** Random-access input iterator type for reading input elements @iterator
//!
//! @tparam OutputIteratorT
//! @rst
//! is a model of `Output Iterator <https://en.cppreference.com/w/cpp/iterator/output_iterator>`_.
//! @endrst
//! **[inferred]** Random-access output iterator type for writing output elements @iterator
//!
//! @tparam DifferenceOpT
//! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
//! Its `result_type` is convertible to a type in `OutputIteratorT`'s
//! set of `value_types`.
//!
//! @tparam NumItemsT
Expand Down Expand Up @@ -507,13 +485,7 @@ struct DeviceAdjacentDifference
//! @endrst
//!
//! @tparam RandomAccessIteratorT
//! @rst
//! is a model of `Random Access Iterator <https://en.cppreference.com/w/cpp/iterator/random_access_iterator>`_,
//! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of
//! ``RandomAccessIteratorT``'s `value_type`, and ``x - y`` is defined, then the
//! return type of ``x - y`` should be convertible to a type in
//! ``RandomAccessIteratorT``'s set of ``value_types``.
//! @endrst
//! **[inferred]** Random-access iterator type for reading and writing elements @iterator
//!
//! @tparam DifferenceOpT
//! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
Expand Down Expand Up @@ -567,6 +539,7 @@ struct DeviceAdjacentDifference
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -626,12 +599,16 @@ struct DeviceAdjacentDifference
//! @endrst
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputIteratorT, void*>, int> = 0>
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractLeftCopy(
InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, EnvT env = {})
InputIteratorT d_input,
OutputIteratorT d_output,
NumItemsT num_items,
DifferenceOpT difference_op = {},
EnvT env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeftCopy");

Expand Down Expand Up @@ -661,6 +638,7 @@ struct DeviceAdjacentDifference
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -711,12 +689,12 @@ struct DeviceAdjacentDifference
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename RandomAccessIteratorT,
typename DifferenceOpT,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<!::cuda::std::is_same_v<RandomAccessIteratorT, void*>, int> = 0>
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_integral_v<EnvT>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
SubtractLeft(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, EnvT env = {})
SubtractLeft(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, EnvT env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeft");

Expand Down Expand Up @@ -747,6 +725,7 @@ struct DeviceAdjacentDifference
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -808,12 +787,16 @@ struct DeviceAdjacentDifference
//! @endrst
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputIteratorT, void*>, int> = 0>
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractRightCopy(
InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, EnvT env = {})
InputIteratorT d_input,
OutputIteratorT d_output,
NumItemsT num_items,
DifferenceOpT difference_op = {},
EnvT env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractRightCopy");

Expand Down Expand Up @@ -843,6 +826,7 @@ struct DeviceAdjacentDifference
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -893,12 +877,12 @@ struct DeviceAdjacentDifference
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename RandomAccessIteratorT,
typename DifferenceOpT,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<!::cuda::std::is_same_v<RandomAccessIteratorT, void*>, int> = 0>
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_integral_v<EnvT>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
SubtractRight(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, EnvT env = {})
SubtractRight(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, EnvT env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractRight");

Expand Down
12 changes: 4 additions & 8 deletions cub/cub/device/device_copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,6 @@ struct DeviceCopy
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! .. note::
//!
//! If any input range aliases any output range the behavior is undefined.
Expand Down Expand Up @@ -360,17 +358,15 @@ struct DeviceCopy
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but
//! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout
//! transformations.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but
//! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout
//! transformations.
//!
//! Preconditions
//! +++++++++++++
//!
Expand Down
29 changes: 15 additions & 14 deletions cub/cub/device/device_find.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,10 @@ struct DeviceFind
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! For each ``value`` in ``[d_values, d_values + values_num_items)``, performs a binary search in the range
//! ``[d_range, d_range + range_num_items)``, using ``comp`` as the comparator to find the iterator to the element
//! of said range which **is not** ordered **before** ``value``.
//! ``[d_range, d_range + range_num_items)``, using ``comp`` as the comparator to find the iterator to the
//! **first** element of said range which **is not** ordered **before** ``value``.
//!
//! - The range ``[first, last)`` must be sorted consistently with ``comp``.
//! - The range ``[d_range, d_range + range_num_items)`` must be sorted consistently with ``comp``.
//!
//! .. versionadded:: 3.3.0
//!
Expand Down Expand Up @@ -236,8 +236,8 @@ struct DeviceFind
//!
//! For each ``value`` in ``[d_values, d_values + values_num_items)``, performs a binary search in the range
//! ``[d_range, d_range + range_num_items)``,
//! using ``comp`` as the comparator to find the iterator to the element of said range which **is** ordered
//! **after** ``value``.
//! using ``comp`` as the comparator to find the iterator to the **first** element of said range which **is**
//! ordered **after** ``value``.
//!
//! - The range ``[d_range, d_range + range_num_items)`` must be sorted consistently with ``comp``.
//!
Expand Down Expand Up @@ -352,6 +352,11 @@ struct DeviceFind
//! @rst
//! Finds the first element in the input sequence that satisfies the given predicate.
//!
//! - The search terminates at the first element where the predicate evaluates to true.
//! - The index of the found element is written to ``d_out``.
//! - If no element satisfies the predicate, ``num_items`` is written to ``d_out``.
//! - The range ``[d_out, d_out + 1)`` shall not overlap ``[d_in, d_in + num_items)`` in any way.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
Expand All @@ -360,11 +365,7 @@ struct DeviceFind
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - The search terminates at the first element where the predicate evaluates to true.
//! - The index of the found element is written to ``d_out``.
//! - If no element satisfies the predicate, ``num_items`` is written to ``d_out``.
//! - The range ``[d_out, d_out + 1)`` shall not overlap ``[d_in, d_in + num_items)`` in any way.
//!

//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
Expand Down Expand Up @@ -434,8 +435,8 @@ struct DeviceFind

//! @rst
//! For each ``value`` in ``[d_values, d_values + values_num_items)``, performs a binary search in the range
//! ``[d_range, d_range + range_num_items)``, using ``comp`` as the comparator to find the iterator to the element
//! of said range which **is not** ordered **before** ``value``.
//! ``[d_range, d_range + range_num_items)``, using ``comp`` as the comparator to find the iterator to the
//! **first** element of said range which **is not** ordered **before** ``value``.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
Expand Down Expand Up @@ -554,8 +555,8 @@ struct DeviceFind
//! @rst
//! For each ``value`` in ``[d_values, d_values + values_num_items)``, performs a binary search in the range
//! ``[d_range, d_range + range_num_items)``,
//! using ``comp`` as the comparator to find the iterator to the element of said range which **is** ordered
//! **after** ``value``.
//! using ``comp`` as the comparator to find the iterator to the **first** element of said range which **is**
//! ordered **after** ``value``.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
Expand Down
40 changes: 22 additions & 18 deletions cub/cub/device/device_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ public:
//! ``d_in`` into a partitioned sequence ``d_out``.
//! The total number of items copied into the first partition is written to ``d_num_selected_out``.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
Expand Down Expand Up @@ -306,11 +309,7 @@ public:
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
typename ::cuda::std::enable_if_t<
::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<InputIteratorT, void*>
&& !::cuda::std::is_same_v<FlagIterator, size_t&>,
int> = 0>
typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged(
InputIteratorT d_in,
FlagIterator d_flags,
Expand Down Expand Up @@ -493,6 +492,9 @@ public:
//! a partitioned sequence ``d_out``. The total number of items copied into the first partition is written
//! to ``d_num_selected_out``.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
Expand Down Expand Up @@ -553,15 +555,12 @@ public:
//!
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
typename ::cuda::std::
enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<InputIteratorT, void*>, int> = 0>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
If(InputIteratorT d_in,
OutputIteratorT d_out,
Expand Down Expand Up @@ -882,6 +881,9 @@ public:
//! @rst
//! Uses two functors to split the corresponding items from ``d_in`` into three partitioned sequences
//! ``d_first_part_out``, ``d_second_part_out``, and ``d_unselected_out``.
//! The total number of items copied into the first partition is written
//! to ``d_num_selected_out[0]``, while the total number of items copied into the second partition is written
//! to ``d_num_selected_out[1]``.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
Expand All @@ -896,9 +898,11 @@ public:
//! - Copies of the items selected by ``select_second_part_op`` are compacted
//! into ``d_second_part_out`` and maintain their original relative ordering.
//! - Copies of the unselected items are compacted into the ``d_unselected_out`` in reverse order.
//! - The total number of items copied into the first partition is written
//! to ``d_num_selected_out[0]``, while the total number of items copied into the second partition is written
//! to ``d_num_selected_out[1]``.
//! - The ranges ``[d_out, d_out + num_items)``,
//! ``[d_first_part_out, d_first_part_out + d_num_selected_out[0])``,
//! ``[d_second_part_out, d_second_part_out + d_num_selected_out[1])``,
//! ``[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])``,
//! shall not overlap in any way.
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -985,7 +989,7 @@ public:
typename SelectSecondPartOp,
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<!::cuda::std::is_arithmetic_v<FirstOutputIteratorT>, int> = 0>
::cuda::std::enable_if_t<!::cuda::std::is_same_v<FirstOutputIteratorT, size_t>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
If(InputIteratorT d_in,
FirstOutputIteratorT d_first_part_out,
Expand Down
Loading
Loading