Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 27 additions & 49 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,18 @@
# endif // TUNE_LOAD

template <typename InputT>
struct policy_hub_t
struct policy_selector
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
-> cub::detail::select::select_if_policy
{
using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_t>;
};

using MaxPolicy = policy_t;
return {TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_policy};
}
};
#endif // TUNE_BASE

Expand All @@ -70,31 +68,11 @@ void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_o
template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
Expand All @@ -106,9 +84,9 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
thrust::device_vector<offset_t> num_selected(1);
thrust::device_vector<T> out(elements);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = thrust::raw_pointer_cast(flags.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
const T* d_in = thrust::raw_pointer_cast(in.data());
const bool* d_flags = thrust::raw_pointer_cast(flags.data());
offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out);

Expand All @@ -118,25 +96,25 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
state.add_global_memory_writes<T>(elements);
state.add_global_memory_writes<offset_t>(1);

std::size_t temp_size{};
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, nullptr);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

caching_allocator_t alloc;
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage,
temp_size,
auto env = cub_bench_env(
alloc,
launch
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<T>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DevicePartition::Flagged,
"Flagged failed",
d_in,
d_flags,
d_out,
d_num_selected,
select_op_t{},
equality_op_t{},
elements,
launch.get_stream());
static_cast<offset_t>(elements),
env);
});
}

Expand Down
74 changes: 26 additions & 48 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,18 @@
# endif // TUNE_LOAD

template <typename InputT>
struct policy_hub_t
struct policy_selector
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
-> cub::detail::select::select_if_policy
{
using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_t>;
};

using MaxPolicy = policy_t;
return {TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_policy};
}
};
#endif // !TUNE_BASE

Expand All @@ -71,31 +69,12 @@ void init_output_partition_buffer(InItT, OffsetT, T* d_out, SelectOpT, T*& d_par
template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = cub::NullType*;
using num_selected_it_t = OffsetT*;
using select_op_t = less_then_t<T>;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::select::partition_distinct_output_t<T*, T*>, T*>::type;

using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
#endif // !TUNE_BASE
>;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
Expand All @@ -108,9 +87,8 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct

thrust::device_vector<T> out(elements);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = nullptr;
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
const T* d_in = thrust::raw_pointer_cast(in.data());
offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out);

Expand All @@ -119,25 +97,25 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
state.add_global_memory_writes<T>(elements);
state.add_global_memory_writes<offset_t>(1);

std::size_t temp_size{};
dispatch_t::Dispatch(
nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, nullptr);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

caching_allocator_t alloc;
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage,
temp_size,
auto env = cub_bench_env(
alloc,
launch
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<T>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DevicePartition::If,
"If failed",
d_in,
d_flags,
d_out,
d_num_selected,
static_cast<offset_t>(elements),
select_op,
equality_op_t{},
elements,
launch.get_stream());
env);
});
}

Expand Down
64 changes: 27 additions & 37 deletions cub/benchmarks/bench/partition/three_way.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#include <cub/device/dispatch/dispatch_three_way_partition.cuh>
#include <cub/device/dispatch/tuning/tuning_three_way_partition.cuh>
#include <cub/device/device_partition.cuh>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>
Expand All @@ -16,7 +15,7 @@

#if !TUNE_BASE
template <typename InputT>
struct tuned_policy_selector_t
struct policy_selector
{
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
-> cub::detail::three_way_partition::three_way_partition_policy
Expand All @@ -26,19 +25,16 @@ struct tuned_policy_selector_t
TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_DEFAULT,
cub::BLOCK_SCAN_WARP_SCANS,
cub::detail::delay_constructor_policy_from_type<delay_constructor_t>};
delay_constructor_policy};
}
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using input_it_t = const T*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = less_then_t<T>;
using offset_t = OffsetT;
using select_op_t = less_then_t<T>;
using offset_t = OffsetT;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand All @@ -54,50 +50,44 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
select_op_t select_op_2{right_border};

thrust::device_vector<T> in = generate(elements, entropy, min_val, max_val);
thrust::device_vector<offset_t> num_selected(1);
thrust::device_vector<offset_t> num_selected(2);
thrust::device_vector<T> out_1(elements);
thrust::device_vector<T> out_2(elements);
thrust::device_vector<T> out_3(elements);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
output_it_t d_out_1 = thrust::raw_pointer_cast(out_1.data());
output_it_t d_out_2 = thrust::raw_pointer_cast(out_2.data());
output_it_t d_out_3 = thrust::raw_pointer_cast(out_3.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
const T* d_in = thrust::raw_pointer_cast(in.data());
T* d_out_1 = thrust::raw_pointer_cast(out_1.data());
T* d_out_2 = thrust::raw_pointer_cast(out_2.data());
T* d_out_3 = thrust::raw_pointer_cast(out_3.data());
offset_t* d_num_selected = thrust::raw_pointer_cast(num_selected.data());

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);
state.add_global_memory_writes<offset_t>(1);
state.add_global_memory_writes<offset_t>(2);

std::size_t temp_size{};
auto dispatch = [&](void* temp_storage, cudaStream_t stream) {
return cub::detail::three_way_partition::dispatch(
temp_storage,
temp_size,
caching_allocator_t alloc;
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
auto env = cub_bench_env(
alloc,
launch
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<T>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DevicePartition::If,
"If three-way failed",
d_in,
d_out_1,
d_out_2,
d_out_3,
d_num_selected,
static_cast<offset_t>(elements),
select_op_1,
select_op_2,
static_cast<offset_t>(elements),
stream
#if !TUNE_BASE
,
policy_selector_t{}
#endif // !TUNE_BASE
);
};

dispatch(nullptr, nullptr);

thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch(temp_storage, launch.get_stream());
env);
});
}

Expand Down
Loading
Loading