Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion libcudacxx/include/cuda/std/__internal/features.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@
#define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK() && !_CCCL_COMPILER(CLANG))
#define _CCCL_HAS_SIMD_F32X2_PTX() (__cccl_ptx_isa >= 860ULL)

#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX())
// Packed f32x2 SIMD support: enabled when either the intrinsic path or the PTX path is available
// and the translation unit is not a tile compilation.
// The whole expansion is parenthesized so the macro composes safely with other operators,
// e.g. `#if !_CCCL_HAS_SIMD_F32X2()` or `#if _CCCL_HAS_SIMD_F32X2() || OTHER`.
#define _CCCL_HAS_SIMD_F32X2() \
  ((_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) && !_CCCL_TILE_COMPILATION())

// Third party libraries

Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__simd/basic_vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
"Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
const auto __data = ::cuda::std::ranges::__data_cpo{}(__range);
__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
::cuda::std::simd::__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
_CCCL_PRAGMA_UNROLL_FULL()
for (__simd_size_type __i = 0; __i < __size; ++__i)
{
Expand All @@ -215,7 +215,7 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
"Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
const auto __data = ::cuda::std::ranges::__data_cpo{}(__range);
__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
::cuda::std::simd::__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
_CCCL_PRAGMA_UNROLL_FULL()
for (__simd_size_type __i = 0; __i < __size; ++__i)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,12 @@

#if _CCCL_HAS_SIMD_F32X2()

# include <cuda/std/__fwd/simd.h>
# include <cuda/std/__simd/abi.h>
# include <cuda/std/__simd/specializations/fixed_size_vec.h>
# include <cuda/std/__simd/specializations/fp32x2_intrinsics.h>
# include <cuda/std/__simd/specializations/fp32x2_intrinsics_array.h>

# include <nv/target>

# include <cuda/std/__cccl/prologue.h>

Expand All @@ -44,11 +48,11 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix

_CCCL_HOST_DEVICE_API static constexpr void __increment(_SimdStorage& __s) noexcept
{
[[maybe_unused]] constexpr _SimdStorage __one = __base::__broadcast(1.0f);
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
constexpr _SimdStorage __one = __base::__broadcast(1.0f);
__s = ::cuda::std::simd::__plus_f32x2(__s, __one);
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, ({
__s = ::cuda::std::simd::__plus_f32x2(__s, __one);
return;
}));
}
Expand All @@ -57,11 +61,11 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix

_CCCL_HOST_DEVICE_API static constexpr void __decrement(_SimdStorage& __s) noexcept
{
[[maybe_unused]] constexpr _SimdStorage __one = __base::__broadcast(1.0f);
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
constexpr _SimdStorage __one = __base::__broadcast(1.0f);
__s = ::cuda::std::simd::__minus_f32x2(__s, __one);
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, ({
__s = ::cuda::std::simd::__minus_f32x2(__s, __one);
return;
}));
}
Expand All @@ -70,12 +74,10 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix

[[nodiscard]] _CCCL_HOST_DEVICE_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept
{
[[maybe_unused]] constexpr _SimdStorage __zero = __base::__broadcast(0.0f);
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
constexpr _SimdStorage __zero = __base::__broadcast(0.0f);
return ::cuda::std::simd::__minus_f32x2(__zero, __s);
}))
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__minus_f32x2(__zero, __s);))
}
return __base::__unary_minus(__s);
}
Expand All @@ -85,7 +87,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
{
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);))
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);))
}
return __base::__plus(__lhs, __rhs);
}
Expand All @@ -95,7 +97,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
{
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);))
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);))
}
return __base::__minus(__lhs, __rhs);
}
Expand All @@ -105,7 +107,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
{
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);))
NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);))
}
return __base::__multiplies(__lhs, __rhs);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cuda/__utility/in_range.h>
#include <cuda/std/__cstddef/types.h>
#include <cuda/std/__fwd/simd.h>
#include <cuda/std/__simd/abi.h>
#include <cuda/std/__simd/specializations/fixed_size_storage.h>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__utility/integer_sequence.h>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <cuda/__utility/in_range.h>
#include <cuda/std/__fwd/simd.h>
#include <cuda/std/__simd/abi.h>

#include <cuda/std/__cccl/prologue.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include <cuda/std/__fwd/simd.h>
#include <cuda/std/__simd/abi.h>
#include <cuda/std/__simd/specializations/fixed_size_mask.h>
#include <cuda/std/__simd/specializations/fixed_size_storage.h>
#include <cuda/std/__type_traits/integral_constant.h>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,178 +26,133 @@
// TODO(fbusato): check 5361571, remove this path once the feature is supported
#if _CCCL_HAS_SIMD_F32X2()

# include <cuda/std/__simd/specializations/fixed_size_storage.h>

# include <nv/target>

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD

[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __add_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
[[nodiscard]] _CCCL_DEVICE_API inline ::float2
__add_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
{
::float2 __result{};
# if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fadd2_rn(__lhs, __rhs);))
NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
(__result = ::__fadd2_rn(__lhs, __rhs);),
(_CCCL_VERIFY(false, "cuda::std::simd::__add_f32x2: Unsupported architecture");))
# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
asm("{.reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"add.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;}"
: "=f"(__result.x), "=f"(__result.y)
: "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y));
NV_IF_ELSE_TARGET(
NV_IS_EXACTLY_SM_100,
(asm("{"
".reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"add.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;"
"}" //
Comment on lines +46 to +52
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we do just

Suggested change
(asm("{"
".reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"add.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;"
"}" //
(asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};"

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without NV_IF_TARGET, the code fails to compile for any target other than SM 100; see https://godbolt.org/z/TqWEszv3a

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant keeping the NV_IF_TARGET but making it a one liner instead of 6 :D

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried but ptxas or clang complained about it. I will try again

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't work: add.f32x2 requires .b64 operands, not pairs of "f" (32-bit float) inputs.

: "=f"(__result.x),
"=f"(__result.y) //
: "f"(__lhs.x),
"f"(__lhs.y),
"f"(__rhs.x),
"f"(__rhs.y));),
(_CCCL_VERIFY(false, "cuda::std::simd::__add_f32x2: Unsupported architecture");))
# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
return __result;
}

[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __mul_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
[[nodiscard]] _CCCL_DEVICE_API inline ::float2
__mul_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
{
::float2 __result{};
# if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fmul2_rn(__lhs, __rhs);))
NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
(__result = ::__fmul2_rn(__lhs, __rhs);),
(_CCCL_VERIFY(false, "cuda::std::simd::__mul_f32x2: Unsupported architecture");))
# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
asm("{.reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"mul.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;}"
: "=f"(__result.x), "=f"(__result.y)
: "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y));
NV_IF_ELSE_TARGET(
NV_IS_EXACTLY_SM_100,
(asm("{"
".reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"mul.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;"
"}" //
: "=f"(__result.x),
"=f"(__result.y) //
: "f"(__lhs.x),
"f"(__lhs.y),
"f"(__rhs.x),
"f"(__rhs.y));),
(_CCCL_VERIFY(false, "cuda::std::simd::__mul_f32x2: Unsupported architecture");))
# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
return __result;
}

[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __sub_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
[[nodiscard]] _CCCL_DEVICE_API inline ::float2
__sub_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
{
::float2 __result{};
# if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fadd2_rn(__lhs, ::float2{-__rhs.x, -__rhs.y});))
NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
(__result = ::__fadd2_rn(__lhs, ::float2{-__rhs.x, -__rhs.y});),
(_CCCL_VERIFY(false, "cuda::std::simd::__sub_f32x2: Unsupported architecture");))
# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
NV_IF_TARGET(
NV_IF_ELSE_TARGET(
NV_IS_EXACTLY_SM_100,
(asm("{.reg .b64 __lhs, __rhs, __result;"
(asm("{"
".reg .b64 __lhs, __rhs, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"sub.f32x2 __result, __lhs, __rhs;"
"mov.b64 {%0, %1}, __result;}" : "=f"(__result.x),
"=f"(__result.y) : "f"(__lhs.x),
"mov.b64 {%0, %1}, __result;"
"}" //
: "=f"(__result.x),
"=f"(__result.y) //
: "f"(__lhs.x),
"f"(__lhs.y),
"f"(__rhs.x),
"f"(__rhs.y));))
"f"(__rhs.y));),
(_CCCL_VERIFY(false, "cuda::std::simd::__sub_f32x2: Unsupported architecture");))
# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
return __result;
}

[[nodiscard]] _CCCL_DEVICE_API inline ::float2
__fma_f32x2(const ::float2 __lhs, const ::float2 __rhs, const ::float2 __add) noexcept
[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __fma_f32x2(
[[maybe_unused]] const ::float2 __lhs,
[[maybe_unused]] const ::float2 __rhs,
[[maybe_unused]] const ::float2 __add) noexcept
{
::float2 __result{};
# if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__ffma2_rn(__lhs, __rhs, __add);))
NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
(__result = ::__ffma2_rn(__lhs, __rhs, __add);),
(_CCCL_VERIFY(false, "cuda::std::simd::__fma_f32x2: Unsupported architecture");))
# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
asm("{.reg .b64 __lhs, __rhs, __add, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"mov.b64 __add, {%6, %7};"
"fma.rn.f32x2 __result, __lhs, __rhs, __add;"
"mov.b64 {%0, %1}, __result;}"
: "=f"(__result.x), "=f"(__result.y)
: "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y), "f"(__add.x), "f"(__add.y));
NV_IF_ELSE_TARGET(
NV_IS_EXACTLY_SM_100,
(asm("{"
".reg .b64 __lhs, __rhs, __add, __result;"
"mov.b64 __lhs, {%2, %3};"
"mov.b64 __rhs, {%4, %5};"
"mov.b64 __add, {%6, %7};"
"fma.rn.f32x2 __result, __lhs, __rhs, __add;"
"mov.b64 {%0, %1}, __result;"
"}" //
: "=f"(__result.x),
"=f"(__result.y) //
: "f"(__lhs.x),
"f"(__lhs.y),
"f"(__rhs.x),
"f"(__rhs.y),
"f"(__add.x),
"f"(__add.y));),
(_CCCL_VERIFY(false, "cuda::std::simd::__fma_f32x2: Unsupported architecture");))
# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
return __result;
}

// Shorthand for the storage of a fixed-size SIMD vector holding _Np float elements.
template <__simd_size_type _Np>
using __simd_storage_f32 = __simd_storage<float, __fixed_size<_Np>>;

// Element-wise sum of two fixed-size float storages.
// Adjacent lanes are packed into ::float2 pairs and forwarded to __add_f32x2; when _Np is odd,
// the last lane is added with the scalar operator instead.
template <__simd_size_type _Np>
[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
{
  // Number of lanes covered by complete pairs (largest even value not exceeding _Np).
  constexpr __simd_size_type __paired_lanes = _Np - (_Np % 2);

  __simd_storage_f32<_Np> __result;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (__simd_size_type __lane = 0; __lane < __paired_lanes; __lane += 2)
  {
    const ::float2 __a{__lhs.__data[__lane], __lhs.__data[__lane + 1]};
    const ::float2 __b{__rhs.__data[__lane], __rhs.__data[__lane + 1]};
    const ::float2 __sum        = ::cuda::std::simd::__add_f32x2(__a, __b);
    __result.__data[__lane]     = __sum.x;
    __result.__data[__lane + 1] = __sum.y;
  }
  // Leftover lane that has no partner to be packed with.
  if constexpr (__paired_lanes != _Np)
  {
    constexpr __simd_size_type __last = _Np - 1;
    __result.__data[__last]           = __lhs.__data[__last] + __rhs.__data[__last];
  }
  return __result;
}

// Element-wise difference (__lhs - __rhs) of two fixed-size float storages.
// Lanes are processed two at a time through __sub_f32x2; an odd trailing lane is subtracted
// with the scalar operator.
template <__simd_size_type _Np>
[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
{
  // Number of lanes covered by complete pairs (largest even value not exceeding _Np).
  constexpr __simd_size_type __paired_lanes = _Np - (_Np % 2);

  __simd_storage_f32<_Np> __result;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (__simd_size_type __lane = 0; __lane < __paired_lanes; __lane += 2)
  {
    const ::float2 __minuend{__lhs.__data[__lane], __lhs.__data[__lane + 1]};
    const ::float2 __subtrahend{__rhs.__data[__lane], __rhs.__data[__lane + 1]};
    const ::float2 __diff       = ::cuda::std::simd::__sub_f32x2(__minuend, __subtrahend);
    __result.__data[__lane]     = __diff.x;
    __result.__data[__lane + 1] = __diff.y;
  }
  // Leftover lane that has no partner to be packed with.
  if constexpr (__paired_lanes != _Np)
  {
    constexpr __simd_size_type __last = _Np - 1;
    __result.__data[__last]           = __lhs.__data[__last] - __rhs.__data[__last];
  }
  return __result;
}

// Element-wise product of two fixed-size float storages.
// Lanes are processed two at a time through __mul_f32x2; an odd trailing lane is multiplied
// with the scalar operator.
template <__simd_size_type _Np>
[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
{
  // Number of lanes covered by complete pairs (largest even value not exceeding _Np).
  constexpr __simd_size_type __paired_lanes = _Np - (_Np % 2);

  __simd_storage_f32<_Np> __result;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (__simd_size_type __lane = 0; __lane < __paired_lanes; __lane += 2)
  {
    const ::float2 __a{__lhs.__data[__lane], __lhs.__data[__lane + 1]};
    const ::float2 __b{__rhs.__data[__lane], __rhs.__data[__lane + 1]};
    const ::float2 __product    = ::cuda::std::simd::__mul_f32x2(__a, __b);
    __result.__data[__lane]     = __product.x;
    __result.__data[__lane + 1] = __product.y;
  }
  // Leftover lane that has no partner to be packed with.
  if constexpr (__paired_lanes != _Np)
  {
    constexpr __simd_size_type __last = _Np - 1;
    __result.__data[__last]           = __lhs.__data[__last] * __rhs.__data[__last];
  }
  return __result;
}

// Element-wise fused multiply-add (__lhs * __rhs + __add) over fixed-size float storages.
// Lanes are processed two at a time through the ::float2 overload of __fma_f32x2. When _Np is
// odd, the trailing lane is computed with the scalar round-to-nearest FMA intrinsic so that it
// is rounded once, exactly like the paired lanes, independent of the compiler's contraction
// settings for `a * b + c`.
template <__simd_size_type _Np>
[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
__fma_f32x2(const __simd_storage_f32<_Np>& __lhs,
            const __simd_storage_f32<_Np>& __rhs,
            const __simd_storage_f32<_Np>& __add) noexcept
{
  __simd_storage_f32<_Np> __result;
  _CCCL_PRAGMA_UNROLL_FULL()
  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
  {
    const auto __lhs_value   = ::float2{__lhs.__data[__i], __lhs.__data[__i + 1]};
    const auto __rhs_value   = ::float2{__rhs.__data[__i], __rhs.__data[__i + 1]};
    const auto __add_value   = ::float2{__add.__data[__i], __add.__data[__i + 1]};
    const auto __value       = ::cuda::std::simd::__fma_f32x2(__lhs_value, __rhs_value, __add_value);
    __result.__data[__i]     = __value.x;
    __result.__data[__i + 1] = __value.y;
  }
  if constexpr (_Np % 2 != 0)
  {
    // A plain `a * b + c` is only fused if the compiler contracts it; use the explicit
    // intrinsic to guarantee single rounding for the unpaired lane as well.
    __result.__data[_Np - 1] = ::__fmaf_rn(__lhs.__data[_Np - 1], __rhs.__data[_Np - 1], __add.__data[_Np - 1]);
  }
  return __result;
}

_CCCL_END_NAMESPACE_CUDA_STD_SIMD

# include <cuda/std/__cccl/epilogue.h>
Expand Down
Loading
Loading