NVIDIA · fbusato · May 19, 2026 · davebayer · May 13, 2026 · fbusato
@@ -104,7 +104,8 @@
 #define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK() && !_CCCL_COMPILER(CLANG))
 #define _CCCL_HAS_SIMD_F32X2_PTX()        (__cccl_ptx_isa >= 860ULL)
 
-#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX())
+#define _CCCL_HAS_SIMD_F32X2() /* fully parenthesized: stays one operand under ! and || at use sites */ \
+  ((_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) && !_CCCL_TILE_COMPILATION())
 
 // Third party libraries
 

@@ -199,7 +199,7 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
     static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
                   "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
     const auto __data = ::cuda::std::ranges::__data_cpo{}(__range);
-    __assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
+    ::cuda::std::simd::__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
     _CCCL_PRAGMA_UNROLL_FULL()
     for (__simd_size_type __i = 0; __i < __size; ++__i)
     {
@@ -215,7 +215,7 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
     static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
                   "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
     const auto __data = ::cuda::std::ranges::__data_cpo{}(__range);
-    __assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
+    ::cuda::std::simd::__assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
     _CCCL_PRAGMA_UNROLL_FULL()
     for (__simd_size_type __i = 0; __i < __size; ++__i)
     {

@@ -23,8 +23,12 @@
 
 #if _CCCL_HAS_SIMD_F32X2()
 
+#  include <cuda/std/__fwd/simd.h>
+#  include <cuda/std/__simd/abi.h>
 #  include <cuda/std/__simd/specializations/fixed_size_vec.h>
-#  include <cuda/std/__simd/specializations/fp32x2_intrinsics.h>
+#  include <cuda/std/__simd/specializations/fp32x2_intrinsics_array.h>
+
+#  include <nv/target>
 
 #  include <cuda/std/__cccl/prologue.h>
 
@@ -44,11 +48,11 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
 
   _CCCL_HOST_DEVICE_API static constexpr void __increment(_SimdStorage& __s) noexcept
   {
+    [[maybe_unused]] constexpr _SimdStorage __one = __base::__broadcast(1.0f);
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
-                     constexpr _SimdStorage __one = __base::__broadcast(1.0f);
-                     __s                          = ::cuda::std::simd::__plus_f32x2(__s, __one);
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, ({
+                     __s = ::cuda::std::simd::__plus_f32x2(__s, __one);
                      return;
                    }));
     }
@@ -57,11 +61,11 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
 
   _CCCL_HOST_DEVICE_API static constexpr void __decrement(_SimdStorage& __s) noexcept
   {
+    [[maybe_unused]] constexpr _SimdStorage __one = __base::__broadcast(1.0f);
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
-                     constexpr _SimdStorage __one = __base::__broadcast(1.0f);
-                     __s                          = ::cuda::std::simd::__minus_f32x2(__s, __one);
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, ({
+                     __s = ::cuda::std::simd::__minus_f32x2(__s, __one);
                      return;
                    }));
     }
@@ -70,12 +74,10 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
 
   [[nodiscard]] _CCCL_HOST_DEVICE_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept
   {
+    [[maybe_unused]] constexpr _SimdStorage __zero = __base::__broadcast(0.0f);
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, ({
-                     constexpr _SimdStorage __zero = __base::__broadcast(0.0f);
-                     return ::cuda::std::simd::__minus_f32x2(__zero, __s);
-                   }))
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__minus_f32x2(__zero, __s);))
     }
     return __base::__unary_minus(__s);
   }
@@ -85,7 +87,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
   {
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);))
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);))
     }
     return __base::__plus(__lhs, __rhs);
   }
@@ -95,7 +97,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
   {
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);))
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);))
     }
     return __base::__minus(__lhs, __rhs);
   }
@@ -105,7 +107,7 @@ struct __simd_operations<float, __fixed_size<_Np>, __simd_operations_kind::__fix
   {
     _CCCL_IF_NOT_CONSTEVAL_DEFAULT
     {
-      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);))
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_100f, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);))
     }
     return __base::__multiplies(__lhs, __rhs);
   }

@@ -24,6 +24,7 @@
 #include <cuda/__utility/in_range.h>
 #include <cuda/std/__cstddef/types.h>
 #include <cuda/std/__fwd/simd.h>
+#include <cuda/std/__simd/abi.h>
 #include <cuda/std/__simd/specializations/fixed_size_storage.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__utility/integer_sequence.h>

@@ -23,6 +23,7 @@
 
 #include <cuda/__utility/in_range.h>
 #include <cuda/std/__fwd/simd.h>
+#include <cuda/std/__simd/abi.h>
 
 #include <cuda/std/__cccl/prologue.h>
 

@@ -22,6 +22,7 @@
 #endif // no system header
 
 #include <cuda/std/__fwd/simd.h>
+#include <cuda/std/__simd/abi.h>
 #include <cuda/std/__simd/specializations/fixed_size_mask.h>
 #include <cuda/std/__simd/specializations/fixed_size_storage.h>
 #include <cuda/std/__type_traits/integral_constant.h>

@@ -26,178 +26,133 @@
 // TODO(fbusato): check 5361571, remove this path once the feature is supported
 #if _CCCL_HAS_SIMD_F32X2()
 
-#  include <cuda/std/__simd/specializations/fixed_size_storage.h>
-
 #  include <nv/target>
 
 #  include <cuda/std/__cccl/prologue.h>
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
 
-[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __add_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
+[[nodiscard]] _CCCL_DEVICE_API inline ::float2
+__add_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
 {
   ::float2 __result{};
 #  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
-  NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fadd2_rn(__lhs, __rhs);))
+  NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
+                    (__result = ::__fadd2_rn(__lhs, __rhs);),
+                    (_CCCL_VERIFY(false, "cuda::std::simd::__add_f32x2: Unsupported architecture");))
 #  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
-  asm("{.reg .b64 __lhs, __rhs, __result;"
-      "mov.b64 __lhs, {%2, %3};"
-      "mov.b64 __rhs, {%4, %5};"
-      "add.f32x2 __result, __lhs, __rhs;"
-      "mov.b64 {%0, %1}, __result;}"
-      : "=f"(__result.x), "=f"(__result.y)
-      : "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y));
+  NV_IF_ELSE_TARGET(
+    NV_IS_EXACTLY_SM_100,
+    (asm("{"
+         ".reg .b64 __lhs, __rhs, __result;"
+         "mov.b64 __lhs, {%2, %3};"
+         "mov.b64 __rhs, {%4, %5};"
+         "add.f32x2 __result, __lhs, __rhs;"
+         "mov.b64 {%0, %1}, __result;"
+         "}" //
-    (asm("{"
-         ".reg .b64 __lhs, __rhs, __result;"
-         "mov.b64 __lhs, {%2, %3};"
-         "mov.b64 __rhs, {%4, %5};"
-         "add.f32x2 __result, __lhs, __rhs;"
-         "mov.b64 {%0, %1}, __result;"
-         "}" //
+    (asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};"
-    (asm("{"
-         ".reg .b64 __lhs, __rhs, __result;"
-         "mov.b64 __lhs, {%2, %3};"
-         "mov.b64 __rhs, {%4, %5};"
-         "add.f32x2 __result, __lhs, __rhs;"
-         "mov.b64 {%0, %1}, __result;"
-         "}" //
+    (asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};"
+         : "=f"(__result.x),
+         "=f"(__result.y) //
+         : "f"(__lhs.x),
+         "f"(__lhs.y),
+         "f"(__rhs.x),
+         "f"(__rhs.y));),
+    (_CCCL_VERIFY(false, "cuda::std::simd::__add_f32x2: Unsupported architecture");))
 #  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
   return __result;
 }
 
-[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __mul_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
+[[nodiscard]] _CCCL_DEVICE_API inline ::float2
+__mul_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
 {
   ::float2 __result{};
 #  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
-  NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fmul2_rn(__lhs, __rhs);))
+  NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
+                    (__result = ::__fmul2_rn(__lhs, __rhs);),
+                    (_CCCL_VERIFY(false, "cuda::std::simd::__mul_f32x2: Unsupported architecture");))
 #  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
-  asm("{.reg .b64 __lhs, __rhs, __result;"
-      "mov.b64 __lhs, {%2, %3};"
-      "mov.b64 __rhs, {%4, %5};"
-      "mul.f32x2 __result, __lhs, __rhs;"
-      "mov.b64 {%0, %1}, __result;}"
-      : "=f"(__result.x), "=f"(__result.y)
-      : "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y));
+  NV_IF_ELSE_TARGET(
+    NV_IS_EXACTLY_SM_100,
+    (asm("{"
+         ".reg .b64 __lhs, __rhs, __result;"
+         "mov.b64 __lhs, {%2, %3};"
+         "mov.b64 __rhs, {%4, %5};"
+         "mul.f32x2 __result, __lhs, __rhs;"
+         "mov.b64 {%0, %1}, __result;"
+         "}" //
+         : "=f"(__result.x),
+         "=f"(__result.y) //
+         : "f"(__lhs.x),
+         "f"(__lhs.y),
+         "f"(__rhs.x),
+         "f"(__rhs.y));),
+    (_CCCL_VERIFY(false, "cuda::std::simd::__mul_f32x2: Unsupported architecture");))
 #  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
   return __result;
 }
 
-[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __sub_f32x2(const ::float2 __lhs, const ::float2 __rhs) noexcept
+[[nodiscard]] _CCCL_DEVICE_API inline ::float2
+__sub_f32x2([[maybe_unused]] const ::float2 __lhs, [[maybe_unused]] const ::float2 __rhs) noexcept
 {
   ::float2 __result{};
 #  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
-  NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__fadd2_rn(__lhs, ::float2{-__rhs.x, -__rhs.y});))
+  NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
+                    (__result = ::__fadd2_rn(__lhs, ::float2{-__rhs.x, -__rhs.y});),
+                    (_CCCL_VERIFY(false, "cuda::std::simd::__sub_f32x2: Unsupported architecture");))
 #  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
-  NV_IF_TARGET(
+  NV_IF_ELSE_TARGET(
     NV_IS_EXACTLY_SM_100,
-    (asm("{.reg .b64 __lhs, __rhs, __result;"
+    (asm("{"
+         ".reg .b64 __lhs, __rhs, __result;"
          "mov.b64 __lhs, {%2, %3};"
          "mov.b64 __rhs, {%4, %5};"
          "sub.f32x2 __result, __lhs, __rhs;"
-         "mov.b64 {%0, %1}, __result;}" : "=f"(__result.x),
-         "=f"(__result.y) : "f"(__lhs.x),
+         "mov.b64 {%0, %1}, __result;"
+         "}" //
+         : "=f"(__result.x),
+         "=f"(__result.y) //
+         : "f"(__lhs.x),
          "f"(__lhs.y),
          "f"(__rhs.x),
-         "f"(__rhs.y));))
+         "f"(__rhs.y));),
+    (_CCCL_VERIFY(false, "cuda::std::simd::__sub_f32x2: Unsupported architecture");))
 #  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
   return __result;
 }
 
-[[nodiscard]] _CCCL_DEVICE_API inline ::float2
-__fma_f32x2(const ::float2 __lhs, const ::float2 __rhs, const ::float2 __add) noexcept
+[[nodiscard]] _CCCL_DEVICE_API inline ::float2 __fma_f32x2(
+  [[maybe_unused]] const ::float2 __lhs,
+  [[maybe_unused]] const ::float2 __rhs,
+  [[maybe_unused]] const ::float2 __add) noexcept
 {
   ::float2 __result{};
 #  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
-  NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (__result = ::__ffma2_rn(__lhs, __rhs, __add);))
+  NV_IF_ELSE_TARGET(NV_IS_EXACTLY_SM_100,
+                    (__result = ::__ffma2_rn(__lhs, __rhs, __add);),
+                    (_CCCL_VERIFY(false, "cuda::std::simd::__fma_f32x2: Unsupported architecture");))
 #  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
-  asm("{.reg .b64 __lhs, __rhs, __add, __result;"
-      "mov.b64 __lhs, {%2, %3};"
-      "mov.b64 __rhs, {%4, %5};"
-      "mov.b64 __add, {%6, %7};"
-      "fma.rn.f32x2 __result, __lhs, __rhs, __add;"
-      "mov.b64 {%0, %1}, __result;}"
-      : "=f"(__result.x), "=f"(__result.y)
-      : "f"(__lhs.x), "f"(__lhs.y), "f"(__rhs.x), "f"(__rhs.y), "f"(__add.x), "f"(__add.y));
+  NV_IF_ELSE_TARGET(
+    NV_IS_EXACTLY_SM_100,
+    (asm("{"
+         ".reg .b64 __lhs, __rhs, __add, __result;"
+         "mov.b64 __lhs, {%2, %3};"
+         "mov.b64 __rhs, {%4, %5};"
+         "mov.b64 __add, {%6, %7};"
+         "fma.rn.f32x2 __result, __lhs, __rhs, __add;"
+         "mov.b64 {%0, %1}, __result;"
+         "}" //
+         : "=f"(__result.x),
+         "=f"(__result.y) //
+         : "f"(__lhs.x),
+         "f"(__lhs.y),
+         "f"(__rhs.x),
+         "f"(__rhs.y),
+         "f"(__add.x),
+         "f"(__add.y));),
+    (_CCCL_VERIFY(false, "cuda::std::simd::__fma_f32x2: Unsupported architecture");))
 #  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
   return __result;
 }
 
-template <__simd_size_type _Np>
-using __simd_storage_f32 = __simd_storage<float, __fixed_size<_Np>>;
-
-template <__simd_size_type _Np>
-[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
-__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
-{
-  __simd_storage_f32<_Np> __result;
-  _CCCL_PRAGMA_UNROLL_FULL()
-  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
-  {
-    const auto __lhs_value   = ::float2{__lhs.__data[__i], __lhs.__data[__i + 1]};
-    const auto __rhs_value   = ::float2{__rhs.__data[__i], __rhs.__data[__i + 1]};
-    const auto __value       = ::cuda::std::simd::__add_f32x2(__lhs_value, __rhs_value);
-    __result.__data[__i]     = __value.x;
-    __result.__data[__i + 1] = __value.y;
-  }
-  if constexpr (_Np % 2 != 0)
-  {
-    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] + __rhs.__data[_Np - 1];
-  }
-  return __result;
-}
-
-template <__simd_size_type _Np>
-[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
-__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
-{
-  __simd_storage_f32<_Np> __result;
-  _CCCL_PRAGMA_UNROLL_FULL()
-  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
-  {
-    const auto __lhs_value   = ::float2{__lhs.__data[__i], __lhs.__data[__i + 1]};
-    const auto __rhs_value   = ::float2{__rhs.__data[__i], __rhs.__data[__i + 1]};
-    const auto __value       = ::cuda::std::simd::__sub_f32x2(__lhs_value, __rhs_value);
-    __result.__data[__i]     = __value.x;
-    __result.__data[__i + 1] = __value.y;
-  }
-  if constexpr (_Np % 2 != 0)
-  {
-    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] - __rhs.__data[_Np - 1];
-  }
-  return __result;
-}
-
-template <__simd_size_type _Np>
-[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
-__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
-{
-  __simd_storage_f32<_Np> __result;
-  _CCCL_PRAGMA_UNROLL_FULL()
-  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
-  {
-    const auto __lhs_value   = ::float2{__lhs.__data[__i], __lhs.__data[__i + 1]};
-    const auto __rhs_value   = ::float2{__rhs.__data[__i], __rhs.__data[__i + 1]};
-    const auto __value       = ::cuda::std::simd::__mul_f32x2(__lhs_value, __rhs_value);
-    __result.__data[__i]     = __value.x;
-    __result.__data[__i + 1] = __value.y;
-  }
-  if constexpr (_Np % 2 != 0)
-  {
-    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1];
-  }
-  return __result;
-}
-
-template <__simd_size_type _Np>
-[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
-__fma_f32x2(const __simd_storage_f32<_Np>& __lhs,
-            const __simd_storage_f32<_Np>& __rhs,
-            const __simd_storage_f32<_Np>& __add) noexcept
-{
-  __simd_storage_f32<_Np> __result;
-  _CCCL_PRAGMA_UNROLL_FULL()
-  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
-  {
-    const auto __lhs_value   = ::float2{__lhs.__data[__i], __lhs.__data[__i + 1]};
-    const auto __rhs_value   = ::float2{__rhs.__data[__i], __rhs.__data[__i + 1]};
-    const auto __add_value   = ::float2{__add.__data[__i], __add.__data[__i + 1]};
-    const auto __value       = ::cuda::std::simd::__fma_f32x2(__lhs_value, __rhs_value, __add_value);
-    __result.__data[__i]     = __value.x;
-    __result.__data[__i + 1] = __value.y;
-  }
-  if constexpr (_Np % 2 != 0)
-  {
-    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1] + __add.__data[_Np - 1];
-  }
-  return __result;
-}
-
 _CCCL_END_NAMESPACE_CUDA_STD_SIMD
 
 #  include <cuda/std/__cccl/epilogue.h>