From 317bcf5a31789a45f89808af5c3c0d868fbfb321 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 12 May 2026 10:31:10 -0400 Subject: [PATCH 1/8] Fix nightly GCC8 build regressions in bit_cast and DeviceReduce Two unrelated regressions surfaced together in the 2026-05-12 nightly run on `main`. Both block all GCC builds on older CTKs (12.0, 12.X) for libcu++, and all GCC 8 CUB builds. libcu++: bit_cast<__float128> The memcpy fallback path in `__bit_cast_memcpy` is reached on GCC 8/9/10 because those host compilers don't provide `__builtin_bit_cast`. Commit 8b28e1d244 (#8265) replaced the previous `is_trivially_default_constructible_v` trait with the C++20-style `default_initializable` concept; the concept's emulation evaluates false for `__float128` in C++17 mode, even though the built-in scalar is in fact default-initializable. Restore an equivalent type trait (`is_default_constructible_v`), which queries the compiler builtin directly and works on every supported GCC. The GCC<=7 gate around the static_assert is no longer needed and is removed. CUB: DeviceReduce reduction_op Commit b60f063594 (#8851) merged the three determinism overloads of `reduce_impl` into a single `if constexpr` template. The `__gpu_to_gpu` branch dispatches to RFA, which hardcodes `deterministic_sum_t` internally and does not accept a reduction operator. GCC 8's `-Werror=unused-but-set-parameter` flags `reduction_op` because that branch never references it. The parameter must remain in the signature (the other two branches use it), so the invariant -- enforced upstream by `__transform_reduce`'s `float_double_plus` check -- is acknowledged with `(void) reduction_op` plus a one-line note inside the branch. [skip-vdc][skip-docs][skip-tpt] --- ci/matrix.yaml | 5 +++++ cub/cub/device/device_reduce.cuh | 2 ++ libcudacxx/include/cuda/std/__bit/bit_cast.h | 8 +++----- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index b08c6068ace..43ea697ec62 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -21,6 +21,11 @@ workflows: # args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' } # override: + # Targeted repro of nightly 25713127213 failures. Reset before merging. + # libcu++ bit_cast<__float128> regression: memcpy fallback fires on GCC 8/9/10 (no __builtin_bit_cast) with CTK 12.0/12.X. + - {jobs: ['build'], project: 'libcudacxx', std: 'all', ctk: ['12.0', '12.X'], cxx: ['gcc8', 'gcc9', 'gcc10']} + # CUB device_reduce -Wunused-but-set-parameter regression: GCC 8 only. + - {jobs: ['build'], project: 'cub', std: 17, ctk: ['12.0', '12.X'], cxx: ['gcc8']} pull_request: # Old CTK: Oldest/newest supported host compilers: diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 41a835cfa85..9f408534524 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -116,6 +116,8 @@ private: if constexpr (Determinism == ::cuda::execution::determinism::__determinism_t::__gpu_to_gpu) { + // Only instantiated with `plus`; RFA hardcodes `deterministic_sum_t`. + (void) reduction_op; using default_policy_selector = detail::rfa::policy_selector_from_types; using policy_selector = ::cuda::std::execution::__query_result_or_t; diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index 98844e3038f..b51d94795ee 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include @@ -51,10 +51,8 @@ _CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess") template [[nodiscard]] _CCCL_API inline _To __bit_cast_memcpy(const _From& __from) noexcept { -#if !_CCCL_COMPILER(GCC, <=, 7) - static_assert(::cuda::std::default_initializable<_To>, - "bit_cast memcpy fallback requires the destination type to be default initializable"); -#endif // !_CCCL_COMPILER(GCC, <=, 7) + static_assert(::cuda::std::is_default_constructible_v<_To>, + "bit_cast memcpy fallback requires the destination type to be default constructible"); _To __temp; ::cuda::std::memcpy(&__temp, &__from, sizeof(_To)); return __temp; From 7673823f1ebe3ffa34a4a894d569ca89413e6ab8 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 12 May 2026 17:18:11 -0400 Subject: [PATCH 2/8] Fix unbound-variable error in build_and_test_targets.sh `${#ARRAY}` is syntactic sugar for `${#ARRAY[0]}` -- the length of element 0. Under `set -u`, when an array is empty (no `--ctest-targets`, `--lit-tests`, etc. on the command line), this errors with "unbound variable" before the guard can short-circuit. Use `${#ARRAY[@]}` (total element count), which is defined as 0 on an empty array. --- ci/util/build_and_test_targets.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/util/build_and_test_targets.sh b/ci/util/build_and_test_targets.sh index f79bf2202e0..7714b8e8d5a 100755 --- a/ci/util/build_and_test_targets.sh +++ b/ci/util/build_and_test_targets.sh @@ -73,7 +73,7 @@ if [[ -n "${CONFIGURE_OVERRIDE}" ]]; then if [[ -n "${PRESET}" ]]; then echo "::warning:: --preset ignored due to --configure-override" >&2 fi - if [[ "${#CMAKE_OPTIONS}" -gt 0 ]]; then + if [[ "${#CMAKE_OPTIONS[@]}" -gt 0 ]]; then echo "::warning:: --cmake-options ignored due to --configure-override" >&2 fi fi @@ -103,7 +103,7 @@ if [[ -z "${BUILD_DIR}" ]]; then exit 1 fi -if [[ "${#BUILD_TARGETS}" -gt 0 ]]; then +if [[ "${#BUILD_TARGETS[@]}" -gt 0 ]]; then if ! (set -x; ninja -C "${BUILD_DIR}" "${BUILD_TARGETS[@]}"); then echo "::endgroup::" echo "🔴🛠️ Ninja build failed for targets ($(elapsed_time)): ${BUILD_TARGETS[*]@Q}" @@ -111,7 +111,7 @@ if [[ "${#BUILD_TARGETS}" -gt 0 ]]; then fi fi -if [[ "${#CTEST_TARGETS}" -gt 0 ]]; then +if [[ "${#CTEST_TARGETS[@]}" -gt 0 ]]; then for t in "${CTEST_TARGETS[@]}"; do if ! (set -x; ctest --test-dir "${BUILD_DIR}" -R "$t" -V --output-on-failure); then echo "::endgroup::" @@ -121,7 +121,7 @@ if [[ "${#CTEST_TARGETS}" -gt 0 ]]; then done fi -if [[ "${#LIT_PRECOMPILE_TESTS}" -gt 0 || "${#LIT_TESTS}" -gt 0 ]]; then +if [[ "${#LIT_PRECOMPILE_TESTS[@]}" -gt 0 || "${#LIT_TESTS[@]}" -gt 0 ]]; then lit_site_cfg="${BUILD_DIR}/libcudacxx/test/libcudacxx/lit.site.cfg" if [[ ! -f "${lit_site_cfg}" ]]; then echo "::endgroup::" @@ -130,7 +130,7 @@ if [[ "${#LIT_PRECOMPILE_TESTS}" -gt 0 || "${#LIT_TESTS}" -gt 0 ]]; then fi fi -if [[ "${#LIT_PRECOMPILE_TESTS}" -gt 0 ]]; then +if [[ "${#LIT_PRECOMPILE_TESTS[@]}" -gt 0 ]]; then for t in "${LIT_PRECOMPILE_TESTS[@]}"; do t_path="libcudacxx/test/libcudacxx/${t}" if ! (set -x; LIBCUDACXX_SITE_CONFIG="${lit_site_cfg}" lit -v "-Dexecutor=NoopExecutor()" "${t_path}"); then @@ -141,7 +141,7 @@ if [[ "${#LIT_PRECOMPILE_TESTS}" -gt 0 ]]; then done fi -if [[ "${#LIT_TESTS}" -gt 0 ]]; then +if [[ "${#LIT_TESTS[@]}" -gt 0 ]]; then for t in "${LIT_TESTS[@]}"; do t_path="libcudacxx/test/libcudacxx/${t}" if ! (set -x; LIBCUDACXX_SITE_CONFIG="${lit_site_cfg}" lit -v "${t_path}"); then From 259413dc6591071f7f3d4ff8b1ecc7ae8663e517 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 12 May 2026 17:59:01 -0400 Subject: [PATCH 3/8] Skip simd::basic_mask generator-ctor noexcept assert on GCC 8 GCC 8 infers noexcept on the generator constructor more permissively than GCC 9+, so `static_assert(!noexcept(Mask(is_even{})))` fires. Widen the existing GCC 7 skip to also cover GCC 8. --- .../libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp index 0848ec2e71d..ba8eed0703d 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp @@ -123,7 +123,7 @@ template TEST_FUNC constexpr void test_generator() { using Mask = simd::basic_mask>; -#if _CCCL_COMPILER(GCC, !=, 7) +#if _CCCL_COMPILER(GCC, >=, 9) static_assert(!noexcept(Mask(is_even{}))); #endif From d69de217b0365ab9a5ef6364d2e917a38c009a35 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 12 May 2026 18:16:52 -0400 Subject: [PATCH 4/8] Skip pointer compare in __constexpr_tail_overlap when is_constant_evaluated is faked On GCC 8 there is no __builtin_is_constant_evaluated, so cuda::std::is_constant_evaluated() always returns false. That makes `if (!is_constant_evaluated())` look statically true, so the compiler takes the runtime branch even in a constant expression and trips over the pointer compare, which is not allowed there. Only enable the runtime branch on compilers that actually provide the builtin; everyone else falls through to the constexpr-safe path. --- libcudacxx/include/cuda/std/__algorithm/copy.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 8e2f387b6e5..9d3c4cb9e60 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -86,13 +86,16 @@ _CCCL_EXEC_CHECK_DISABLE template _CCCL_API constexpr bool __constexpr_tail_overlap(_Tp* __first, _Up* __needle, [[maybe_unused]] _Tp* __last) { -#if !_CCCL_TILE_COMPILATION() // pointer values cannot be compared + // Without the builtin, is_constant_evaluated() is hard-coded to false, so the runtime branch is + // selected even during constant evaluation -- where the pointer compare below is ill-formed. Fall + // through to the constexpr-safe path on compilers that lack the builtin (notably GCC 8). +#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) if (!::cuda::std::is_constant_evaluated()) { return __first < __needle; } else -#endif // !_CCCL_TILE_COMPILATION() +#endif // _CCCL_BUILTIN_IS_CONSTANT_EVALUATED { #if defined(_CCCL_BUILTIN_CONSTANT_P) NV_IF_ELSE_TARGET(NV_IS_HOST, From b40e516de19a07524a860d8dfc34e3dd644cb00d Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 13 May 2026 09:04:49 +0200 Subject: [PATCH 5/8] Fix copy runtime optimization --- libcudacxx/include/cuda/std/__algorithm/copy.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 9d3c4cb9e60..817ffac7af4 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -86,16 +86,11 @@ _CCCL_EXEC_CHECK_DISABLE template _CCCL_API constexpr bool __constexpr_tail_overlap(_Tp* __first, _Up* __needle, [[maybe_unused]] _Tp* __last) { - // Without the builtin, is_constant_evaluated() is hard-coded to false, so the runtime branch is - // selected even during constant evaluation -- where the pointer compare below is ill-formed. Fall - // through to the constexpr-safe path on compilers that lack the builtin (notably GCC 8). -#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) - if (!::cuda::std::is_constant_evaluated()) + if (!::cuda::std::__cccl_default_is_constant_evaluated()) { return __first < __needle; } else -#endif // _CCCL_BUILTIN_IS_CONSTANT_EVALUATED { #if defined(_CCCL_BUILTIN_CONSTANT_P) NV_IF_ELSE_TARGET(NV_IS_HOST, From 49749fea77d8d3028fddbf37e0883428140359c1 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 13 May 2026 13:35:57 -0400 Subject: [PATCH 6/8] clear override --- ci/matrix.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 43ea697ec62..b08c6068ace 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -21,11 +21,6 @@ workflows: # args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' } # override: - # Targeted repro of nightly 25713127213 failures. Reset before merging. - # libcu++ bit_cast<__float128> regression: memcpy fallback fires on GCC 8/9/10 (no __builtin_bit_cast) with CTK 12.0/12.X. - - {jobs: ['build'], project: 'libcudacxx', std: 'all', ctk: ['12.0', '12.X'], cxx: ['gcc8', 'gcc9', 'gcc10']} - # CUB device_reduce -Wunused-but-set-parameter regression: GCC 8 only. - - {jobs: ['build'], project: 'cub', std: 17, ctk: ['12.0', '12.X'], cxx: ['gcc8']} pull_request: # Old CTK: Oldest/newest supported host compilers: From 8463734d29bf3a6f670580da09ca70e69ce8b43c Mon Sep 17 00:00:00 2001 From: alliepiper Date: Thu, 14 May 2026 13:19:55 -0400 Subject: [PATCH 7/8] Update libcudacxx/include/cuda/std/__bit/bit_cast.h Co-authored-by: Federico Busato <50413820+fbusato@users.noreply.github.com> --- libcudacxx/include/cuda/std/__bit/bit_cast.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index b51d94795ee..8d4f203bfc4 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -51,8 +51,10 @@ _CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess") template [[nodiscard]] _CCCL_API inline _To __bit_cast_memcpy(const _From& __from) noexcept { - static_assert(::cuda::std::is_default_constructible_v<_To>, +#if !_CCCL_COMPILER(GCC, <=, 8) + static_assert(::cuda::std::default_initializable<_To>, "bit_cast memcpy fallback requires the destination type to be default constructible"); +#endif // !_CCCL_COMPILER(GCC, <=, 8) _To __temp; ::cuda::std::memcpy(&__temp, &__from, sizeof(_To)); return __temp; From 3242568ef8cf91cc636e2abf1e9a6f8107a7b131 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Thu, 14 May 2026 18:55:19 -0400 Subject: [PATCH 8/8] Fix missing include for default_initializable in bit_cast.h --- libcudacxx/include/cuda/std/__bit/bit_cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index 8d4f203bfc4..e62d4ab3019 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include