From 23b445ad35e125096b43bf78affa090310e658ae Mon Sep 17 00:00:00 2001 From: Marco Franzreb Salgado Date: Thu, 7 May 2026 00:31:36 -0700 Subject: [PATCH 1/6] Add Windows support --- CMakeLists.txt | 3 + ci/build_common.sh | 22 ++++--- cmake/NVBenchCUPTI.cmake | 14 +++-- cmake/NVBenchConfigTarget.cmake | 18 +++++- nvbench/config.cuh.in | 4 ++ testing/axes_metadata.cu | 1 + testing/cmake/CMakeLists.txt | 15 +++++ testing/cmake/test_export/CMakeLists.txt | 74 +++++++++++++----------- 8 files changed, 105 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 01b39bbe..fcd44bb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,9 @@ if (${CUDAToolkit_VERSION} VERSION_LESS 11.3) endif() option(BUILD_SHARED_LIBS "Build NVBench as a shared library" ON) +if (WIN32 AND BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON) option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default}) diff --git a/ci/build_common.sh b/ci/build_common.sh index 2c30414a..718e22fa 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -12,6 +12,7 @@ CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc` CUDA_ARCHS= # Empty, use presets by default. GLOBAL_CMAKE_OPTIONS=() DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks. 
+HOST_OS="linux" # "linux" or "windows" # Check if the correct number of arguments has been provided function usage { @@ -21,6 +22,7 @@ function usage { echo echo "Options:" echo " -v/--verbose: enable shell echo for debugging" + echo " -os: Target OS, \"linux\" or \"windows\" (Defaults to linux)" echo " -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" echo " -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " -std: CUDA/C++ standard (Defaults to 17)" @@ -32,6 +34,7 @@ function usage { echo " $ PARALLEL_LEVEL=8 $0 -cxx g++-9" echo " $ $0 -cxx clang++-8" echo " $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc" + echo " $ $0 -os windows -cxx cl.exe -arch native" echo " $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\"" exit 1 } @@ -44,6 +47,7 @@ args=("$@") while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; + -os) HOST_OS="${args[1]}"; args=("${args[@]:2}");; -cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");; -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");; @@ -66,8 +70,8 @@ while [ "${#args[@]}" -ne 0 ]; do done # Convert to full paths: -HOST_COMPILER=$(which ${HOST_COMPILER}) -CUDA_COMPILER=$(which ${CUDA_COMPILER}) +HOST_COMPILER=$(which "${HOST_COMPILER}") +CUDA_COMPILER=$(which "${CUDA_COMPILER}") if [[ -n "${CUDA_ARCHS}" ]]; then GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}") @@ -91,11 +95,15 @@ BUILD_DIR="../build/${CCCL_BUILD_INFIX}" # The most recent build will always be symlinked to cccl/build/latest mkdir -p $BUILD_DIR -rm -f ../build/latest -ln -sf $BUILD_DIR ../build/latest - -# Now that BUILD_DIR exists, use readlink to canonicalize the path: -BUILD_DIR=$(readlink -f "${BUILD_DIR}") +if [[ "${HOST_OS}" == "windows" ]]; then + # Git Bash on Windows cannot create directory symlinks without elevated privileges + 
BUILD_DIR=$(cd "${BUILD_DIR}" && pwd) +else + rm -f ../build/latest + ln -sf $BUILD_DIR ../build/latest + # Now that BUILD_DIR exists, use readlink to canonicalize the path: + BUILD_DIR=$(readlink -f "${BUILD_DIR}") +fi # Prepare environment for CMake: export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}" diff --git a/cmake/NVBenchCUPTI.cmake b/cmake/NVBenchCUPTI.cmake index 10a70893..789f4af1 100644 --- a/cmake/NVBenchCUPTI.cmake +++ b/cmake/NVBenchCUPTI.cmake @@ -23,14 +23,20 @@ function(nvbench_add_cupti_dep dep_name) add_library(nvbench::${dep_name_lower} SHARED IMPORTED) find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED - DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit." + DOC "The import library for ${dep_name_lower} from the CUDA Toolkit." HINTS "${nvbench_cupti_root}/lib64" ) mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY) - set_target_properties(nvbench::${dep_name_lower} PROPERTIES - IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}" - ) + if (WIN32) + set_target_properties(nvbench::${dep_name_lower} PROPERTIES + IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}" + ) + else() + set_target_properties(nvbench::${dep_name_lower} PROPERTIES + IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}" + ) + endif() endfunction() nvbench_add_cupti_dep(cupti) diff --git a/cmake/NVBenchConfigTarget.cmake b/cmake/NVBenchConfigTarget.cmake index 7c8a4b93..536e9663 100644 --- a/cmake/NVBenchConfigTarget.cmake +++ b/cmake/NVBenchConfigTarget.cmake @@ -91,11 +91,25 @@ endif() if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # fmtlib uses llvm's _BitInt internally, which is not available when compiling through nvcc: target_compile_definitions(nvbench.build_interface INTERFACE "FMT_USE_BITINT=0") + if (MSVC) + # cudafe cannot evaluate fmtlib's UTF-8 literal check even when /utf-8 is passed to the host compiler: + target_compile_definitions(nvbench.build_interface INTERFACE + $<$:FMT_UNICODE=0> + ) + endif() endif() 
target_compile_options(nvbench.build_interface INTERFACE $<$:-Xcudafe=--display_error_number> $<$:-Wno-deprecated-gpu-targets> + $<$,$>:-Xcompiler=/utf-8> + # Suppress cudafe diagnostics triggered by fmtlib headers when compiled through MSVC+nvcc: + # 27: character value is out of range (char32_t sentinel values in lookup tables) + # 128: loop is not reachable (dead code in constexpr string comparison) + # 2417: constexpr constructor calls non-constexpr function (bigint default ctor) + $<$,$>:-Xcudafe=--diag_suppress=27> + $<$,$>:-Xcudafe=--diag_suppress=128> + $<$,$>:-Xcudafe=--diag_suppress=2417> ) if (NVBench_ENABLE_WERROR) target_compile_options(nvbench.build_interface INTERFACE @@ -115,8 +129,8 @@ function(nvbench_config_target target_name) # the library path, other times they're in a subdirectory that isn't added to # the library path... # To simplify installed nvbench usage, add the CUPTI libraries path to the - # installed nvbench rpath: - if (NVBench_ENABLE_CUPTI AND nvbench_cupti_root) + # installed nvbench rpath (Unix only; Windows uses PATH for DLL lookup): + if (NVBench_ENABLE_CUPTI AND nvbench_cupti_root AND NOT WIN32) set_target_properties(${target_name} PROPERTIES INSTALL_RPATH "${nvbench_cupti_root}/lib64" ) diff --git a/nvbench/config.cuh.in b/nvbench/config.cuh.in index d151c130..2f89f4cc 100644 --- a/nvbench/config.cuh.in +++ b/nvbench/config.cuh.in @@ -24,7 +24,11 @@ // Defined if NVBench has been built with CUPTI support. 
#cmakedefine NVBENCH_HAS_CUPTI +#if defined(_MSVC_LANG) +#define NVBENCH_CPLUSPLUS _MSVC_LANG +#else #define NVBENCH_CPLUSPLUS __cplusplus +#endif // Detect current dialect: #if NVBENCH_CPLUSPLUS < 201703L diff --git a/testing/axes_metadata.cu b/testing/axes_metadata.cu index 9e546602..d6cc441d 100644 --- a/testing/axes_metadata.cu +++ b/testing/axes_metadata.cu @@ -24,6 +24,7 @@ #include #include +#include #include #include "test_asserts.cuh" diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt index c4e4eb77..506de5a7 100644 --- a/testing/cmake/CMakeLists.txt +++ b/testing/cmake/CMakeLists.txt @@ -9,6 +9,14 @@ set(cmake_opts -D "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" -D "CMAKE_CUDA_ARCHITECTURES=${arches}" ) +if (WIN32) + list(APPEND cmake_opts + -D "CMAKE_CUDA_HOST_COMPILER=${CMAKE_CXX_COMPILER}" + -D "CMAKE_LINKER=${CMAKE_LINKER}" + -D "CMAKE_RC_COMPILER=${CMAKE_RC_COMPILER}" + -D "CMAKE_MT=${CMAKE_MT}" + ) +endif() # Temporary installation prefix for tests against installed nvbench: set(tmp_install_prefix "${CMAKE_CURRENT_BINARY_DIR}/test_nvbench_install") @@ -32,6 +40,13 @@ function(nvbench_add_compile_test full_test_name_var subdir test_id) ${ARGN} --test-command "${CMAKE_CTEST_COMMAND}" --output-on-failure ) + if (WIN32 AND NVBench_ENABLE_CUPTI AND nvbench_cupti_root) + cmake_path(NATIVE_PATH nvbench_cupti_root cupti_native) + cmake_path(NATIVE_PATH NVBench_EXECUTABLE_OUTPUT_DIR bin_native) + set_tests_properties(${test_name} PROPERTIES + ENVIRONMENT "PATH=${bin_native}\\;${cupti_native}\\lib64\\;$ENV{PATH}" + ) + endif() set(${full_test_name_var} ${test_name} PARENT_SCOPE) endfunction() diff --git a/testing/cmake/test_export/CMakeLists.txt b/testing/cmake/test_export/CMakeLists.txt index e3d7d33c..21faa30a 100644 --- a/testing/cmake/test_export/CMakeLists.txt +++ b/testing/cmake/test_export/CMakeLists.txt @@ -10,45 +10,53 @@ enable_testing() add_test(NAME test_bench COMMAND "$" --timeout 1) add_test(NAME nvbench_ctl COMMAND "$") -# Setup 
LD_LIBRARY_PATH for testing -if (UNIX) - set(ctl_lib_path "") - set(cupti_lib_path "") - - # Need to find installed libnvbench.so for installed nvbench-ctl. - # Not needed for build_tree test because of RUNPATH. - if (TEST_TYPE STREQUAL "INSTALL_TREE") - get_property(nvbench_config TARGET nvbench::nvbench - PROPERTY IMPORTED_CONFIGURATIONS - ) - - list(LENGTH nvbench_config num_configs) - if (num_configs GREATER 1) - message(WARNING - "Multiple IMPORTED_CONFIGURATIONS for nvbench::nvbench. " - "Picking the first one. This may cause issues." - ) - list(GET nvbench_config 0 nvbench_config) - endif() +# Setup runtime library paths for testing. +# Unix uses LD_LIBRARY_PATH; Windows uses PATH for DLL lookup. +get_property(nvbench_config TARGET nvbench::nvbench + PROPERTY IMPORTED_CONFIGURATIONS +) +list(LENGTH nvbench_config num_configs) +if (num_configs GREATER 1) + message(WARNING + "Multiple IMPORTED_CONFIGURATIONS for nvbench::nvbench. " + "Picking the first one. This may cause issues." + ) + list(GET nvbench_config 0 nvbench_config) +endif() - get_property(ctl_lib_path TARGET nvbench::nvbench - PROPERTY IMPORTED_LOCATION_${nvbench_config} - ) - cmake_path(GET ctl_lib_path PARENT_PATH ctl_lib_path) - endif() +set(nvbench_lib_dir "") +# On Unix the build tree uses RUNPATH so only the install tree needs the path. +# On Windows there is no RUNPATH so we always need the DLL directory. 
+if (WIN32 OR TEST_TYPE STREQUAL "INSTALL_TREE") + get_property(nvbench_lib TARGET nvbench::nvbench + PROPERTY IMPORTED_LOCATION_${nvbench_config} + ) + cmake_path(GET nvbench_lib PARENT_PATH nvbench_lib_dir) +endif() - # Need to add the CUPTI path to LD_LIBRARY_PATH to make sure CUPTI libraries - # are found at runtime: - if (TARGET nvbench::cupti) - get_property(cupti_lib_path TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION) - cmake_path(GET cupti_lib_path PARENT_PATH cupti_lib_path) +set(cupti_lib_dir "") +if (TARGET nvbench::cupti) + if (WIN32) + get_property(cupti_lib TARGET nvbench::cupti PROPERTY IMPORTED_IMPLIB) + else() + get_property(cupti_lib TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION) endif() + cmake_path(GET cupti_lib PARENT_PATH cupti_lib_dir) +endif() +if (WIN32) + set(lib_dirs "${nvbench_lib_dir}\\;${cupti_lib_dir}") set_property(TEST test_bench PROPERTY - ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_path}" + ENVIRONMENT "PATH=${lib_dirs}\\;$ENV{PATH}" ) set_property(TEST nvbench_ctl PROPERTY - ENVIRONMENT "LD_LIBRARY_PATH=${ctl_lib_path}:${cupti_lib_path}" + ENVIRONMENT "PATH=${lib_dirs}\\;$ENV{PATH}" + ) +else() + set_property(TEST test_bench PROPERTY + ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_dir}" + ) + set_property(TEST nvbench_ctl PROPERTY + ENVIRONMENT "LD_LIBRARY_PATH=${nvbench_lib_dir}:${cupti_lib_dir}" ) - endif() From 787e435e6d5201905e798a23ab067e680d94f3b1 Mon Sep 17 00:00:00 2001 From: Marco Franzreb Salgado Date: Wed, 13 May 2026 10:38:55 -0700 Subject: [PATCH 2/6] Windows support: revert CI specific changes --- ci/build_common.sh | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/ci/build_common.sh b/ci/build_common.sh index 718e22fa..2c30414a 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -12,7 +12,6 @@ CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc` CUDA_ARCHS= # Empty, use presets by default. 
GLOBAL_CMAKE_OPTIONS=() DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks. -HOST_OS="linux" # "linux" or "windows" # Check if the correct number of arguments has been provided function usage { @@ -22,7 +21,6 @@ function usage { echo echo "Options:" echo " -v/--verbose: enable shell echo for debugging" - echo " -os: Target OS, \"linux\" or \"windows\" (Defaults to linux)" echo " -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" echo " -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " -std: CUDA/C++ standard (Defaults to 17)" @@ -34,7 +32,6 @@ function usage { echo " $ PARALLEL_LEVEL=8 $0 -cxx g++-9" echo " $ $0 -cxx clang++-8" echo " $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc" - echo " $ $0 -os windows -cxx cl.exe -arch native" echo " $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\"" exit 1 } @@ -47,7 +44,6 @@ args=("$@") while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; - -os) HOST_OS="${args[1]}"; args=("${args[@]:2}");; -cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");; -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");; @@ -70,8 +66,8 @@ while [ "${#args[@]}" -ne 0 ]; do done # Convert to full paths: -HOST_COMPILER=$(which "${HOST_COMPILER}") -CUDA_COMPILER=$(which "${CUDA_COMPILER}") +HOST_COMPILER=$(which ${HOST_COMPILER}) +CUDA_COMPILER=$(which ${CUDA_COMPILER}) if [[ -n "${CUDA_ARCHS}" ]]; then GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}") @@ -95,15 +91,11 @@ BUILD_DIR="../build/${CCCL_BUILD_INFIX}" # The most recent build will always be symlinked to cccl/build/latest mkdir -p $BUILD_DIR -if [[ "${HOST_OS}" == "windows" ]]; then - # Git Bash on Windows cannot create directory symlinks without elevated privileges - BUILD_DIR=$(cd "${BUILD_DIR}" && pwd) -else - rm -f ../build/latest - ln -sf 
$BUILD_DIR ../build/latest - # Now that BUILD_DIR exists, use readlink to canonicalize the path: - BUILD_DIR=$(readlink -f "${BUILD_DIR}") -fi +rm -f ../build/latest +ln -sf $BUILD_DIR ../build/latest + +# Now that BUILD_DIR exists, use readlink to canonicalize the path: +BUILD_DIR=$(readlink -f "${BUILD_DIR}") # Prepare environment for CMake: export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}" From 78b674bf2e125e3c34f65ab221c0001adfd96535 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 15 May 2026 15:27:37 -0500 Subject: [PATCH 3/6] Re- Enable NVBench Windows build job Remove gate that disables Windows NVBench build job in pr.yaml --- .github/workflows/pr.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7ea85397..40427264 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -79,8 +79,6 @@ jobs: nvbench-windows: name: NVBench Windows CUDA${{ matrix.config.cuda }} ${{ matrix.config.host }} C++${{ matrix.config.std }} - # TODO: Re-enable after https://github.com/NVIDIA/nvbench/pull/354 fixes the Windows build. - if: false permissions: id-token: write contents: read From 0ca8414417c4a24c98a876c736c7a57619b94893 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Sat, 16 May 2026 08:43:08 -0500 Subject: [PATCH 4/6] Fixes to support build on Windows 1. Install CUDA Profiler API into toolkit matching to what is installed in dev-container 2. Pass linker argument to use main from static nvbench_main library when linking examples and tests 3. Instruct MSVC to use standard-compliant preprocessor 4. Use environment modification for targets to help them find shared libraries needed as runtime, such as CUPTI on Windows/Linux. 
Remainder is aggregation of 53 individual commit messages Install CUDA Profiler API into toolkit Add install_cuda_profiler_api.ps1 Inform MSVC that static library exports main Attempt to fix "LINK : fatal error LNK1561: entry point must be defined" when building benchmarks which need main function provided by static library libnvbench_main after #350 Review feedback to PowerShell script Fix how CMAKE_CUDA_HOST_COMPILER is set in call to cmake Filter out empty directories LD_LIBRARY_PATH/PATH Act on review feedback regarding corner cases when testing may depend on the directory it is performed from Check that cudaVersion and :CUDA_PATH are consistent Do not overwrite ENVIRONMENT property with empty values Implement retry logic in downloading of CUDA Profiler API Strengthen publisher verification of downloaded artifact Prepend new folders to LD_LIBRARY_PATH, do not overwrite Implement timeout, fail on 40x HTTP response code 4xx responses now fail immediately, and the installer is bounded to 15 minutes before being killed and reported as a timeout. USE ENVIRONMENT_MODIFICATION property, not ENVIRONMENT escape environment modification values Fix cmake script error breaking the build Added recommended timeout to Invoke-WebRequest Set cmake_minimum_required version to 3.30.4, consistent with main project Pass NVBENCH environment variables through docker for Windows build Export IMPORTLIB_LOCATION for CUPTI on Windows and use in testing projects Add Zc:preprocessor to host compiler on Windows.
Configure runtime env for tests to find CUPTI library Better fix to add /Zc:preprocessor that also propagates to header testing target Address code rabbit concern Validate before casting in PowerShell script decouple nvbench runtime path setup from cupti target detection Normalize multiple ARCH args Better validation of gpu_args parameter use get_imported_location to get CUPTI library to improve multi-config support Validation of combinations of gpu, run_tests and device_testing Resolve code-rabbit concern in handling multiple imported configurations to match build type, if set Reject GPU requests for forks Prevents installing cuda_profiler_api.h into one toolkit while CMake builds with another. Fail fast for deterministic client errors returned by download request more robust imported_location computation Make Linux also use ENVIRONMENT_MODIFICATION to simplify code run_tests=false is not allowed when device_testing=true Specify Windows CUDA toolkit version major.minor.patch, derive devcontainer tag from full spec Handle edge case when multiple CUPTI dlls exist, pick up, warn, do not fail Always specify -DNVBench_ENABLE_DEVICE_TESTING=VAL per value of Back to cuda major.minor being input What CUDA Profiler API to install is determined from redist information stored in version.json stored at root of CUDA Toolkit. If version.json is not found, an error occurs Remove parameters intended to enable testing builds on Windows.
Deferred for future work Handle import nvbench::nvbench the same as nvbench target in NVBenchConfigTarget Forward cmake variables only if set Use UTF-8 encoding when appending to GITHUB_OUTPUT Avoid power-shell footgun where local variable shadows builtin variable due to case insensitivity enable device testing parameter in build_nvbench, passed as True by workflow Lower CMake version required as much as possible LINKER:/INCLUDE:main for proper CUDA link driver routing Add conda-specific hints for find_library call to find CUPTI test_export must require 3.22 version ENVIRONMENT_MODIFICATION feature was added in 3.22.0 https://cmake.org/cmake/help/latest/prop_test/ENVIRONMENT_MODIFICATION.html Delete unused function Test-Preset Guard the CUPTI runtime path extraction Check before executing cmake_path() in testing/cmake/CMakeLists.txt Also, use nvbench_get_imported_location to extract imported location use the config-aware generator expression for all runtime targets Remove the configure-time imported-location helper entirely. 
Deduplicate WINDOWS_CI_IMAGE construction --- .github/workflows/build-windows.yml | 51 ++- ci/windows/build_nvbench.ps1 | 10 +- ci/windows/install_cuda_profiler_api.ps1 | 519 +++++++++++++++++++++++ cmake/NVBenchCUPTI.cmake | 73 +++- cmake/NVBenchConfigTarget.cmake | 39 ++ examples/CMakeLists.txt | 1 + exec/CMakeLists.txt | 42 +- nvbench/CMakeLists.txt | 10 + testing/CMakeLists.txt | 1 + testing/cmake/CMakeLists.txt | 30 +- testing/cmake/test_export/CMakeLists.txt | 120 ++++-- testing/device/CMakeLists.txt | 1 + 12 files changed, 837 insertions(+), 60 deletions(-) create mode 100644 ci/windows/install_cuda_profiler_api.ps1 diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 777d2665..a61c4fb5 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -38,7 +38,7 @@ on: workflow_dispatch: inputs: cuda: - description: "CUDA Toolkit version" + description: "CUDA Toolkit major.minor version" type: string required: false default: "13.0" @@ -113,14 +113,50 @@ jobs: aws-region: us-east-2 role-duration-seconds: 43200 + - name: Validate Windows build inputs + id: validate_windows_build_inputs + env: + NVBENCH_WINDOWS_CUDA: ${{ inputs.cuda }} + NVBENCH_WINDOWS_STD: ${{ inputs.std }} + NVBENCH_WINDOWS_ARCH: ${{ inputs.arch }} + run: | + $ErrorActionPreference = "Stop" + + if ($env:NVBENCH_WINDOWS_CUDA -notmatch '^\d+\.\d+$') { + throw "Invalid CUDA version '$env:NVBENCH_WINDOWS_CUDA'. Expected '.', for example '13.0'." + } + + if (@("17", "20") -notcontains $env:NVBENCH_WINDOWS_STD) { + throw "Invalid C++ standard '$env:NVBENCH_WINDOWS_STD'. Expected '17' or '20'." + } + + $arch = "$env:NVBENCH_WINDOWS_ARCH".Trim() + $normalizedArch = $arch + if ($arch) { + if (@("all", "all-major", "native") -notcontains $arch) { + $archItems = @($arch -split '[;,]' | ForEach-Object { $_.Trim() } | Where-Object { $_ }) + if ($archItems.Length -eq 0) { + throw "Invalid CMAKE_CUDA_ARCHITECTURES value '$arch'. 
Expected empty, 'all', 'all-major', 'native', or a list like '80;90-real'." + } + foreach ($archItem in $archItems) { + if ($archItem -notmatch '^\d{2,3}(-real|-virtual)?$') { + throw "Invalid CMAKE_CUDA_ARCHITECTURES value '$arch'. Expected empty, 'all', 'all-major', 'native', or a list like '80;90-real'." + } + } + $normalizedArch = $archItems -join ';' + } + } + "arch=$normalizedArch" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append + - name: Fetch Windows devcontainer image run: | docker pull "$env:WINDOWS_CI_IMAGE" - name: Build NVBench env: + NVBENCH_WINDOWS_CUDA: ${{ inputs.cuda }} NVBENCH_WINDOWS_STD: ${{ inputs.std }} - NVBENCH_WINDOWS_ARCH: ${{ inputs.arch }} + NVBENCH_WINDOWS_ARCH: ${{ steps.validate_windows_build_inputs.outputs.arch }} run: | $ErrorActionPreference = "Stop" @@ -131,7 +167,8 @@ jobs: @" `$ErrorActionPreference = 'Stop' git config --global --add safe.directory '$containerRepo' - & '$containerRepo/ci/windows/build_nvbench.ps1' -std '$env:NVBENCH_WINDOWS_STD' -arch '$env:NVBENCH_WINDOWS_ARCH' + & '$containerRepo/ci/windows/install_cuda_profiler_api.ps1' -cudaVersion "`$env:NVBENCH_WINDOWS_CUDA" + & '$containerRepo/ci/windows/build_nvbench.ps1' -std "`$env:NVBENCH_WINDOWS_STD" -arch "`$env:NVBENCH_WINDOWS_ARCH" -device-testing `$true exit `$LASTEXITCODE "@ | Set-Content -Path $script -Encoding UTF8 @@ -152,13 +189,19 @@ jobs: "--env", "GITHUB_REPOSITORY=$env:GITHUB_REPOSITORY", "--env", "GITHUB_RUN_ID=$env:GITHUB_RUN_ID", "--env", "GITHUB_SHA=$env:GITHUB_SHA", + "--env", "NVBENCH_WINDOWS_ARCH=$env:NVBENCH_WINDOWS_ARCH", + "--env", "NVBENCH_WINDOWS_CUDA=$env:NVBENCH_WINDOWS_CUDA", + "--env", "NVBENCH_WINDOWS_STD=$env:NVBENCH_WINDOWS_STD", "--env", "SCCACHE_BUCKET=$env:SCCACHE_BUCKET", "--env", "SCCACHE_IDLE_TIMEOUT=$env:SCCACHE_IDLE_TIMEOUT", "--env", "SCCACHE_REGION=$env:SCCACHE_REGION", "--env", "SCCACHE_S3_NO_CREDENTIALS=$env:SCCACHE_S3_NO_CREDENTIALS", "--env", 
"SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX=$env:SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX", "--env", "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=$env:SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", - "--env", "SCCACHE_S3_USE_SSL=$env:SCCACHE_S3_USE_SSL", + "--env", "SCCACHE_S3_USE_SSL=$env:SCCACHE_S3_USE_SSL" + ) + + $dockerArgs += @( "$env:WINDOWS_CI_IMAGE", "powershell", "-NoLogo", "-NoProfile", "-ExecutionPolicy", "Bypass", "-File", $containerScript diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1 index a39b0b34..c1805267 100644 --- a/ci/windows/build_nvbench.ps1 +++ b/ci/windows/build_nvbench.ps1 @@ -11,7 +11,11 @@ Param( [Parameter(Mandatory = $false)] [Alias("cmake-options")] - [string]$CMAKE_OPTIONS = "" + [string]$CMAKE_OPTIONS = "", + + [Parameter(Mandatory = $false)] + [Alias("device-testing")] + [bool]$DEVICE_TESTING = $false ) $ErrorActionPreference = "Stop" @@ -30,9 +34,11 @@ try { Print-EnvironmentDetails $preset = "nvbench-ci" + $deviceTestingOption = if ($DEVICE_TESTING) { "ON" } else { "OFF" } $localOptions = @( "-DCMAKE_CXX_STANDARD=$CXX_STANDARD", - "-DCMAKE_CUDA_STANDARD=$CXX_STANDARD" + "-DCMAKE_CUDA_STANDARD=$CXX_STANDARD", + "-DNVBench_ENABLE_DEVICE_TESTING=$deviceTestingOption" ) Configure-And-Build-Preset "NVBench" $preset $localOptions diff --git a/ci/windows/install_cuda_profiler_api.ps1 b/ci/windows/install_cuda_profiler_api.ps1 new file mode 100644 index 00000000..556a32ee --- /dev/null +++ b/ci/windows/install_cuda_profiler_api.ps1 @@ -0,0 +1,519 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +Param( + [Parameter(Mandatory = $false)] + [Alias("cudaVersion")] + [string]$CUDA_VERSION = "" +) + +$ErrorActionPreference = "Stop" + +$RedistRootUri = "https://developer.download.nvidia.com/compute/cuda/redist" + +function Get-CudaVersionFromPath { + Param( + [Parameter(Mandatory = $false)] + [string]$Path = "" + ) + + if ($Path -and $Path -match "v(?\d+\.\d+)[\\/]?$") { + return $Matches.version + } + + return "" +} + +function Get-CudaRootFromNvcc { + $nvccCommand = Get-Command "nvcc.exe" -ErrorAction SilentlyContinue + if (-not $nvccCommand) { + return "" + } + + $nvccPath = $nvccCommand.Source + $binDir = Split-Path -Parent $nvccPath + if ((Split-Path -Leaf $binDir) -ne "bin") { + throw "Could not derive CUDA root from nvcc.exe path: $nvccPath" + } + + return Split-Path -Parent $binDir +} + +function Assert-SamePath { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Left, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Right, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Message + ) + + $leftFullPath = [System.IO.Path]::GetFullPath($Left).TrimEnd('\', '/') + $rightFullPath = [System.IO.Path]::GetFullPath($Right).TrimEnd('\', '/') + if ($leftFullPath -ne $rightFullPath) { + throw "$Message Left='$leftFullPath' Right='$rightFullPath'" + } +} + +function Get-HttpStatusCodeFromError { + Param( + [Parameter(Mandatory = $true)] + $ErrorRecord + ) + + $responseProperty = $ErrorRecord.Exception.PSObject.Properties["Response"] + if (-not $responseProperty) { + return $null + } + + $response = $responseProperty.Value + if ($null -eq $response) { + return $null + } + + $statusCodeProperty = $response.PSObject.Properties["StatusCode"] + if (-not $statusCodeProperty) { + return $null + } + + return [int]$statusCodeProperty.Value +} + +function Invoke-WebRequestWithRetry { + Param( + [Parameter(Mandatory = $true)] + 
[ValidateNotNullOrEmpty()] + [string]$Uri, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$OutFile, + + [Parameter(Mandatory = $false)] + [ValidateRange(1, 10)] + [int]$MaxAttempts = 3 + ) + + for ($attempt = 1; $attempt -le $MaxAttempts; $attempt++) { + try { + Remove-Item $OutFile -ErrorAction SilentlyContinue + Invoke-WebRequest -Uri $Uri -OutFile $OutFile -UseBasicParsing -TimeoutSec 300 + return + } catch { + $statusCode = Get-HttpStatusCodeFromError -ErrorRecord $_ + # Fail fast for deterministic client errors that indicate a bad URL, + # missing package, or unsupported method. Keep 408/429 and 5xx on + # the retry path because they are commonly transient in CI. + if (@(400, 401, 403, 404, 405, 410, 414) -contains $statusCode) { + throw "Download failed with non-retryable HTTP status $statusCode from '$Uri'. $_" + } + + if ($attempt -eq $MaxAttempts) { + throw + } + + $delaySeconds = 5 * $attempt + Write-Warning "Download failed on attempt $attempt of $MaxAttempts. Retrying in $delaySeconds seconds. $_" + Start-Sleep -Seconds $delaySeconds + } + } +} + +function Read-JsonFile { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Path + ) + + try { + $content = Get-Content -LiteralPath $Path -Raw + $json = $content | ConvertFrom-Json + return $json + } catch { + throw "Failed to parse JSON file '$Path'. 
$_" + } +} + +function Get-JsonPropertyValue { + Param( + [Parameter(Mandatory = $true)] + $Object, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Name + ) + + if ($null -eq $Object) { + return $null + } + + $property = $Object.PSObject.Properties[$Name] + if (-not $property) { + return $null + } + + return $property.Value +} + +function Get-ComponentVersion { + Param( + [Parameter(Mandatory = $true)] + $JsonObject, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$ComponentName + ) + + $component = Get-JsonPropertyValue -Object $JsonObject -Name $ComponentName + if ($null -eq $component) { + return "" + } + + $version = Get-JsonPropertyValue -Object $component -Name "version" + if ($null -eq $version) { + return "" + } + + return [string]$version +} + +function Get-CudaVersionFromRoot { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$CudaRoot + ) + + $pathVersion = Get-CudaVersionFromPath -Path $CudaRoot + if ($pathVersion) { + return $pathVersion + } + + $versionJson = Join-Path $CudaRoot "version.json" + if (Test-Path $versionJson) { + $versionData = Read-JsonFile -Path $versionJson + $cudaVersion = Get-ComponentVersion -JsonObject $versionData -ComponentName "cuda" + if ($cudaVersion -match '^(?\d+\.\d+)(\.|$)') { + return $Matches.version + } + } + + return "" +} + +function Assert-Sha256 { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$Path, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$ExpectedSha256 + ) + + $actualSha256 = (Get-FileHash -LiteralPath $Path -Algorithm SHA256).Hash.ToLowerInvariant() + $expectedSha256 = $ExpectedSha256.ToLowerInvariant() + if ($actualSha256 -ne $expectedSha256) { + throw "SHA256 mismatch for '$Path'. Expected '$expectedSha256', got '$actualSha256'." 
+ } + + Write-Host "Validated SHA256 for '$Path': $actualSha256" +} + +function Get-RedistribManifestNames { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$CudaVersionTag + ) + + $indexFile = Join-Path $env:TEMP "cuda_redist_index_$PID.html" + try { + Invoke-WebRequestWithRetry -Uri "$RedistRootUri/" -OutFile $indexFile + $indexContent = Get-Content -LiteralPath $indexFile -Raw + } finally { + Remove-Item $indexFile -ErrorAction SilentlyContinue + } + + $pattern = "redistrib_$([regex]::Escape($CudaVersionTag))\.\d+\.json" + $manifestNames = @( + [regex]::Matches($indexContent, $pattern) | + ForEach-Object { $_.Value } | + Sort-Object -Unique + ) + + if ($manifestNames.Count -eq 0) { + throw "No CUDA $CudaVersionTag redistrib manifests were found at $RedistRootUri." + } + + return @( + $manifestNames | + ForEach-Object { + [PSCustomObject]@{ + Name = $_ + Version = [Version](($_ -replace '^redistrib_', '') -replace '\.json$', '') + } + } | + Sort-Object -Property Version -Descending | + ForEach-Object { $_.Name } + ) +} + +function Read-RedistManifest { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$ManifestName + ) + + $manifestFile = Join-Path $env:TEMP $ManifestName + try { + Invoke-WebRequestWithRetry -Uri "$RedistRootUri/$ManifestName" -OutFile $manifestFile + return Read-JsonFile -Path $manifestFile + } finally { + Remove-Item $manifestFile -ErrorAction SilentlyContinue + } +} + +function Select-ProfilerApiManifest { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$CudaVersionTag, + + [Parameter(Mandatory = $true)] + $VersionData + ) + + $localProfilerApiVersion = Get-ComponentVersion ` + -JsonObject $VersionData ` + -ComponentName "cuda_profiler_api" + $manifestNames = Get-RedistribManifestNames -CudaVersionTag $CudaVersionTag + + if ($localProfilerApiVersion) { + Write-Host "CUDA version metadata reports cuda_profiler_api $localProfilerApiVersion." 
+ } else { + Write-Host "CUDA version metadata does not report cuda_profiler_api; matching by installed core components." + } + + $matchComponents = @("cuda_cupti", "cuda_cudart", "cuda_nvcc", "cuda_cccl") + $bestCandidate = $null + + foreach ($manifestName in $manifestNames) { + $manifest = Read-RedistManifest -ManifestName $manifestName + $manifestProfilerApiVersion = Get-ComponentVersion ` + -JsonObject $manifest ` + -ComponentName "cuda_profiler_api" + + if (-not $manifestProfilerApiVersion) { + continue + } + + if ($localProfilerApiVersion) { + if ($manifestProfilerApiVersion -eq $localProfilerApiVersion) { + Write-Host "Selected CUDA redist manifest $manifestName." + return [PSCustomObject]@{ + Name = $manifestName + Manifest = $manifest + } + } + continue + } + + $componentMatches = 0 + $mismatches = @() + foreach ($componentName in $matchComponents) { + $localVersion = Get-ComponentVersion ` + -JsonObject $VersionData ` + -ComponentName $componentName + $manifestVersion = Get-ComponentVersion ` + -JsonObject $manifest ` + -ComponentName $componentName + + if (-not $localVersion -or -not $manifestVersion) { + continue + } + + if ($localVersion -eq $manifestVersion) { + $componentMatches++ + } else { + $mismatches += "$componentName local=$localVersion manifest=$manifestVersion" + } + } + + if ($componentMatches -gt 0 -and $mismatches.Count -eq 0) { + if ($null -eq $bestCandidate -or $componentMatches -gt $bestCandidate.MatchCount) { + $bestCandidate = [PSCustomObject]@{ + Name = $manifestName + Manifest = $manifest + MatchCount = $componentMatches + } + } + } + } + + if ($localProfilerApiVersion) { + throw "Could not find a CUDA $CudaVersionTag redistrib manifest with cuda_profiler_api $localProfilerApiVersion." + } + + if ($null -eq $bestCandidate) { + throw "Could not match installed CUDA Toolkit component versions to a CUDA $CudaVersionTag redistrib manifest." 
+ } + + Write-Host "Selected CUDA redist manifest $($bestCandidate.Name) using $($bestCandidate.MatchCount) component version match(es)." + return [PSCustomObject]@{ + Name = $bestCandidate.Name + Manifest = $bestCandidate.Manifest + } +} + +function Get-PayloadRoot { + Param( + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$ExtractDir + ) + + $directories = @(Get-ChildItem -LiteralPath $ExtractDir -Directory) + $files = @(Get-ChildItem -LiteralPath $ExtractDir -File) + if ($directories.Count -eq 1 -and $files.Count -eq 0) { + return $directories[0].FullName + } + + return $ExtractDir +} + +function Install-ProfilerApiPackage { + Param( + [Parameter(Mandatory = $true)] + $ManifestSelection, + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [string]$CudaRoot + ) + + $component = Get-JsonPropertyValue ` + -Object $ManifestSelection.Manifest ` + -Name "cuda_profiler_api" + if ($null -eq $component) { + throw "Manifest $($ManifestSelection.Name) does not contain cuda_profiler_api." + } + + $package = Get-JsonPropertyValue -Object $component -Name "windows-x86_64" + if ($null -eq $package) { + throw "Manifest $($ManifestSelection.Name) does not contain cuda_profiler_api for windows-x86_64." + } + + $relativePath = Get-JsonPropertyValue -Object $package -Name "relative_path" + $expectedSha256 = Get-JsonPropertyValue -Object $package -Name "sha256" + if (-not $relativePath -or -not $expectedSha256) { + throw "Manifest $($ManifestSelection.Name) is missing cuda_profiler_api relative_path or sha256." 
+ } + if ($relativePath -notmatch '^cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-[^/]+-archive\.zip$') { + throw "Unexpected cuda_profiler_api package path in $($ManifestSelection.Name): $relativePath" + } + + $pathParts = $relativePath -split '/' + $archiveName = $pathParts[$pathParts.Length - 1] + $archive = Join-Path $env:TEMP $archiveName + $extractDir = Join-Path $env:TEMP "cuda_profiler_api_$([Guid]::NewGuid().ToString('N'))" + $archiveUri = "$RedistRootUri/$relativePath" + + try { + Write-Host "Downloading CUDA Profiler API redist package: $archiveUri" + Invoke-WebRequestWithRetry -Uri $archiveUri -OutFile $archive + Assert-Sha256 -Path $archive -ExpectedSha256 $expectedSha256 + + Expand-Archive -LiteralPath $archive -DestinationPath $extractDir -Force + $payloadRoot = Get-PayloadRoot -ExtractDir $extractDir + $payloadHeader = Join-Path $payloadRoot "include\cuda_profiler_api.h" + if (-not (Test-Path $payloadHeader)) { + throw "CUDA Profiler API archive did not contain expected header: $payloadHeader" + } + + Write-Host "Installing CUDA Profiler API package into: $CudaRoot" + Copy-Item -Path (Join-Path $payloadRoot "*") -Destination $CudaRoot -Recurse -Force + } finally { + Remove-Item $archive -ErrorAction SilentlyContinue + Remove-Item $extractDir -Recurse -Force -ErrorAction SilentlyContinue + } +} + +if (-not $CUDA_VERSION) { + throw "CUDA Toolkit version is required. Provide -cudaVersion ., for example '13.0'." +} + +if ($CUDA_VERSION -notmatch '^\d+\.\d+$') { + throw "Invalid CUDA Toolkit version '$CUDA_VERSION'. Expected '.', for example '13.0'." 
+} + +$version = [Version]$CUDA_VERSION +$mmVersionTag = "$($version.Major).$($version.Minor)" + +$nvccCudaRoot = Get-CudaRootFromNvcc +if ($nvccCudaRoot) { + $nvccCudaVersion = Get-CudaVersionFromRoot -CudaRoot $nvccCudaRoot + if (-not $nvccCudaVersion) { + throw "Could not determine CUDA version from active nvcc.exe root: $nvccCudaRoot" + } + if ($nvccCudaVersion -ne $mmVersionTag) { + throw "Active nvcc.exe is from CUDA $nvccCudaVersion, but CUDA $mmVersionTag was requested." + } +} + +if ($env:CUDA_PATH) { + $cudaPathVersion = Get-CudaVersionFromRoot -CudaRoot $env:CUDA_PATH + if (-not $cudaPathVersion) { + throw "Could not determine CUDA version from CUDA_PATH: $env:CUDA_PATH" + } + if ($cudaPathVersion -ne $mmVersionTag) { + throw "CUDA_PATH points to CUDA $cudaPathVersion, but CUDA $mmVersionTag was requested." + } + if ($nvccCudaRoot) { + Assert-SamePath ` + -Left $env:CUDA_PATH ` + -Right $nvccCudaRoot ` + -Message "CUDA_PATH and active nvcc.exe point to different CUDA Toolkit roots." + } + $cudaRoot = $env:CUDA_PATH +} elseif ($nvccCudaRoot) { + $cudaRoot = $nvccCudaRoot +} else { + $cudaRoot = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$mmVersionTag" +} + +$profilerHeader = Join-Path $cudaRoot "include\cuda_profiler_api.h" +if (Test-Path $profilerHeader) { + Write-Host "CUDA Profiler API is already installed: $profilerHeader" + return +} + +$versionJson = Join-Path $cudaRoot "version.json" +if (-not (Test-Path $versionJson)) { + throw "CUDA Toolkit version metadata was not found: $versionJson. Cannot determine the matching cuda_profiler_api redist package." 
+} + +$versionData = Read-JsonFile -Path $versionJson +$manifestSelection = Select-ProfilerApiManifest ` + -CudaVersionTag $mmVersionTag ` + -VersionData $versionData +Install-ProfilerApiPackage ` + -ManifestSelection $manifestSelection ` + -CudaRoot $cudaRoot + +if (-not (Test-Path $profilerHeader)) { + throw "CUDA Profiler API installation completed, but header was not found: $profilerHeader" +} + +Write-Host "CUDA Profiler API installed: $profilerHeader" diff --git a/cmake/NVBenchCUPTI.cmake b/cmake/NVBenchCUPTI.cmake index 789f4af1..18a25bd1 100644 --- a/cmake/NVBenchCUPTI.cmake +++ b/cmake/NVBenchCUPTI.cmake @@ -12,10 +12,73 @@ else() set(nvbench_cupti_root "${CUDAToolkit_LIBRARY_ROOT}") endif() +set(nvbench_cupti_library_hints "${nvbench_cupti_root}/lib64") +if (WIN32) + list(APPEND nvbench_cupti_library_hints + "${nvbench_cupti_root}/lib/x64" + "${nvbench_cupti_root}/lib" + ) +endif() + # The CUPTI targets in FindCUDAToolkit are broken: # - The dll locations are not specified # - Dependent libraries nvperf_* are not linked. 
# So we create our own targets: +function(nvbench_find_windows_cupti_runtime_library out_var dep_name library_path) + cmake_path(GET library_path PARENT_PATH library_dir) + set(runtime_search_dirs "${library_dir}") + + if ("${library_dir}" MATCHES "/Library/lib/x64$") + cmake_path(GET library_dir PARENT_PATH conda_lib_dir) + cmake_path(GET conda_lib_dir PARENT_PATH conda_library_dir) + list(APPEND runtime_search_dirs "${conda_library_dir}/bin") + elseif ("${library_dir}" MATCHES "/Library/lib$") + cmake_path(GET library_dir PARENT_PATH conda_library_dir) + list(APPEND runtime_search_dirs "${conda_library_dir}/bin") + endif() + + list(REMOVE_DUPLICATES runtime_search_dirs) + + foreach(runtime_search_dir IN LISTS runtime_search_dirs) + if ("${dep_name}" STREQUAL "cupti") + file(GLOB runtime_libraries LIST_DIRECTORIES false + "${runtime_search_dir}/cupti64_*.dll" + ) + if (NOT runtime_libraries) + file(GLOB runtime_libraries LIST_DIRECTORIES false + "${runtime_search_dir}/cupti.dll" + ) + endif() + else() + file(GLOB runtime_libraries LIST_DIRECTORIES false + "${runtime_search_dir}/${dep_name}.dll" + ) + endif() + + if (runtime_libraries) + list(SORT runtime_libraries COMPARE NATURAL ORDER DESCENDING) + list(LENGTH runtime_libraries num_runtime_libraries) + if (num_runtime_libraries GREATER 1) + list(GET runtime_libraries 0 runtime_library) + message(WARNING + "Found multiple runtime DLLs for ${dep_name}; selecting " + "${runtime_library}. Candidates: ${runtime_libraries}" + ) + else() + list(GET runtime_libraries 0 runtime_library) + endif() + + set(${out_var} "${runtime_library}" PARENT_SCOPE) + return() + endif() + endforeach() + + message(FATAL_ERROR + "Could not find the runtime DLL for ${dep_name}. 
" + "Searched these directories: ${runtime_search_dirs}" + ) +endfunction() + function(nvbench_add_cupti_dep dep_name) string(TOLOWER ${dep_name} dep_name_lower) string(TOUPPER ${dep_name} dep_name_upper) @@ -23,14 +86,20 @@ function(nvbench_add_cupti_dep dep_name) add_library(nvbench::${dep_name_lower} SHARED IMPORTED) find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED - DOC "The import library for ${dep_name_lower} from the CUDA Toolkit." - HINTS "${nvbench_cupti_root}/lib64" + DOC "The library for ${dep_name_lower} from the CUDA Toolkit." + HINTS ${nvbench_cupti_library_hints} ) mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY) if (WIN32) + nvbench_find_windows_cupti_runtime_library( + NVBench_${dep_name_upper}_DLL + ${dep_name_lower} + "${NVBench_${dep_name_upper}_LIBRARY}" + ) set_target_properties(nvbench::${dep_name_lower} PROPERTIES IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}" + IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}" ) else() set_target_properties(nvbench::${dep_name_lower} PROPERTIES diff --git a/cmake/NVBenchConfigTarget.cmake b/cmake/NVBenchConfigTarget.cmake index 536e9663..74652df1 100644 --- a/cmake/NVBenchConfigTarget.cmake +++ b/cmake/NVBenchConfigTarget.cmake @@ -136,3 +136,42 @@ function(nvbench_config_target target_name) ) endif() endfunction() + +function(nvbench_append_test_runtime_path path_modifications_var target_name) + if (NOT TARGET ${target_name}) + return() + endif() + + list(APPEND ${path_modifications_var} + "PATH=path_list_prepend:$" + ) + + set(${path_modifications_var} + "${${path_modifications_var}}" + PARENT_SCOPE + ) +endfunction() + +function(nvbench_config_test_runtime_environment test_name) + if (NOT WIN32) + return() + endif() + + set(path_modifications "") + if (TARGET nvbench) + nvbench_append_test_runtime_path(path_modifications nvbench) + else() + nvbench_append_test_runtime_path(path_modifications nvbench::nvbench) + endif() + + 
nvbench_append_test_runtime_path(path_modifications nvbench::cupti) + nvbench_append_test_runtime_path(path_modifications nvbench::nvperf_target) + nvbench_append_test_runtime_path(path_modifications nvbench::nvperf_host) + + if (path_modifications) + list(REMOVE_DUPLICATES path_modifications) + set_property(TEST ${test_name} + APPEND PROPERTY ENVIRONMENT_MODIFICATION ${path_modifications} + ) + endif() +endfunction() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 061f8eb5..2abe3c7d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,7 @@ function (nvbench_add_examples_target target_prefix cuda_std) add_test(NAME ${example_name} COMMAND "$" ${example_args}) + nvbench_config_test_runtime_environment(${example_name}) # These should not deadlock. If they do, it may be that the CUDA context was created before # setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136. diff --git a/exec/CMakeLists.txt b/exec/CMakeLists.txt index 775dccc9..7a9e88ec 100644 --- a/exec/CMakeLists.txt +++ b/exec/CMakeLists.txt @@ -9,35 +9,57 @@ add_dependencies(nvbench.all nvbench.ctl) nvbench_install_executables(nvbench.ctl) if (NVBench_ENABLE_TESTING) + set(ctl_test_names) + # Test: nvbench - add_test(NAME nvbench.ctl.no_args COMMAND "$") + set(test_name nvbench.ctl.no_args) + add_test(NAME ${test_name} COMMAND "$") + list(APPEND ctl_test_names ${test_name}) # Should print the version without any args: - set_property(TEST nvbench.ctl.no_args + set_property(TEST ${test_name} PROPERTY PASS_REGULAR_EXPRESSION "NVBench v" ) # Test: nvbench --version - add_test(NAME nvbench.ctl.version COMMAND "$" --version) + set(test_name nvbench.ctl.version) + add_test(NAME ${test_name} COMMAND "$" --version) + list(APPEND ctl_test_names ${test_name}) # Should print the version without any args: - set_property(TEST nvbench.ctl.version + set_property(TEST ${test_name} PROPERTY PASS_REGULAR_EXPRESSION "NVBench v" ) # Test: nvbench --list - add_test(NAME 
nvbench.ctl.list COMMAND "$" --list) + set(test_name nvbench.ctl.list) + add_test(NAME ${test_name} COMMAND "$" --list) + list(APPEND ctl_test_names ${test_name}) # Test: nvbench -l - add_test(NAME nvbench.ctl.l COMMAND "$" -l) + set(test_name nvbench.ctl.l) + add_test(NAME ${test_name} COMMAND "$" -l) + list(APPEND ctl_test_names ${test_name}) # Test: nvbench --help - add_test(NAME nvbench.ctl.help COMMAND "$" --help) + set(test_name nvbench.ctl.help) + add_test(NAME ${test_name} COMMAND "$" --help) + list(APPEND ctl_test_names ${test_name}) # Test: nvbench -h - add_test(NAME nvbench.ctl.h COMMAND "$" -h) + set(test_name nvbench.ctl.h) + add_test(NAME ${test_name} COMMAND "$" -h) + list(APPEND ctl_test_names ${test_name}) # Test: nvbench --help-axes - add_test(NAME nvbench.ctl.help_axes COMMAND "$" --help-axes) + set(test_name nvbench.ctl.help_axes) + add_test(NAME ${test_name} COMMAND "$" --help-axes) + list(APPEND ctl_test_names ${test_name}) # Test: nvbench --help-axis - add_test(NAME nvbench.ctl.help_axis COMMAND "$" --help-axis) + set(test_name nvbench.ctl.help_axis) + add_test(NAME ${test_name} COMMAND "$" --help-axis) + list(APPEND ctl_test_names ${test_name}) + + foreach(test_name IN LISTS ctl_test_names) + nvbench_config_test_runtime_environment(${test_name}) + endforeach() endif() diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index 7466dcd7..ba505bf2 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -101,6 +101,12 @@ target_link_libraries(nvbench fmt::fmt nvbench_json ) +target_compile_options(nvbench PUBLIC + # CCCL requires MSVC's conforming preprocessor when compiling CUDA sources + # with cl.exe as the host compiler. 
+ $<$:/Zc:preprocessor> + $<$,$>:-Xcompiler=/Zc:preprocessor> +) # ################################################################################################## @@ -134,6 +140,10 @@ nvbench_config_target(nvbench.main) target_compile_definitions(nvbench.main PRIVATE NVBENCH_NO_IMPLICIT_SYSTEM_HEADER) # Propagate `nvbench` to consumers but keep NVBench's own build warning-visible. target_link_libraries(nvbench.main PUBLIC nvbench) +if (MSVC) + # inform MSVC that library provides main + target_link_options(nvbench.main INTERFACE "LINKER:/INCLUDE:main") +endif() # Ensure CUDA/CUPTI/NVML include dirs are visible for nvbench.main's build. target_link_libraries(nvbench.main PRIVATE ${ctk_libraries}) # Add NVBench's headers privately so the main library itself sees warnings. diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index bbf3e190..4f160923 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -47,6 +47,7 @@ foreach(test_src IN LISTS test_srcs) set_target_properties(${test_name} PROPERTIES COMPILE_FEATURES cuda_std_17) nvbench_config_target(${test_name}) add_test(NAME ${test_name} COMMAND "$" ${NVBench_TEST_ARGS_${test_name}}) + nvbench_config_test_runtime_environment(${test_name}) add_dependencies(nvbench.test.all ${test_name}) endforeach() diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt index 506de5a7..d2082575 100644 --- a/testing/cmake/CMakeLists.txt +++ b/testing/cmake/CMakeLists.txt @@ -10,12 +10,22 @@ set(cmake_opts -D "CMAKE_CUDA_ARCHITECTURES=${arches}" ) if (WIN32) + set(cuda_host_compiler "${CMAKE_CUDA_HOST_COMPILER}") + if (NOT cuda_host_compiler) + set(cuda_host_compiler "${CMAKE_CXX_COMPILER}") + endif() list(APPEND cmake_opts - -D "CMAKE_CUDA_HOST_COMPILER=${CMAKE_CXX_COMPILER}" - -D "CMAKE_LINKER=${CMAKE_LINKER}" - -D "CMAKE_RC_COMPILER=${CMAKE_RC_COMPILER}" - -D "CMAKE_MT=${CMAKE_MT}" + -D "CMAKE_CUDA_HOST_COMPILER=${cuda_host_compiler}" ) + if (CMAKE_LINKER) + list(APPEND cmake_opts -D 
"CMAKE_LINKER=${CMAKE_LINKER}") + endif() + if (CMAKE_RC_COMPILER) + list(APPEND cmake_opts -D "CMAKE_RC_COMPILER=${CMAKE_RC_COMPILER}") + endif() + if (CMAKE_MT) + list(APPEND cmake_opts -D "CMAKE_MT=${CMAKE_MT}") + endif() endif() # Temporary installation prefix for tests against installed nvbench: @@ -40,11 +50,13 @@ function(nvbench_add_compile_test full_test_name_var subdir test_id) ${ARGN} --test-command "${CMAKE_CTEST_COMMAND}" --output-on-failure ) - if (WIN32 AND NVBench_ENABLE_CUPTI AND nvbench_cupti_root) - cmake_path(NATIVE_PATH nvbench_cupti_root cupti_native) - cmake_path(NATIVE_PATH NVBench_EXECUTABLE_OUTPUT_DIR bin_native) - set_tests_properties(${test_name} PROPERTIES - ENVIRONMENT "PATH=${bin_native}\\;${cupti_native}\\lib64\\;$ENV{PATH}" + if (WIN32) + set(path_mods "PATH=path_list_prepend:$") + if (TARGET nvbench::cupti) + list(PREPEND path_mods "PATH=path_list_prepend:$") + endif() + set_property(TEST ${test_name} PROPERTY + ENVIRONMENT_MODIFICATION ${path_mods} ) endif() set(${full_test_name_var} ${test_name} PARENT_SCOPE) diff --git a/testing/cmake/test_export/CMakeLists.txt b/testing/cmake/test_export/CMakeLists.txt index 21faa30a..f0aae8b2 100644 --- a/testing/cmake/test_export/CMakeLists.txt +++ b/testing/cmake/test_export/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20.1) +cmake_minimum_required(VERSION 3.22.0) project(NVBenchTestExport CUDA CXX) message(STATUS "NVBench_DIR=${NVBench_DIR}") @@ -12,51 +12,105 @@ add_test(NAME nvbench_ctl COMMAND "$") # Setup runtime library paths for testing. # Unix uses LD_LIBRARY_PATH; Windows uses PATH for DLL lookup. -get_property(nvbench_config TARGET nvbench::nvbench - PROPERTY IMPORTED_CONFIGURATIONS -) -list(LENGTH nvbench_config num_configs) -if (num_configs GREATER 1) - message(WARNING - "Multiple IMPORTED_CONFIGURATIONS for nvbench::nvbench. " - "Picking the first one. This may cause issues." 
+function(get_imported_location out_var target_name) + get_property(imported_configs TARGET ${target_name} + PROPERTY IMPORTED_CONFIGURATIONS ) - list(GET nvbench_config 0 nvbench_config) -endif() + list(LENGTH imported_configs num_configs) + if (num_configs GREATER 1) + message(WARNING + "Multiple IMPORTED_CONFIGURATIONS for ${target_name}. " + "Picking CMAKE_BUILD_TYPE if present, otherwise the first one." + ) + endif() + + if (num_configs GREATER 0) + if (CMAKE_BUILD_TYPE) + string(TOUPPER "${CMAKE_BUILD_TYPE}" build_type) + list(FIND imported_configs "${build_type}" imported_config_index) + else() + set(imported_config_index -1) + endif() + if (imported_config_index GREATER_EQUAL 0) + list(GET imported_configs ${imported_config_index} imported_config) + else() + list(GET imported_configs 0 imported_config) + endif() + get_property(imported_location TARGET ${target_name} + PROPERTY IMPORTED_LOCATION_${imported_config} + ) + endif() + + if (NOT imported_location) + get_property(imported_location TARGET ${target_name} + PROPERTY IMPORTED_LOCATION + ) + endif() + + set(${out_var} "${imported_location}" PARENT_SCOPE) +endfunction() set(nvbench_lib_dir "") # On Unix the build tree uses RUNPATH so only the install tree needs the path. # On Windows there is no RUNPATH so we always need the DLL directory. 
if (WIN32 OR TEST_TYPE STREQUAL "INSTALL_TREE") - get_property(nvbench_lib TARGET nvbench::nvbench - PROPERTY IMPORTED_LOCATION_${nvbench_config} - ) - cmake_path(GET nvbench_lib PARENT_PATH nvbench_lib_dir) + get_imported_location(nvbench_lib nvbench::nvbench) + if (nvbench_lib) + cmake_path(GET nvbench_lib PARENT_PATH nvbench_lib_dir) + endif() endif() set(cupti_lib_dir "") if (TARGET nvbench::cupti) - if (WIN32) - get_property(cupti_lib TARGET nvbench::cupti PROPERTY IMPORTED_IMPLIB) - else() - get_property(cupti_lib TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION) + get_imported_location(cupti_lib nvbench::cupti) + if (cupti_lib) + cmake_path(GET cupti_lib PARENT_PATH cupti_lib_dir) endif() - cmake_path(GET cupti_lib PARENT_PATH cupti_lib_dir) endif() if (WIN32) - set(lib_dirs "${nvbench_lib_dir}\\;${cupti_lib_dir}") - set_property(TEST test_bench PROPERTY - ENVIRONMENT "PATH=${lib_dirs}\\;$ENV{PATH}" - ) - set_property(TEST nvbench_ctl PROPERTY - ENVIRONMENT "PATH=${lib_dirs}\\;$ENV{PATH}" - ) + set(path_modifications "") + if (cupti_lib_dir) + list(APPEND path_modifications "PATH=path_list_prepend:$") + endif() + if (nvbench_lib_dir) + list(APPEND path_modifications "PATH=path_list_prepend:$") + endif() + if (path_modifications) + set_property(TEST test_bench PROPERTY + ENVIRONMENT_MODIFICATION ${path_modifications} + ) + set_property(TEST nvbench_ctl PROPERTY + ENVIRONMENT_MODIFICATION ${path_modifications} + ) + endif() else() - set_property(TEST test_bench PROPERTY - ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_dir}" - ) - set_property(TEST nvbench_ctl PROPERTY - ENVIRONMENT "LD_LIBRARY_PATH=${nvbench_lib_dir}:${cupti_lib_dir}" - ) + set(test_bench_ld_modifications "") + if (cupti_lib_dir) + list(APPEND test_bench_ld_modifications + "LD_LIBRARY_PATH=path_list_prepend:$" + ) + endif() + if (test_bench_ld_modifications) + set_property(TEST test_bench PROPERTY + ENVIRONMENT_MODIFICATION ${test_bench_ld_modifications} + ) + endif() + + 
set(nvbench_ctl_ld_modifications "") + if (cupti_lib_dir) + list(APPEND nvbench_ctl_ld_modifications + "LD_LIBRARY_PATH=path_list_prepend:$" + ) + endif() + if (nvbench_lib_dir) + list(APPEND nvbench_ctl_ld_modifications + "LD_LIBRARY_PATH=path_list_prepend:$" + ) + endif() + if (nvbench_ctl_ld_modifications) + set_property(TEST nvbench_ctl PROPERTY + ENVIRONMENT_MODIFICATION ${nvbench_ctl_ld_modifications} + ) + endif() endif() diff --git a/testing/device/CMakeLists.txt b/testing/device/CMakeLists.txt index b7272ee1..918b5f1a 100644 --- a/testing/device/CMakeLists.txt +++ b/testing/device/CMakeLists.txt @@ -7,6 +7,7 @@ add_dependencies(nvbench.test.all ${test_name}) if (NVBench_ENABLE_DEVICE_TESTING) add_test(NAME ${test_name} COMMAND "$") + nvbench_config_test_runtime_environment(${test_name}) set_tests_properties(${test_name} PROPERTIES # Any timeouts/warnings are hard failures for this test. FAIL_REGULAR_EXPRESSION "Warn;timed out" From d481ff654047099b71befd241aa3891f54e7219b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 18 May 2026 16:46:46 -0500 Subject: [PATCH 5/6] Fix for building in conda environment --- nvbench/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index ba505bf2..e943eb5a 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -111,6 +111,14 @@ target_compile_options(nvbench PUBLIC # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- +if (WIN32) + foreach(conda_path_var IN ITEMS CONDA_PREFIX PREFIX BUILD_PREFIX CMAKE_PREFIX_PATH) + if (DEFINED ENV{${conda_path_var}}) + file(TO_CMAKE_PATH "$ENV{${conda_path_var}}" conda_path_value) + set(ENV{${conda_path_var}} "${conda_path_value}") + endif() + endforeach() +endif() 
rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) if(TARGET conda_env) # When we are inside a conda env the linker will be set to From 2aaf76eb802716660d6f580159a7fa2f1c81b435 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 18 May 2026 20:15:25 -0500 Subject: [PATCH 6/6] Add comment re natural order sorting --- cmake/NVBenchCUPTI.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/NVBenchCUPTI.cmake b/cmake/NVBenchCUPTI.cmake index 18a25bd1..552a5d78 100644 --- a/cmake/NVBenchCUPTI.cmake +++ b/cmake/NVBenchCUPTI.cmake @@ -56,6 +56,8 @@ function(nvbench_find_windows_cupti_runtime_library out_var dep_name library_pat endif() if (runtime_libraries) + # Natural sort compares numeric suffixes numerically, so cupti64_10.dll + # sorts newer than cupti64_9.dll. list(SORT runtime_libraries COMPARE NATURAL ORDER DESCENDING) list(LENGTH runtime_libraries num_runtime_libraries) if (num_runtime_libraries GREATER 1)