nvbug-6193808: Work around mojibake in nvml.system_get_process_name on WSL (#2118)

mdboom · web-flow · commit 29575957ceb1 · 2026-05-21T23:17:30.000Z
* nvbug-6193808: Work around mojibake in nvml.system_get_process_name on WSL

* Re-enable test

* Move POSIX-only functionality to a separate module

* Address comments in the PR
diff --git a/cuda_bindings/docs/source/release/13.2.0-notes.rst b/cuda_bindings/docs/source/release/13.2.0-notes.rst
@@ -80,3 +80,4 @@ Known issues
 ------------
 
 * Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
+* ``nvml.system_get_process_name`` on WSL can return incorrect values.  To work around this, set the locale to "C" before calling ``nvml.device_get_compute_running_processes_v3`` (which sets the process names) and before calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically, but users of the raw NVML API will need to do this manually.
diff --git a/cuda_bindings/docs/source/release/13.3.0-notes.rst b/cuda_bindings/docs/source/release/13.3.0-notes.rst
@@ -39,3 +39,4 @@ Known issues
 ------------
 
 * Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
+* ``nvml.system_get_process_name`` on WSL can return incorrect values.  To work around this, set the locale to "C" before calling ``nvml.device_get_compute_running_processes_v3`` (which sets the process names) and before calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically, but users of the raw NVML API will need to do this manually.
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
@@ -143,12 +143,21 @@ def _build_cuda_core(debug=False):
         # cuda-bindings not available in editable mode, will use installed version
         pass
 
+    _posix_only_modules = frozenset(
+        {
+            "_utils/_wsl_locale",
+        }
+    )
+
     # It seems setuptools' wildcard support has problems for namespace packages,
     # so we explicitly spell out all Extension instances.
     def module_names():
         root_path = os.path.sep.join(["cuda", "core", ""])
         for filename in glob.glob(f"{root_path}/**/*.pyx", recursive=True):
-            yield filename[len(root_path) : -4]
+            mod = filename[len(root_path) : -4]
+            if sys.platform == "win32" and mod.replace(os.path.sep, "/") in _posix_only_modules:
+                continue
+            yield mod
 
     def get_sources(mod_name):
         """Get source files for a module, including any .cpp files."""
diff --git a/cuda_core/cuda/core/_utils/_wsl_locale.pyx b/cuda_core/cuda/core/_utils/_wsl_locale.pyx
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+# WSL-specific locale guard, used by cuda.core.system.get_process_name() to
+# work around a bug in NVML's WSL implementation where nvmlSystemGetProcessName
+# returns mojibake when the calling thread is in a non-"C" locale. See
+# get_process_name() for the full backstory.
+#
+# This module is not built on Windows (build_hooks.py skips it when
+# sys.platform == "win32") because it uses the POSIX per-thread locale APIs
+# (newlocale/uselocale/freelocale), which are not available on MSVC. Callers
+# must therefore import it conditionally, e.g. only after detecting WSL or
+# inside a try/except ImportError.
+
+
+cdef extern from "locale.h" nogil:
+    ctypedef void *locale_t
+    int LC_ALL_MASK
+    locale_t newlocale(int category_mask, const char *locale, locale_t base)
+    locale_t uselocale(locale_t newloc)
+    void freelocale(locale_t locobj)
+
+
+cdef class c_locale_guard:
+    """Context manager that pins the calling thread to the "C" locale.
+
+    Uses POSIX newlocale/uselocale/freelocale so other threads' view of the
+    locale is unaffected. Restores the previous thread locale on exit.
+
+    An instance tracks a single activation at a time and must not be
+    re-entered while it is already active; create a new instance for each
+    ``with`` block.
+
+    Raises
+    ------
+    RuntimeError
+        From ``__enter__`` if the guard is already active, if the "C" locale
+        object cannot be created, or if it cannot be installed on the calling
+        thread.
+    """
+    # Locale object created by newlocale(); owned by the guard while active.
+    cdef locale_t _c_locale
+    # Thread locale that was current before __enter__; restored by __exit__.
+    cdef locale_t _prev_locale
+    # True only between a successful __enter__ and the matching __exit__.
+    cdef bint _active
+
+    def __cinit__(self):
+        self._c_locale = <locale_t>0
+        self._prev_locale = <locale_t>0
+        self._active = False
+
+    def __enter__(self):
+        cdef locale_t c_locale
+        cdef locale_t prev_locale
+        if self._active:
+            # Re-entering would overwrite the saved handles: the first locale
+            # object would leak and the original thread locale would be lost.
+            raise RuntimeError("c_locale_guard is already active")
+        c_locale = newlocale(LC_ALL_MASK, b"C", <locale_t>0)
+        if c_locale == <locale_t>0:
+            raise RuntimeError("Failed to create C locale")
+        prev_locale = uselocale(c_locale)
+        if prev_locale == <locale_t>0:
+            # uselocale() signals failure with a null handle. The new locale
+            # was never installed, so release it here instead of leaking it.
+            freelocale(c_locale)
+            raise RuntimeError("Failed to set C locale for the current thread")
+        # Commit state only once both calls have succeeded, so __exit__ never
+        # sees a half-initialized guard.
+        self._c_locale = c_locale
+        self._prev_locale = prev_locale
+        self._active = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._active:
+            # Restore the previous locale first so the "C" locale object is no
+            # longer the thread's current locale when it is freed.
+            uselocale(self._prev_locale)
+            freelocale(self._c_locale)
+            self._c_locale = <locale_t>0
+            self._prev_locale = <locale_t>0
+            self._active = False
+        # Never suppress an exception raised inside the with-block.
+        return False
diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx
@@ -10,6 +10,30 @@
 
 CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool
 
+
+cdef bint _detect_wsl():
+    """Return True if the kernel release string mentions "microsoft" or "wsl".
+
+    The check is case-insensitive. If /proc/sys/kernel/osrelease cannot be
+    opened or read (OSError), the answer is False.
+    """
+    try:
+        with open("/proc/sys/kernel/osrelease") as release_file:
+            release = release_file.read().lower()
+    except OSError:
+        return False
+    if "microsoft" in release:
+        return True
+    return "wsl" in release
+
+
+# Evaluated once, when this module is first imported.
+cdef bint _IS_WSL = _detect_wsl()
+
+
+# The WSL locale guard lives in a separate module that is excluded from
+# Windows builds (see build_hooks.py), because it relies on POSIX per-thread
+# locale APIs that MSVC does not provide. The import is therefore attempted
+# only when WSL was detected; everywhere else c_locale_guard is None, which
+# is safe because get_process_name() only enters the guard when _IS_WSL is
+# true.
+if _IS_WSL:
+    from cuda.core._utils._wsl_locale import c_locale_guard
+else:
+    c_locale_guard = None
+
+
 try:
     from cuda.bindings._version import __version_tuple__ as _BINDINGS_VERSION
 except ImportError:
@@ -127,8 +151,37 @@ def get_process_name(pid: int) -> str:
     name: str
         The process name.
     """
+    def _get_process_name(pid) -> str:
+        # nvmlSystemGetProcessName answers from a per-PID name cache inside
+        # NVML, and that cache is filled while enumerating the processes
+        # running on each device. Enumerate every device first so the
+        # requested PID has a chance to be cached before it is looked up.
+        device_count = nvml.device_get_count_v2()
+        for index in range(device_count):
+            try:
+                handle = nvml.device_get_handle_by_index_v2(index)
+                nvml.device_get_compute_running_processes_v3(handle)
+            except nvml.NvmlError:
+                # Best effort: a device that cannot be queried is skipped.
+                pass
+        return nvml.system_get_process_name(pid)
+
     initialize()
-    return nvml.system_get_process_name(pid)
+    if not _IS_WSL:
+        return _get_process_name(pid)
+
+    # WSL workaround: nvmlSystemGetProcessName on WSL takes a wide-char
+    # conversion path when the calling thread's locale is non-"C". That path
+    # walks a UTF-16LE source buffer with a 4-byte stride (as if it were
+    # UTF-32LE) and emits 5-byte UTF-8 sequences that look like garbage
+    # preceding the trailing basename of /proc/<pid>/exe. CPython's startup
+    # unconditionally calls setlocale(LC_ALL, ""), so essentially every
+    # cuda.core caller hits this. The cached entry for the PID is set the
+    # first time NVML resolves it (typically inside
+    # nvmlDeviceGetComputeRunningProcesses_v3), so to recover a correct value
+    # we re-prime the cache under the "C" locale before reading the name.
+    # c_locale_guard uses POSIX per-thread locale APIs (see _wsl_locale.pyx)
+    # so other threads' view of the locale is unaffected.
+    with c_locale_guard():  # no-cython-lint
+        return _get_process_name(pid)
 
 
 __all__ = [
diff --git a/cuda_core/docs/source/release/1.1.0-notes.rst b/cuda_core/docs/source/release/1.1.0-notes.rst
@@ -43,3 +43,12 @@ New features
   :attr:`~ManagedBuffer.preferred_location`,
   :attr:`~ManagedBuffer.accessed_by`). Locations are expressed via
   :class:`Device` or :class:`Host`.
+
+Bug fixes
+---------
+
+- On WSL, ``cuda.core.system.get_process_name`` would raise a
+  ``UnicodeDecodeError``.  It should now return the correct result.
+- Calling ``cuda.core.system.get_process_name`` before querying any device's
+  ``compute_running_processes`` would raise a ``NvmlNotFoundError``.  Now it will
+  correctly return the process name, if it is a GPU-using process.
diff --git a/cuda_core/tests/system/test_system_system.py b/cuda_core/tests/system/test_system_system.py
@@ -12,8 +12,6 @@
 except ImportError:
     from cuda import cuda as driver
 
-import helpers
-
 from cuda.core import system
 from cuda.core._utils.cuda_utils import handle_return
 
@@ -62,9 +60,11 @@ def test_nvml_version():
         assert 0 <= ver_patch[0] <= 99
 
 
-@pytest.mark.skipif(helpers.IS_WSL, reason="Process names may not be available on WSL")
 @skip_if_nvml_unsupported
 def test_get_process_name():
+    for device in system.Device.get_all_devices():
+        x = device.compute_running_processes
+
     try:
         process_name = system.get_process_name(os.getpid())
     except system.NotFoundError:

Original file line number	Diff line number	Diff line change
`@@ -80,3 +80,4 @@ Known issues`
`80`	`80`	`------------`
`81`	`81`
`82`	`82`	* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
	`83`	+* ``nvml.system_get_process_name`` on WSL can return incorrect values. To work around this, set the locale to "C" before calling ``nvml.device_get_compute_running_processes_v3`` (which sets the process names) and before calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically, but users of the raw NVML API will need to do this manually.
Original file line number	Diff line number	Diff line change
`@@ -39,3 +39,4 @@ Known issues`
`39`	`39`	`------------`
`40`	`40`
`41`	`41`	* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
	`42`	+* ``nvml.system_get_process_name`` on WSL can return incorrect values. To work around this, set the locale to "C" before calling ``nvml.device_get_compute_running_processes_v3`` (which sets the process names) and before calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically, but users of the raw NVML API will need to do this manually.