nvbug-6193808: Work around mojibake in nvml.system_get_process_name on WSL

mdboom · mdboom · commit 06a2bc8a8d27 · 2026-05-20T15:18:12.000-04:00
diff --git a/cuda_bindings/docs/source/release/13.2.0-notes.rst b/cuda_bindings/docs/source/release/13.2.0-notes.rst
@@ -80,3 +80,7 @@ Known issues
 ------------
 
 * Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
+* ``nvml.system_get_process_name`` on WSL can return garbled process names
+  when the locale is not "C".  To work around this, set the locale to "C" before
+  calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically,
+  but users of the raw NVML API will need to do this manually.
diff --git a/cuda_bindings/docs/source/release/13.3.0-notes.rst b/cuda_bindings/docs/source/release/13.3.0-notes.rst
@@ -39,3 +39,7 @@ Known issues
 ------------
 
 * Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``.
+* ``nvml.system_get_process_name`` on WSL can return garbled process names
+  when the locale is not "C".  To work around this, set the locale to "C" before
+  calling ``nvml.system_get_process_name``. ``cuda_core`` does this automatically,
+  but users of the raw NVML API will need to do this manually.
diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx
@@ -10,6 +10,30 @@
 
 CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool
 
+
+# POSIX per-thread locale APIs. We use these (rather than setlocale(3))
+# so the WSL workaround in get_process_name() doesn't perturb the locale
+# observed by other threads. locale_t is an opaque pointer in glibc.
+cdef extern from "locale.h" nogil:
+    ctypedef void *locale_t  # opaque locale handle
+    int LC_ALL_MASK  # category mask passed to newlocale() to cover all categories
+    locale_t LC_GLOBAL_LOCALE  # NOTE(review): not referenced in the code visible here
+    locale_t newlocale(int category_mask, const char *locale, locale_t base)  # callers treat a null result as failure
+    locale_t uselocale(locale_t newloc)  # installs newloc for the calling thread; result is kept to restore later
+    void freelocale(locale_t locobj)  # releases a handle obtained from newlocale()
+
+
+cdef bint _detect_wsl():
+    try:
+        with open("/proc/sys/kernel/osrelease") as release_file:
+            kernel_release = release_file.read().lower()
+    except OSError:
+        return False  # release string unreadable: assume not WSL
+    return "wsl" in kernel_release or "microsoft" in kernel_release
+
+
+cdef bint _IS_WSL = _detect_wsl()  # probed once, when this module is first imported
+
 try:
     from cuda.bindings._version import __version_tuple__ as _BINDINGS_VERSION
 except ImportError:
@@ -127,8 +151,43 @@ def get_process_name(pid: int) -> str:
     name: str
         The process name.
     """
+    def _get_process_name(pid) -> str:
+        # nvmlSystemGetProcessName answers from a per-PID cache that NVML
+        # fills while enumerating the processes running on each device. Walk
+        # every device and list its compute processes first so the requested
+        # PID has a cache entry by the time we ask for its name.
+        for dev_idx in range(nvml.device_get_count_v2()):
+            handle = nvml.device_get_handle_by_index_v2(dev_idx)
+            nvml.device_get_compute_running_processes_v3(handle)
+        return nvml.system_get_process_name(pid)
+
+    cdef locale_t c_locale
+    cdef locale_t prev_locale
+
     initialize()
-    return nvml.system_get_process_name(pid)
+    if not _IS_WSL:
+        return _get_process_name(pid)
+
+    # WSL workaround: nvmlSystemGetProcessName on WSL takes a wide-char
+    # conversion path when the process locale is non-"C". That path walks
+    # a UTF-16LE source buffer with a 4-byte stride (as if it were UTF-32LE)
+    # and emits 5-byte UTF-8 sequences that look like garbage preceding the
+    # trailing basename of /proc/<pid>/exe. CPython's startup unconditionally
+    # calls setlocale(LC_ALL, ""), so essentially every cuda.core caller hits
+    # this. The cached entry for the PID is set the first time NVML resolves
+    # it (typically inside nvmlDeviceGetComputeRunningProcesses_v3), so to
+    # recover a correct value we re-prime the cache under the "C" locale
+    # before reading the name. We use the POSIX per-thread locale APIs so
+    # other threads' view of the locale is unaffected.
+    c_locale = newlocale(LC_ALL_MASK, b"C", <locale_t>0)
+    if c_locale == <locale_t>0:
+        raise RuntimeError("Failed to create C locale")
+    prev_locale = uselocale(c_locale)
+    try:
+        return _get_process_name(pid)
+    finally:
+        uselocale(prev_locale)
+        freelocale(c_locale)
 
 
 __all__ = [
diff --git a/cuda_core/docs/source/release/1.1.0-notes.rst b/cuda_core/docs/source/release/1.1.0-notes.rst
@@ -43,3 +43,13 @@ New features
   :attr:`~ManagedBuffer.preferred_location`,
   :attr:`~ManagedBuffer.accessed_by`). Locations are expressed via
   :class:`Device` or :class:`Host`.
+
+Bug fixes
+---------
+
+- On WSL, ``cuda.core.system.get_process_name`` would raise a
+  ``UnicodeDecodeError``.  It now returns the correct result.  Note that each
+  call enumerates the running processes on every device, which may be slow.
+- Calling ``cuda.core.system.get_process_name`` before querying any device's
+  ``compute_running_processes`` would raise an ``NvmlNotFoundError``.  Now it
+  correctly returns the process name, provided the process is using a GPU.