Move POSIX-only functionality to a separate module

mdboom · mdboom · commit 22fd00c5973b · 2026-05-20T16:09:28.000-04:00
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
@@ -143,12 +143,21 @@ def _build_cuda_core(debug=False):
         # cuda-bindings not available in editable mode, will use installed version
         pass
 
+    _posix_only_modules = frozenset(
+        {
+            "_utils/_wsl_locale",
+        }
+    )
+
     # It seems setuptools' wildcard support has problems for namespace packages,
     # so we explicitly spell out all Extension instances.
     def module_names():
         root_path = os.path.sep.join(["cuda", "core", ""])
         for filename in glob.glob(f"{root_path}/**/*.pyx", recursive=True):
-            yield filename[len(root_path) : -4]
+            mod = filename[len(root_path) : -4]
+            if sys.platform == "win32" and mod.replace(os.path.sep, "/") in _posix_only_modules:
+                continue
+            yield mod
 
     def get_sources(mod_name):
         """Get source files for a module, including any .cpp files."""
diff --git a/cuda_core/cuda/core/_utils/_wsl_locale.pyx b/cuda_core/cuda/core/_utils/_wsl_locale.pyx
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+# WSL-specific locale guard, used by cuda.core.system.get_process_name() to
+# work around a bug in NVML's WSL implementation where nvmlSystemGetProcessName
+# returns mojibake when the calling thread is in a non-"C" locale. See
+# get_process_name() for the full backstory.
+#
+# This module is not compiled on Windows (build_hooks.py excludes it there)
+# because it uses the POSIX per-thread locale APIs (newlocale/uselocale/
+# freelocale), which are not available on MSVC. Callers must import it only
+# where it is known to be built, e.g. behind a platform or WSL check.
+
+
+cdef extern from "locale.h" nogil:
+    ctypedef void *locale_t
+    int LC_ALL_MASK
+    locale_t newlocale(int category_mask, const char *locale, locale_t base)
+    locale_t uselocale(locale_t newloc)
+    void freelocale(locale_t locobj)
+
+
+cdef class c_locale_guard:
+    """Context manager that pins the calling thread to the "C" locale.
+
+    Uses POSIX newlocale/uselocale/freelocale so other threads' view of the
+    locale is unaffected. Restores the previous thread locale on exit.
+    """
+    cdef locale_t _c_locale
+    cdef locale_t _prev_locale
+    cdef bint _active
+
+    def __cinit__(self):
+        self._c_locale = <locale_t>0
+        self._prev_locale = <locale_t>0
+        self._active = False
+
+    def __enter__(self):
+        self._c_locale = newlocale(LC_ALL_MASK, b"C", <locale_t>0)
+        if self._c_locale == <locale_t>0:
+            raise RuntimeError("Failed to create C locale")
+        self._prev_locale = uselocale(self._c_locale)
+        self._active = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._active:
+            uselocale(self._prev_locale)
+            freelocale(self._c_locale)
+            self._active = False
+        return False
diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx
@@ -11,18 +11,6 @@
 CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool
 
 
-# POSIX per-thread locale APIs. We use these (rather than setlocale(3))
-# so the WSL workaround in get_process_name() doesn't perturb the locale
-# observed by other threads. locale_t is an opaque pointer in glibc.
-cdef extern from "locale.h" nogil:
-    ctypedef void *locale_t
-    int LC_ALL_MASK
-    locale_t LC_GLOBAL_LOCALE
-    locale_t newlocale(int category_mask, const char *locale, locale_t base)
-    locale_t uselocale(locale_t newloc)
-    void freelocale(locale_t locobj)
-
-
 cdef bint _detect_wsl():
     try:
         with open("/proc/sys/kernel/osrelease") as f:
@@ -34,6 +22,18 @@ cdef bint _detect_wsl():
 
 cdef bint _IS_WSL = _detect_wsl()
 
+
+# The WSL locale guard lives in a separate module that is not compiled on
+# Windows (build_hooks.py excludes it there), because it relies on POSIX
+# per-thread locale APIs that MSVC does not provide. It is therefore imported
+# only when _IS_WSL is True; otherwise c_locale_guard is None, which is safe
+# because get_process_name() only enters the guard on its WSL-only path.
+if _IS_WSL:
+    from cuda.core._utils._wsl_locale import c_locale_guard
+else:
+    c_locale_guard = None
+
+
 try:
     from cuda.bindings._version import __version_tuple__ as _BINDINGS_VERSION
 except ImportError:
@@ -161,33 +161,24 @@ def get_process_name(pid: int) -> str:
             nvml.device_get_compute_running_processes_v3(dev_h)
         return nvml.system_get_process_name(pid)
 
-    cdef locale_t c_locale
-    cdef locale_t prev_locale
-
     initialize()
     if not _IS_WSL:
         return _get_process_name(pid)
 
     # WSL workaround: nvmlSystemGetProcessName on WSL takes a wide-char
-    # conversion path when the process locale is non-"C". That path walks
-    # a UTF-16LE source buffer with a 4-byte stride (as if it were UTF-32LE)
-    # and emits 5-byte UTF-8 sequences that look like garbage preceding the
-    # trailing basename of /proc/<pid>/exe. CPython's startup unconditionally
-    # calls setlocale(LC_ALL, ""), so essentially every cuda.core caller hits
-    # this. The cached entry for the PID is set the first time NVML resolves
-    # it (typically inside nvmlDeviceGetComputeRunningProcesses_v3), so to
-    # recover a correct value we re-prime the cache under the "C" locale
-    # before reading the name. We use the POSIX per-thread locale APIs so
-    # other threads' view of the locale is unaffected.
-    c_locale = newlocale(LC_ALL_MASK, b"C", <locale_t>0)
-    if c_locale == <locale_t>0:
-        raise RuntimeError("Failed to create C locale")
-    prev_locale = uselocale(c_locale)
-    try:
+    # conversion path when the calling thread's locale is non-"C". That path
+    # walks a UTF-16LE source buffer with a 4-byte stride (as if it were
+    # UTF-32LE) and emits 5-byte UTF-8 sequences that look like garbage
+    # preceding the trailing basename of /proc/<pid>/exe. CPython's startup
+    # unconditionally calls setlocale(LC_ALL, ""), so essentially every
+    # cuda.core caller hits this. The cached entry for the PID is set the
+    # first time NVML resolves it (typically inside
+    # nvmlDeviceGetComputeRunningProcesses_v3), so to recover a correct value
+    # we re-prime the cache under the "C" locale before reading the name.
+    # c_locale_guard uses POSIX per-thread locale APIs (see _wsl_locale.pyx)
+    # so other threads' view of the locale is unaffected.
+    with c_locale_guard():  # no-cython-lint
         return _get_process_name(pid)
-    finally:
-        uselocale(prev_locale)
-        freelocale(c_locale)
 
 
 __all__ = [