From 7223f184fd6da48a7c58cfc326a42bfc4e847744 Mon Sep 17 00:00:00 2001 From: Edward Caunt Date: Wed, 13 May 2026 13:55:40 +0000 Subject: [PATCH 1/4] arch: Enhance visible devices handling to permit UUIDs --- devito/arch/archinfo.py | 71 +++++++++++++++++++++++++++++++++++----- tests/test_gpu_common.py | 45 +++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 9 deletions(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 3166bf255a..3714c18b27 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -493,6 +493,39 @@ def parse_product_arch(): return None +def _resolve_uuids_to_indices(uuids): + """ + Map GPU UUID/unique-ID strings to integer device indices. + """ + # (command, pattern) where group(1)=index, group(2)=uuid + # nvidia-smi -L output: "GPU 0: (UUID: GPU-xxxx-...)" + # rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x" + queries = [ + (['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'), + (['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'), + ] + for cmd, pattern in queries: + try: + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + raw = proc.stdout.read().decode() + except OSError: + # Command not available + continue + + uuid_to_index = {m.group(2): int(m.group(1)) + for line in raw.splitlines() + if (m := re.match(pattern, line))} + if not uuid_to_index: + continue + + try: + return tuple(uuid_to_index[u] for u in uuids) + except KeyError: + continue + + return None + + def get_visible_devices(): device_vars = ( 'CUDA_VISIBLE_DEVICES', @@ -500,17 +533,37 @@ def get_visible_devices(): 'HIP_VISIBLE_DEVICES' ) for v in device_vars: - try: - return v, tuple(int(i) for i in os.environ[v].split(',')) - except ValueError: - # Visible devices set via UUIDs or other non-integer identifiers. - warning("Setting visible devices via UUIDs or other non-integer" - " identifiers is currently unsupported: environment variable" - f" {v}={os.environ[v]} ignored.") - except KeyError: - # Environment variable not set + if v not in os.environ: continue + val = os.environ[v].strip() + + errmsg = f"{v}={os.environ[v]!r} exposes no GPU devices." + + # Empty string or known "no devices" sentinels + if not val or val.upper() in ('NODEVFILES',): + raise RuntimeError(errmsg) + + entries = [e.strip() for e in val.split(',')] + + # Try integer parsing first + with suppress(ValueError): + ids = tuple(int(i) for i in entries) + # Negative sentinel (e.g. -1) means no devices exposed + if len(ids) == 1 and ids[0] < 0: + raise RuntimeError(errmsg) + + return v, ids + + # Try UUID → device index resolution + ids = _resolve_uuids_to_indices(entries) + if ids is not None: + return v, ids + + raise RuntimeError( + f"Cannot resolve device specifiers in {v}={os.environ[v]!r}." + ) + return None, None diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index e84d0df5d8..8cc383370f 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -1,3 +1,6 @@ +import re +from subprocess import DEVNULL, PIPE, Popen + import cloudpickle as pickle import numpy as np import pytest @@ -107,6 +110,48 @@ def test_visible_devices(self, env_variables): # Default physical deviceid expected to be 0 assert argmap2._physical_deviceid == 0 + @pytest.mark.parametrize('env_variables', [ + {"CUDA_VISIBLE_DEVICES": "-1"}, + {"CUDA_VISIBLE_DEVICES": ""}, + {"CUDA_VISIBLE_DEVICES": "NoDevFiles"}, + {"ROCR_VISIBLE_DEVICES": "-1"}, + ]) + def test_no_visible_devices(self, env_variables): + """Accessing _physical_deviceid when no devices are exposed should raise.""" + grid = Grid(shape=(10, 10)) + u = Function(name='u', grid=grid) + + with switchenv(env_variables): + op = Operator(Eq(u, u+1)) + argmap = op.arguments() + with pytest.raises(RuntimeError): + _ = argmap._physical_deviceid + + def test_visible_devices_uuid(self): + # Query GPU 0's UUID independently of _resolve_uuids_to_indices + try: + proc = Popen(['nvidia-smi', '-L'], stdout=PIPE, stderr=DEVNULL) + output = proc.stdout.read().decode() + except OSError: + pytest.skip("nvidia-smi not available") + + uuid = None + for line in output.splitlines(): + m = re.match(r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', line) + if m: + uuid = m.group(1) + break + + if uuid is None: + pytest.skip("No GPU 0 UUID found in nvidia-smi output") + + grid = Grid(shape=(10, 10)) + u = Function(name='u', grid=grid) + with switchenv({'CUDA_VISIBLE_DEVICES': uuid}): + op = Operator(Eq(u, u+1)) + argmap = op.arguments() + assert argmap._physical_deviceid == 0 + @pytest.mark.parallel(mode=2) @pytest.mark.parametrize('visible_devices', [ "1,2", "1,0", "0,2,3", From 98d6328c8e79bca7b91e6f6f9043bf21cadac66e Mon Sep 17 00:00:00 2001 From: Edward Caunt Date: Wed, 20 May 2026 09:56:28 +0000 Subject: [PATCH 2/4] arch: Refactor UUID processing and enhance tests --- devito/arch/archinfo.py | 40 +++++++++++++++++++--------------------- tests/test_gpu_common.py | 34 ++++++++++++++++++++-------------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 3714c18b27..04f3417f31 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -16,7 +16,7 @@ from packaging.version import InvalidVersion, parse from devito.logger import warning -from devito.tools import all_equal, as_tuple, memoized_func +from devito.tools import all_equal, as_tuple, frozendict, memoized_func __all__ = [ # noqa: RUF022 'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices', @@ -493,37 +493,31 @@ def parse_product_arch(): return None -def _resolve_uuids_to_indices(uuids): +@memoized_func +def _get_uuid_to_index_map(): """ - Map GPU UUID/unique-ID strings to integer device indices. + Build a frozen mapping from GPU UUID/unique-ID strings to integer device indices. """ # (command, pattern) where group(1)=index, group(2)=uuid # nvidia-smi -L output: "GPU 0: (UUID: GPU-xxxx-...)" # rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x" queries = [ - (['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'), + (['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'), (['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'), ] + mapper = {} for cmd, pattern in queries: try: proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) raw = proc.stdout.read().decode() except OSError: - # Command not available continue - uuid_to_index = {m.group(2): int(m.group(1)) - for line in raw.splitlines() - if (m := re.match(pattern, line))} - if not uuid_to_index: - continue + for line in raw.splitlines(): + if m := re.match(pattern, line): + mapper[m.group(2)] = int(m.group(1)) - try: - return tuple(uuid_to_index[u] for u in uuids) - except KeyError: - continue - - return None + return frozendict(mapper) def get_visible_devices(): @@ -556,13 +550,17 @@ def get_visible_devices(): return v, ids # Try UUID → device index resolution - ids = _resolve_uuids_to_indices(entries) - if ids is not None: + mapper = _get_uuid_to_index_map() + try: + ids = tuple(mapper[u] for u in entries) return v, ids + except KeyError: + pass - raise RuntimeError( - f"Cannot resolve device specifiers in {v}={os.environ[v]!r}." - ) + warning("Unresolvable visible devices environment variables encountered:" + f" {v}={os.environ[v]} ignored.") + + return None, None return None, None diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 8cc383370f..5981a2fece 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -128,26 +128,32 @@ def test_no_visible_devices(self, env_variables): _ = argmap._physical_deviceid def test_visible_devices_uuid(self): - # Query GPU 0's UUID independently of _resolve_uuids_to_indices - try: - proc = Popen(['nvidia-smi', '-L'], stdout=PIPE, stderr=DEVNULL) - output = proc.stdout.read().decode() - except OSError: - pytest.skip("nvidia-smi not available") - - uuid = None - for line in output.splitlines(): - m = re.match(r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', line) - if m: - uuid = m.group(1) + # Query GPU 0's UUID independently of _get_uuid_to_index_map + probes = [ + (['nvidia-smi', '-L'], r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', 'CUDA_VISIBLE_DEVICES'), + (['rocm-smi', '--showuniqueid'], r'GPU\[0\].*Unique ID:\s*([\w]+)', 'ROCR_VISIBLE_DEVICES'), + ] + uuid = env_var = None + for cmd, pattern, var in probes: + try: + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + output = proc.stdout.read().decode() + except OSError: + continue + for line in output.splitlines(): + m = re.match(pattern, line) + if m: + uuid, env_var = m.group(1), var + break + if uuid is not None: break if uuid is None: - pytest.skip("No GPU 0 UUID found in nvidia-smi output") + pytest.skip("No GPU 0 UUID found via nvidia-smi or rocm-smi") grid = Grid(shape=(10, 10)) u = Function(name='u', grid=grid) - with switchenv({'CUDA_VISIBLE_DEVICES': uuid}): + with switchenv({env_var: uuid}): op = Operator(Eq(u, u+1)) argmap = op.arguments() assert argmap._physical_deviceid == 0 From 16aa5ad09f0ee801a0d3436d49d81407e8a9e3b6 Mon Sep 17 00:00:00 2001 From: Edward Caunt Date: Wed, 20 May 2026 09:59:46 +0000 Subject: [PATCH 3/4] misc: Linting --- tests/test_gpu_common.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 5981a2fece..062cbb3def 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -130,8 +130,16 @@ def test_no_visible_devices(self, env_variables): def test_visible_devices_uuid(self): # Query GPU 0's UUID independently of _get_uuid_to_index_map probes = [ - (['nvidia-smi', '-L'], r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', 'CUDA_VISIBLE_DEVICES'), - (['rocm-smi', '--showuniqueid'], r'GPU\[0\].*Unique ID:\s*([\w]+)', 'ROCR_VISIBLE_DEVICES'), + ( + ['nvidia-smi', '-L'], + r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', + 'CUDA_VISIBLE_DEVICES' + ), + ( + ['rocm-smi', '--showuniqueid'], + r'GPU\[0\].*Unique ID:\s*([\w]+)', + 'ROCR_VISIBLE_DEVICES' + ), ] uuid = env_var = None for cmd, pattern, var in probes: From 9ff2d37c23451dece39db68f307d4a09627f5f53 Mon Sep 17 00:00:00 2001 From: Edward Caunt Date: Wed, 20 May 2026 10:20:36 +0000 Subject: [PATCH 4/4] misc: Fix pluralisation --- devito/arch/archinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 04f3417f31..98131704b6 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -557,7 +557,7 @@ def get_visible_devices(): except KeyError: pass - warning("Unresolvable visible devices environment variables encountered:" + warning("Unresolvable visible devices environment variable encountered:" f" {v}={os.environ[v]} ignored.") return None, None