-
Notifications
You must be signed in to change notification settings - Fork 254
arch: Enhance visible devices handling to permit UUIDs #2930
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,7 @@ | |
| from packaging.version import InvalidVersion, parse | ||
|
|
||
| from devito.logger import warning | ||
| from devito.tools import all_equal, as_tuple, memoized_func | ||
| from devito.tools import all_equal, as_tuple, frozendict, memoized_func | ||
|
|
||
| __all__ = [ # noqa: RUF022 | ||
| 'platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices', | ||
|
|
@@ -493,23 +493,74 @@ def parse_product_arch(): | |
| return None | ||
|
|
||
|
|
||
| @memoized_func | ||
| def _get_uuid_to_index_map(): | ||
| """ | ||
| Build a frozen mapping from GPU UUID/unique-ID strings to integer device indices. | ||
| """ | ||
| # (command, pattern) where group(1)=index, group(2)=uuid | ||
| # nvidia-smi -L output: "GPU 0: <name> (UUID: GPU-xxxx-...)" | ||
| # rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x<hex>" | ||
| queries = [ | ||
| (['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'), | ||
| (['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'), | ||
| ] | ||
| mapper = {} | ||
| for cmd, pattern in queries: | ||
| try: | ||
| proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) | ||
| raw = proc.stdout.read().decode() | ||
| except OSError: | ||
| continue | ||
|
|
||
| for line in raw.splitlines(): | ||
| if m := re.match(pattern, line): | ||
| mapper[m.group(2)] = int(m.group(1)) | ||
|
|
||
| return frozendict(mapper) | ||
|
|
||
|
|
||
| def get_visible_devices(): | ||
| device_vars = ( | ||
| 'CUDA_VISIBLE_DEVICES', | ||
| 'ROCR_VISIBLE_DEVICES', | ||
| 'HIP_VISIBLE_DEVICES' | ||
| ) | ||
| for v in device_vars: | ||
| if v not in os.environ: | ||
| continue | ||
|
|
||
| val = os.environ[v].strip() | ||
|
|
||
| errmsg = f"{v}={os.environ[v]!r} exposes no GPU devices." | ||
|
|
||
| # Empty string or known "no devices" sentinels | ||
| if not val or val.upper() in ('NODEVFILES',): | ||
| raise RuntimeError(errmsg) | ||
|
|
||
| entries = [e.strip() for e in val.split(',')] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| # Try integer parsing first | ||
| with suppress(ValueError): | ||
| ids = tuple(int(i) for i in entries) | ||
| # Negative sentinel (e.g. -1) means no devices exposed | ||
| if len(ids) == 1 and ids[0] < 0: | ||
| raise RuntimeError(errmsg) | ||
|
|
||
| return v, ids | ||
|
|
||
| # Try UUID → device index resolution | ||
| mapper = _get_uuid_to_index_map() | ||
| try: | ||
| return v, tuple(int(i) for i in os.environ[v].split(',')) | ||
| except ValueError: | ||
| # Visible devices set via UUIDs or other non-integer identifiers. | ||
| warning("Setting visible devices via UUIDs or other non-integer" | ||
| " identifiers is currently unsupported: environment variable" | ||
| f" {v}={os.environ[v]} ignored.") | ||
| ids = tuple(mapper[u] for u in entries) | ||
| return v, ids | ||
| except KeyError: | ||
| # Environment variable not set | ||
| continue | ||
| pass | ||
|
|
||
| warning("Unresolvable visible devices environment variable encountered:" | ||
| f" {v}={os.environ[v]} ignored.") | ||
|
|
||
| return None, None | ||
|
|
||
| return None, None | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,6 @@ | ||
| import re | ||
| from subprocess import DEVNULL, PIPE, Popen | ||
|
|
||
| import cloudpickle as pickle | ||
| import numpy as np | ||
| import pytest | ||
|
|
@@ -107,6 +110,62 @@ def test_visible_devices(self, env_variables): | |
| # Default physical deviceid expected to be 0 | ||
| assert argmap2._physical_deviceid == 0 | ||
|
|
||
| @pytest.mark.parametrize('env_variables', [ | ||
| {"CUDA_VISIBLE_DEVICES": "-1"}, | ||
| {"CUDA_VISIBLE_DEVICES": ""}, | ||
| {"CUDA_VISIBLE_DEVICES": "NoDevFiles"}, | ||
| {"ROCR_VISIBLE_DEVICES": "-1"}, | ||
| ]) | ||
| def test_no_visible_devices(self, env_variables): | ||
| """Accessing _physical_deviceid when no devices are exposed should raise.""" | ||
| grid = Grid(shape=(10, 10)) | ||
| u = Function(name='u', grid=grid) | ||
|
|
||
| with switchenv(env_variables): | ||
| op = Operator(Eq(u, u+1)) | ||
| argmap = op.arguments() | ||
| with pytest.raises(RuntimeError): | ||
| _ = argmap._physical_deviceid | ||
|
|
||
| def test_visible_devices_uuid(self): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That test should use |
||
| # Query GPU 0's UUID independently of _get_uuid_to_index_map | ||
| probes = [ | ||
| ( | ||
| ['nvidia-smi', '-L'], | ||
| r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)', | ||
| 'CUDA_VISIBLE_DEVICES' | ||
| ), | ||
| ( | ||
| ['rocm-smi', '--showuniqueid'], | ||
| r'GPU\[0\].*Unique ID:\s*([\w]+)', | ||
| 'ROCR_VISIBLE_DEVICES' | ||
| ), | ||
| ] | ||
| uuid = env_var = None | ||
| for cmd, pattern, var in probes: | ||
| try: | ||
| proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) | ||
| output = proc.stdout.read().decode() | ||
| except OSError: | ||
| continue | ||
| for line in output.splitlines(): | ||
| m = re.match(pattern, line) | ||
| if m: | ||
| uuid, env_var = m.group(1), var | ||
| break | ||
| if uuid is not None: | ||
| break | ||
|
|
||
| if uuid is None: | ||
| pytest.skip("No GPU 0 UUID found via nvidia-smi or rocm-smi") | ||
|
|
||
| grid = Grid(shape=(10, 10)) | ||
| u = Function(name='u', grid=grid) | ||
| with switchenv({env_var: uuid}): | ||
| op = Operator(Eq(u, u+1)) | ||
| argmap = op.arguments() | ||
| assert argmap._physical_deviceid == 0 | ||
|
|
||
| @pytest.mark.parallel(mode=2) | ||
| @pytest.mark.parametrize('visible_devices', [ | ||
| "1,2", "1,0", "0,2,3", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
raw = check_output(cmd, stderr=DEVNULL, text=True)