From d3e2fc4a2c1ef8cc751abfef45aa6738811a16e0 Mon Sep 17 00:00:00 2001 From: Ganesh Kumar Ashokavardhanan Date: Tue, 19 May 2026 17:02:46 -0700 Subject: [PATCH 1/2] refactor(auto-updater): track vGPU 18.x GRID drivers from resources.json The previous auto-updater read NvidiaGPU/Nvidia-GPU-Linux-Resources.json, which the HPC team stopped updating at vGPU 17.55 (550.144.06). All new GRID releases (18.5, 18.6, etc.) now land in NvidiaGPU/resources.json. Changes: - Switch the source URL to NvidiaGPU/resources.json - Walk OS.Linux.Version[*].Driver[*] for Type='GRID' blocks - Filter entries by vGPUVersion major == TARGET_VGPU_MAJOR (default '18') - Pick the entry with the highest minor (correctly handles 18.10 > 18.6) - Fall back from DirLink to FwLink when only the latter is populated - Add a request timeout (no timeout previously) - Add TARGET_VGPU_MAJOR constant so future major bumps (18 -> 19) are a single-line change Tested against the live manifest: - Latest v18 returned: 570.211.01 (vGPU 18.6) - Idempotent when driver_config.yml is already at latest - Bumps from v17 back to v18 when intentionally regressed - TARGET_VGPU_MAJOR='19' (not yet released) raises a clear error Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- auto_update.py | 98 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/auto_update.py b/auto_update.py index aff54bc..6f06d4b 100644 --- a/auto_update.py +++ b/auto_update.py @@ -2,26 +2,77 @@ import requests from ruamel.yaml import YAML +# Source of truth for Azure-mirrored NVIDIA GRID drivers. The HPC team +# publishes the canonical driver manifest at this path; resources.json +# contains all currently supported vGPU major versions (we previously read +# Nvidia-GPU-Linux-Resources.json, but that file is no longer updated past +# vGPU 17.55). +RESOURCES_JSON_URL = ( + "https://raw.githubusercontent.com/Azure/azhpc-extensions/" + "refs/heads/master/NvidiaGPU/resources.json" +) + +# Active vGPU major version. Bump this when migrating to the next major +# (e.g. "18" -> "19") and the auto-updater will start tracking the latest +# minor of that major. The auto-updater intentionally does NOT cross major +# versions on its own, since major version bumps require validation. +TARGET_VGPU_MAJOR = "18" + + +def _vgpu_sort_key(vgpu_version): + """Convert "18.6" / "18.10" into a tuple for ordering: (18, 6) / (18, 10).""" + return tuple(int(p) if p.isdigit() else 0 for p in vgpu_version.split(".")) + + def get_latest_grid_driver(): - # URL of the JSON file containing driver information - url = "https://raw.githubusercontent.com/Azure/azhpc-extensions/refs/heads/master/NvidiaGPU/Nvidia-GPU-Linux-Resources.json" - response = requests.get(url) - response.raise_for_status() + """Return (driver_version, download_url) for the latest vGPU TARGET_VGPU_MAJOR.x + Linux GRID driver. + + Walks OS.Linux.Version[*].Driver[*] in resources.json, collects all entries + whose vGPUVersion has major == TARGET_VGPU_MAJOR, and picks the one with the + highest minor. Falls back from DirLink to FwLink so we still get a usable URL + when the manifest puts the download in FwLink. + """ + response = requests.get(RESOURCES_JSON_URL, timeout=30) + response.raise_for_status() data = response.json() - - # Extract the latest GRID driver information - grid_versions = data['Latest']['Category'] - grid_info = next((item for item in grid_versions if item["Name"] == "GRID"), None) - - if grid_info: - latest_version_info = grid_info['Versions'][0] - latest_version = latest_version_info['DriverVersion'] - latest_url = latest_version_info['Driver'][0]['DirLink'] - return latest_version, latest_url - - raise Exception("Could not find latest GRID driver version") - -# Add this at the end of your update_driver_config function + + linux_block = next( + (o for o in data.get("OS", []) if o.get("Name") == "Linux"), None + ) + if linux_block is None: + raise RuntimeError("No 'Linux' OS block in NvidiaGPU/resources.json") + + prefix = f"{TARGET_VGPU_MAJOR}." + candidates = {} + for distro in linux_block.get("Version", []): + for drv_block in distro.get("Driver", []): + if drv_block.get("Type") != "GRID": + continue + for v in drv_block.get("Version", []): + vgpu = str(v.get("vGPUVersion", "")).strip() + if not vgpu.startswith(prefix): + continue + driver_num = v.get("Num") + url = v.get("DirLink") or v.get("FwLink") + if not driver_num or not url: + continue + # Same driver may appear in multiple distro blocks; first wins. + candidates.setdefault(driver_num, {"vgpu": vgpu, "url": url}) + + if not candidates: + raise RuntimeError( + f"No vGPU {TARGET_VGPU_MAJOR}.x Linux GRID driver entries found " + f"in {RESOURCES_JSON_URL}" + ) + + best_num = max( + candidates, + key=lambda n: _vgpu_sort_key(candidates[n]["vgpu"]), + ) + return best_num, candidates[best_num]["url"] + + def update_driver_config(): yaml = YAML() yaml.preserve_quotes = True @@ -29,18 +80,15 @@ def update_driver_config(): if not os.path.exists("driver_config.yml"): raise FileNotFoundError("driver_config.yml not found in the current directory.") - + with open("driver_config.yml", "r") as f: config = yaml.load(f) - - # Get latest version and URL + latest_version, latest_url = get_latest_grid_driver() - - # Update the grid section while preserving order + config['grid']['version'] = latest_version config['grid']['url'] = latest_url - - # Write back to file + with open("driver_config.yml", "w") as f: yaml.dump(config, f) From 2f3f507738cc5c491f5ba4d17d797f617eee63ca Mon Sep 17 00:00:00 2001 From: Ganesh Kumar Ashokavardhanan Date: Tue, 19 May 2026 17:09:48 -0700 Subject: [PATCH 2/2] refactor(auto-updater): track latest patch within current driver major MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the hardcoded TARGET_VGPU_MAJOR="18" constant with logic that derives the target driver major directly from the currently-pinned grid.version in driver_config.yml. This makes the auto-updater: * Self-configuring — bumping driver_config.yml to a 595.x version automatically starts tracking 595.x patches, with no code change. * Tied to the ABI-stable identifier — NVIDIA driver MAJOR (R570, R580) is the boundary across which kernel-module ABI, install-script behaviour, and vGPU licensing may change. Filtering by driver major (vs vGPU major) is the more semantically correct invariant. * More conservative — within a major, only patch/minor bumps are picked up. Major bumps remain explicit, manual decisions. Behaviour on current main (grid.version = 570.211.01): - target major = "570" - candidates in resources.json: 570.211.01, 570.195.03 - picked: 570.211.01 (no-op, idempotent) When NVIDIA ships e.g. 570.215.xx, it will be picked up automatically. Verified with 10 unit-style scenarios (current data, idempotency, 595-series tracking, 550-series tie-breaking, end-to-end update_driver_config, garbage-input error path, synthetic patch-bump-within-major, numeric vs lex sort, and missing-major error). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- auto_update.py | 79 ++++++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/auto_update.py b/auto_update.py index 6f06d4b..2c26aa9 100644 --- a/auto_update.py +++ b/auto_update.py @@ -1,37 +1,50 @@ import os +import re import requests from ruamel.yaml import YAML -# Source of truth for Azure-mirrored NVIDIA GRID drivers. The HPC team -# publishes the canonical driver manifest at this path; resources.json -# contains all currently supported vGPU major versions (we previously read -# Nvidia-GPU-Linux-Resources.json, but that file is no longer updated past -# vGPU 17.55). +# Source of truth for Azure-mirrored NVIDIA GRID drivers. resources.json +# contains all currently-supported NVIDIA driver branches (we previously +# read Nvidia-GPU-Linux-Resources.json, but that file is no longer updated +# past 550.144.06 / vGPU 17.55). RESOURCES_JSON_URL = ( "https://raw.githubusercontent.com/Azure/azhpc-extensions/" "refs/heads/master/NvidiaGPU/resources.json" ) -# Active vGPU major version. Bump this when migrating to the next major -# (e.g. "18" -> "19") and the auto-updater will start tracking the latest -# minor of that major. The auto-updater intentionally does NOT cross major -# versions on its own, since major version bumps require validation. -TARGET_VGPU_MAJOR = "18" +# Driver versions look like "570.211.01" — major.minor.patch. The MAJOR +# component corresponds to NVIDIA's driver branch (R570, R580, …) and is +# the ABI-stable boundary: within one major, NVIDIA only ships +# bug-fix / patch releases. Crossing a major (e.g. 570 -> 580) can +# introduce kernel-module ABI changes, install-script differences, and +# vGPU licensing changes, so it requires deliberate validation. +DRIVER_VERSION_PATTERN = re.compile(r"^\d+(?:\.\d+){1,2}$") -def _vgpu_sort_key(vgpu_version): - """Convert "18.6" / "18.10" into a tuple for ordering: (18, 6) / (18, 10).""" - return tuple(int(p) if p.isdigit() else 0 for p in vgpu_version.split(".")) +def _driver_sort_key(version_str): + """Convert "570.211.01" -> (570, 211, 1) so version comparisons are numeric.""" + return tuple(int(p) for p in version_str.split(".") if p.isdigit()) -def get_latest_grid_driver(): - """Return (driver_version, download_url) for the latest vGPU TARGET_VGPU_MAJOR.x - Linux GRID driver. +def _current_driver_major(config): + """Return the driver major (e.g. '570') of the currently-pinned grid version.""" + current = str(config.get("grid", {}).get("version", "")).strip() + if not DRIVER_VERSION_PATTERN.match(current): + raise RuntimeError( + f"Cannot determine driver major from grid.version={current!r} " + f"in driver_config.yml (expected like '570.211.01')." + ) + return current.split(".")[0] + - Walks OS.Linux.Version[*].Driver[*] in resources.json, collects all entries - whose vGPUVersion has major == TARGET_VGPU_MAJOR, and picks the one with the - highest minor. Falls back from DirLink to FwLink so we still get a usable URL - when the manifest puts the download in FwLink. +def get_latest_grid_driver_for_major(target_major): + """Return (driver_version, download_url) for the highest version in + resources.json whose driver major matches target_major. + + Walks OS.Linux.Version[*].Driver[*] for Type='GRID' blocks and keeps any + entry whose Num starts with f"{target_major}.". Falls back from DirLink + to FwLink so we still get a usable URL when the manifest puts the + download in FwLink (the v18.5 entry is one such example). """ response = requests.get(RESOURCES_JSON_URL, timeout=30) response.raise_for_status() @@ -43,34 +56,31 @@ def get_latest_grid_driver(): if linux_block is None: raise RuntimeError("No 'Linux' OS block in NvidiaGPU/resources.json") - prefix = f"{TARGET_VGPU_MAJOR}." + prefix = f"{target_major}." candidates = {} for distro in linux_block.get("Version", []): for drv_block in distro.get("Driver", []): if drv_block.get("Type") != "GRID": continue for v in drv_block.get("Version", []): - vgpu = str(v.get("vGPUVersion", "")).strip() - if not vgpu.startswith(prefix): + num = str(v.get("Num", "")).strip() + if not num.startswith(prefix): continue - driver_num = v.get("Num") url = v.get("DirLink") or v.get("FwLink") - if not driver_num or not url: + if not url: continue # Same driver may appear in multiple distro blocks; first wins. - candidates.setdefault(driver_num, {"vgpu": vgpu, "url": url}) + candidates.setdefault(num, url) if not candidates: raise RuntimeError( - f"No vGPU {TARGET_VGPU_MAJOR}.x Linux GRID driver entries found " - f"in {RESOURCES_JSON_URL}" + f"No GRID driver {target_major}.x entries found in " + f"{RESOURCES_JSON_URL}. If NVIDIA has ended patches for this " + f"branch, bump driver_config.yml to the next major manually." ) - best_num = max( - candidates, - key=lambda n: _vgpu_sort_key(candidates[n]["vgpu"]), - ) - return best_num, candidates[best_num]["url"] + best = max(candidates, key=_driver_sort_key) + return best, candidates[best] def update_driver_config(): @@ -84,7 +94,8 @@ def update_driver_config(): with open("driver_config.yml", "r") as f: config = yaml.load(f) - latest_version, latest_url = get_latest_grid_driver() + target_major = _current_driver_major(config) + latest_version, latest_url = get_latest_grid_driver_for_major(target_major) config['grid']['version'] = latest_version config['grid']['url'] = latest_url