feat: dedicated diagnostics dir & more robust CudaCallback

sehoffmann · sehoffmann · commit 1a232aa40d7e · 2026-03-19T14:01:18.000+01:00
diff --git a/dmlcloud/core/callbacks/checkpoint.py b/dmlcloud/core/callbacks/checkpoint.py
@@ -35,7 +35,7 @@ def pre_run(self, pipe: 'Pipeline'):
             dml_checkpoint.create_checkpoint_dir(self.run_dir)
             dml_checkpoint.save_config(pipe.config, self.run_dir)
 
-            with open(pipe.run_dir / "environment.txt", 'w') as f:
+            with open(pipe.run_dir / 'diagnostics' / 'environment.txt', 'w') as f:
                 for k, v in os.environ.items():
                     f.write(f"{k}={v}\n")
 
diff --git a/dmlcloud/core/callbacks/cuda.py b/dmlcloud/core/callbacks/cuda.py
@@ -8,55 +8,69 @@
 from .common import Callback
 
 
+def _call_pynvml(method, *args, **kwargs):  # best-effort wrapper around a single pynvml call
+    try:
+        return method(*args, **kwargs)  # positional/keyword arguments are forwarded unchanged
+    except pynvml.NVMLError:  # only NVML errors are swallowed; any other exception propagates
+        return None  # None marks the value as unavailable for the caller
+
+
+def _get_pynvml_handler(device):  # best-effort lookup of the NVML handle for `device`
+    try:
+        return torch.cuda._get_pynvml_handler(device)  # underscore-prefixed (non-public) torch helper
+    except pynvml.NVMLError:  # NOTE(review): other exception types raised by the helper are not caught - confirm
+        return None  # callers must handle a missing handle
+
+
+def _gather_cuda_info(handler):
+    info = {
+        'name': _call_pynvml(pynvml.nvmlDeviceGetName, handler),
+        'uuid': _call_pynvml(pynvml.nvmlDeviceGetUUID, handler),
+        'serial': _call_pynvml(pynvml.nvmlDeviceGetSerial, handler),
+        'minor_number': _call_pynvml(pynvml.nvmlDeviceGetMinorNumber, handler),
+        'architecture': _call_pynvml(pynvml.nvmlDeviceGetArchitecture, handler),
+        'brand': _call_pynvml(pynvml.nvmlDeviceGetBrand, handler),
+        'vbios_version': _call_pynvml(pynvml.nvmlDeviceGetVbiosVersion, handler),
+        'driver_version': _call_pynvml(pynvml.nvmlSystemGetDriverVersion),
+        'cuda_driver_version': _call_pynvml(pynvml.nvmlSystemGetCudaDriverVersion_v2),
+        'nvml_version': _call_pynvml(pynvml.nvmlSystemGetNVMLVersion),
+        'total_memory': _call_pynvml(pynvml.nvmlDeviceGetMemoryInfo, handler, pynvml.nvmlMemory_v2).total,
+        'reserved_memory': _call_pynvml(pynvml.nvmlDeviceGetMemoryInfo, handler, pynvml.nvmlMemory_v2).reserved,
+        'num_gpu_cores': _call_pynvml(pynvml.nvmlDeviceGetNumGpuCores, handler),
+        'power_managment_limit': _call_pynvml(pynvml.nvmlDeviceGetPowerManagementLimit, handler),
+        'power_managment_default_limit': _call_pynvml(pynvml.nvmlDeviceGetPowerManagementDefaultLimit, handler),
+        'cuda_compute_capability': _call_pynvml(pynvml.nvmlDeviceGetCudaComputeCapability, handler),
+    }
+    return info
+
+
 class CudaCallback(Callback):
     """
     Logs various properties pertaining to CUDA devices.
     """
 
-    @staticmethod
-    def _call_pynvml(method, *args, **kwargs):
-        try:
-            return method(*args, **kwargs)
-        except pynvml.NVMLError:
-            return None
-
     def pre_run(self, pipe):
-        handle = torch.cuda._get_pynvml_handler(pipe.device)
-
-        info = {
-            'name': self._call_pynvml(pynvml.nvmlDeviceGetName, handle),
-            'uuid': self._call_pynvml(pynvml.nvmlDeviceGetUUID, handle),
-            'serial': self._call_pynvml(pynvml.nvmlDeviceGetSerial, handle),
-            'torch_device': str(pipe.device),
-            'minor_number': self._call_pynvml(pynvml.nvmlDeviceGetMinorNumber, handle),
-            'architecture': self._call_pynvml(pynvml.nvmlDeviceGetArchitecture, handle),
-            'brand': self._call_pynvml(pynvml.nvmlDeviceGetBrand, handle),
-            'vbios_version': self._call_pynvml(pynvml.nvmlDeviceGetVbiosVersion, handle),
-            'driver_version': self._call_pynvml(pynvml.nvmlSystemGetDriverVersion),
-            'cuda_driver_version': self._call_pynvml(pynvml.nvmlSystemGetCudaDriverVersion_v2),
-            'nvml_version': self._call_pynvml(pynvml.nvmlSystemGetNVMLVersion),
-            'total_memory': self._call_pynvml(pynvml.nvmlDeviceGetMemoryInfo, handle, pynvml.nvmlMemory_v2).total,
-            'reserved_memory': self._call_pynvml(pynvml.nvmlDeviceGetMemoryInfo, handle, pynvml.nvmlMemory_v2).reserved,
-            'num_gpu_cores': self._call_pynvml(pynvml.nvmlDeviceGetNumGpuCores, handle),
-            'power_managment_limit': self._call_pynvml(pynvml.nvmlDeviceGetPowerManagementLimit, handle),
-            'power_managment_default_limit': self._call_pynvml(pynvml.nvmlDeviceGetPowerManagementDefaultLimit, handle),
-            'cuda_compute_capability': self._call_pynvml(pynvml.nvmlDeviceGetCudaComputeCapability, handle),
-        }
-        all_devices = all_gather_object(info)
+        handler = _get_pynvml_handler(pipe.device)
+        info = _gather_cuda_info(handler) if handler is not None else {}
+        info['torch_device'] = str(pipe.device)
+
+        all_infos = all_gather_object(info)
 
         msg = '* CUDA-DEVICES:\n'
-        info_strings = [
-            f'{info["torch_device"]} -> /dev/nvidia{info["minor_number"]} -> {info["name"]} (UUID: {info["uuid"]}) (VRAM: {info["total_memory"] / 1000 ** 2:.0f} MB)'
-            for info in all_devices
-        ]
+        info_strings = []
+        for info in all_infos:
+            if 'minor_number' in info and 'name' in info and 'uuid' in info:
+                info_strings.append(
+                    f'{info["torch_device"]} -> /dev/nvidia{info["minor_number"]} -> {info["name"]} (UUID: {info["uuid"]}) (VRAM: {info["total_memory"] / 1000 ** 2:.0f} MB)'
+                )
         msg += '\n'.join(f'    - [{i}] {info_str}' for i, info_str in enumerate(info_strings))
         dml_logging.info(msg)
 
         if pipe.run_dir and is_root():
-            self._save(pipe.run_dir / 'cuda_devices.json', all_devices)
+            self._save(pipe.run_dir / 'diagnostics' / 'cuda_devices.json', all_infos)
 
-    def _save(self, path, all_devices):
+    def _save(self, path, all_infos):
         with open(path, 'w') as f:
-            devices = {f'rank_{i}': device for i, device in enumerate(all_devices)}
-            obj = {'devices': devices}
+            dct = {f'rank_{i}': info for i, info in enumerate(all_infos)}
+            obj = {'devices': dct}
             json.dump(obj, f, indent=4)
diff --git a/dmlcloud/core/callbacks/git.py b/dmlcloud/core/callbacks/git.py
@@ -38,7 +38,7 @@ def pre_run(self, pipe):
             return
 
         if pipe.run_dir and is_root():
-            self._save(pipe.run_dir / 'git_diff.txt', diff)
+            self._save(pipe.run_dir / 'diagnostics' / 'git_diff.txt', diff)
 
         self._log_diff(diff)
 
diff --git a/dmlcloud/core/checkpoint.py b/dmlcloud/core/checkpoint.py
@@ -63,6 +63,9 @@ def create_checkpoint_dir(path: Path | str, name: Optional[str] = None) -> Path:
     indicator_file = path / '.dmlcloud'
     indicator_file.touch()
 
+    diagnostics_dir = path / 'diagnostics'
+    diagnostics_dir.mkdir(exist_ok=True)
+
     if slurm_job_id() is not None:
         with open(path / '.slurm-jobid', 'w') as f:
             f.write(slurm_job_id())