|
8 | 8 | from .common import Callback |
9 | 9 |
|
10 | 10 |
|
| 11 | +def _call_pynvml(method, *args, **kwargs): |
| 12 | + try: |
| 13 | + return method(*args, **kwargs) |
| 14 | + except pynvml.NVMLError: |
| 15 | + return None |
| 16 | + |
| 17 | + |
def _get_pynvml_handler(device):
    """Fetch the NVML device handle for *device*, or None when unavailable.

    Args:
        device: A torch device (or device spec) to look up.

    Returns:
        The NVML handle, or None if NVML raised an error during lookup.
    """
    try:
        handler = torch.cuda._get_pynvml_handler(device)
    except pynvml.NVMLError:
        handler = None
    return handler
| 24 | + |
def _gather_cuda_info(handler):
    """Query NVML for static properties of the device behind *handler*.

    Each field is queried independently through _call_pynvml, so a single
    failing query yields None for that field rather than aborting the rest.

    Args:
        handler: An NVML device handle (see _get_pynvml_handler).

    Returns:
        dict: Device properties keyed by name; values are None where the
        corresponding NVML query failed.
    """
    # Fetch the memory struct exactly once (the previous code queried NVML
    # twice for it) and guard against failure: _call_pynvml returns None on
    # NVMLError, and dereferencing None.total / None.reserved would raise
    # AttributeError, defeating the purpose of the safe wrapper.
    memory = _call_pynvml(pynvml.nvmlDeviceGetMemoryInfo, handler, pynvml.nvmlMemory_v2)
    info = {
        'name': _call_pynvml(pynvml.nvmlDeviceGetName, handler),
        'uuid': _call_pynvml(pynvml.nvmlDeviceGetUUID, handler),
        'serial': _call_pynvml(pynvml.nvmlDeviceGetSerial, handler),
        'minor_number': _call_pynvml(pynvml.nvmlDeviceGetMinorNumber, handler),
        'architecture': _call_pynvml(pynvml.nvmlDeviceGetArchitecture, handler),
        'brand': _call_pynvml(pynvml.nvmlDeviceGetBrand, handler),
        'vbios_version': _call_pynvml(pynvml.nvmlDeviceGetVbiosVersion, handler),
        'driver_version': _call_pynvml(pynvml.nvmlSystemGetDriverVersion),
        'cuda_driver_version': _call_pynvml(pynvml.nvmlSystemGetCudaDriverVersion_v2),
        'nvml_version': _call_pynvml(pynvml.nvmlSystemGetNVMLVersion),
        'total_memory': memory.total if memory is not None else None,
        'reserved_memory': memory.reserved if memory is not None else None,
        'num_gpu_cores': _call_pynvml(pynvml.nvmlDeviceGetNumGpuCores, handler),
        # NOTE(review): 'managment' is misspelled, but these keys are kept
        # verbatim for backward compatibility with consumers of the saved JSON.
        'power_managment_limit': _call_pynvml(pynvml.nvmlDeviceGetPowerManagementLimit, handler),
        'power_managment_default_limit': _call_pynvml(pynvml.nvmlDeviceGetPowerManagementDefaultLimit, handler),
        'cuda_compute_capability': _call_pynvml(pynvml.nvmlDeviceGetCudaComputeCapability, handler),
    }
    return info
| 45 | + |
| 46 | + |
class CudaCallback(Callback):
    """
    Logs various properties pertaining to CUDA devices.

    On pipeline start, queries NVML for the local device, gathers the
    per-rank results, logs a one-line summary per usable device, and (on
    the root rank) saves the full info as JSON under the run directory.
    """

    def pre_run(self, pipe):
        """Collect, log, and persist CUDA device info across all ranks.

        Args:
            pipe: Pipeline object providing `device` and `run_dir`.
        """
        handler = _get_pynvml_handler(pipe.device)
        info = _gather_cuda_info(handler) if handler is not None else {}
        info['torch_device'] = str(pipe.device)

        # Gather every rank's info so the root rank can log and save the
        # complete picture.
        all_infos = all_gather_object(info)

        msg = '* CUDA-DEVICES:\n'
        info_strings = []
        for info in all_infos:
            # Skip ranks whose NVML queries failed: keys may be absent (no
            # handler) or present but None (individual query failed).
            # Checking total_memory as well prevents a TypeError in the
            # VRAM formatting below when the memory query returned None.
            required = ('minor_number', 'name', 'uuid', 'total_memory')
            if any(info.get(key) is None for key in required):
                continue
            info_strings.append(
                f'{info["torch_device"]} -> /dev/nvidia{info["minor_number"]} -> {info["name"]} (UUID: {info["uuid"]}) (VRAM: {info["total_memory"] / 1000 ** 2:.0f} MB)'
            )
        msg += '\n'.join(f' - [{i}] {info_str}' for i, info_str in enumerate(info_strings))
        dml_logging.info(msg)

        if pipe.run_dir and is_root():
            self._save(pipe.run_dir / 'diagnostics' / 'cuda_devices.json', all_infos)

    def _save(self, path, all_infos):
        """Write the gathered per-rank device info to *path* as JSON.

        Args:
            path: pathlib.Path of the destination JSON file.
            all_infos: One info dict per rank, ordered by rank.
        """
        # The 'diagnostics' subdirectory may not exist yet on a fresh run
        # directory; create it so open(..., 'w') does not raise
        # FileNotFoundError.
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w') as f:
            dct = {f'rank_{i}': info for i, info in enumerate(all_infos)}
            obj = {'devices': dct}
            json.dump(obj, f, indent=4)
0 commit comments