diff --git a/.vscode/settings.json b/.vscode/settings.json index 8ceee9d01..a1517b8b1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -71,6 +71,7 @@ "numpify", "numpy", "nvidia", + "nvml", "odom", "odometry", "particlefilter", @@ -85,6 +86,7 @@ "pretrained", "proto", "protos", + "pyamdgpuinfo", "pyplot", "pywrapper", "Quaterniond", @@ -127,6 +129,7 @@ "unpenalized", "urdf", "vcstool", + "vram", "walkready", "wandb", "webots", diff --git a/pixi.lock b/pixi.lock index a636e7816..00b4af32a 100644 --- a/pixi.lock +++ b/pixi.lock @@ -530,6 +530,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -1504,6 +1505,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -2618,6 +2620,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -3596,6 +3599,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -17410,6 +17414,20 @@ packages: - pkg:pypi/notify2?source=hash-mapping size: 11122 timestamp: 1647371622387 +- conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda + sha256: fb46556c423311638fbb26ed4b2a67cf598044919e42ab7571365a5b4ae3b663 + md5: dc8587ae654e96031728802016e8258c + depends: + - python >=3.10 + constrains: + - pynvml ~=13.0 + - nvidia-ml ==9999999999 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/nvidia-ml-py?source=hash-mapping + size: 48878 + timestamp: 1775592349734 - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda sha256: 3906abfb6511a3bb309e39b9b1b7bc38f50a723971de2395489fd1f379255890 md5: 4c06a92e74452cfa53623a81592e8934 diff --git a/pixi.toml b/pixi.toml index 24faec9bf..600c4ebbe 100644 --- a/pixi.toml +++ b/pixi.toml @@ -70,6 +70,7 @@ mujoco = ">=3.6.0,<4" mypy = ">=1.18.2,<2" ninja = ">=1.13.2,<2" numpy = ">=1.26.4,<2" +nvidia-ml-py = ">=13.595.45,<14" opencv = ">=4.11.0,<5" paramiko = ">=4.0.0,<5" pkg-config = ">=0.29.2,<0.30" diff --git a/src/bitbots_misc/system_monitor/config/config.yaml b/src/bitbots_misc/system_monitor/config/config.yaml index 4ee829080..2ebfd972b 100644 --- a/src/bitbots_misc/system_monitor/config/config.yaml +++ b/src/bitbots_misc/system_monitor/config/config.yaml @@ -1,7 +1,7 @@ system_monitor: ros__parameters: # How many times per second should the system be queried for stats - update_frequency: 10.0 + update_frequency: 2.0 # These settings are quick_switches to completely disable certain parts of statistic collection do_cpu: true diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py index 886f31af9..78a8b750e 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py +++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py @@ -7,8 +7,15 @@ _prev_total = defaultdict(int) _prev_busy = defaultdict(int) +# store last reported usage per cpu to smooth sampling noise +_prev_usage: dict[str, float] = {} -def collect_all(): +# smoothing factor for exponential moving average (0..1) +# higher = more responsive, lower = smoother +_EMA_ALPHA = 0.5 + + +def collect_all() -> tuple[int, list[CpuMsg], float]: """ parse /proc/stat and calculate total and busy time @@ -50,7 +57,7 @@ def _get_cpu_stats(): return timings -def _calculate_usage(cpu_num, total, busy): +def _calculate_usage(cpu_num, total, busy) -> float: """ calculate usage percentage based on busy/total time """ @@ -60,7 +67,18 @@ def _calculate_usage(cpu_num, total, busy): _prev_total[cpu_num] = total _prev_busy[cpu_num] = busy - if diff_total == 0: - return 0 + # compute raw usage and handle edge cases + if diff_total <= 0 or diff_busy <= 0: + raw_usage = 0.0 else: - return float(int(diff_busy / diff_total * 100)) + raw_usage = (diff_busy / diff_total) * 100.0 + + # smooth short-term sampling noise with exponential moving average + prev = _prev_usage.get(cpu_num) + if prev is None: + smoothed = float(round(raw_usage, 2)) + else: + smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2)) + + _prev_usage[cpu_num] = smoothed + return smoothed diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index 2f440c031..ed658e921 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -1,19 +1,210 @@ -import pyamdgpuinfo +import atexit +from pathlib import Path +from rclpy.node import Node + +# Detect available GPU backend (deferred until we have a node for logging) +_gpu_backend = None +_nvml_module = None +_nvml_handle = None +_nvml_shutdown_registered = False + +_JETSON_GPU_LOAD_PATHS = ( + Path("/sys/devices/gpu.0/load"), + Path("/sys/kernel/debug/gpu.0/load"), +) +_JETSON_DEVICE_TREE_PATHS = ( + Path("/proc/device-tree/compatible"), + Path("/proc/device-tree/model"), +) +_JETSON_GPU_TEMPERATURE_TYPES = ("gpu",) + + +def _detect_gpu_backend(node: Node): + """Auto-detect available GPU and return appropriate backend function.""" + global _gpu_backend, _nvml_module, _nvml_handle, _nvml_shutdown_registered + + # Try NVIDIA first (most common in robotics) + try: + import pynvml + except ImportError: + node.get_logger().debug("pynvml not available") + else: + try: + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + if device_count > 0: + _nvml_module = pynvml + _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0) + if not _nvml_shutdown_registered: + atexit.register(_shutdown_nvml) + _nvml_shutdown_registered = True + _gpu_backend = _collect_nvidia + node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)") + return + except Exception as e: + node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}") + + if _detect_jetson_gpu(): + _gpu_backend = _collect_jetson + node.get_logger().info("Detected NVIDIA Jetson GPU (sysfs)") + return + + # Try AMD next + try: + import pyamdgpuinfo + + if pyamdgpuinfo.detect_gpus() > 0: + _gpu_backend = _collect_amd + node.get_logger().info("Detected AMD GPU (pyamdgpuinfo)") + return + except ImportError: + node.get_logger().debug("pyamdgpuinfo not available") + except Exception as e: + node.get_logger().debug(f"AMD GPU detection failed: {e}") + + # No GPU detected + _gpu_backend = _collect_none + node.get_logger().info("No GPU detected; falling back to null backend") + + +def _collect_none(node: Node) -> tuple[float, int, int, float]: + """Null backend when no GPU is available.""" + return (0.0, 0, 0, 0.0) + + +def _collect_nvidia(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from NVIDIA GPU using pynvml.""" + try: + if _nvml_module is None or _nvml_handle is None: + return (0.0, 0, 0, 0.0) + + load = _fraction_from_percent(float(_nvml_module.nvmlDeviceGetUtilizationRates(_nvml_handle).gpu)) + mem_info = _nvml_module.nvmlDeviceGetMemoryInfo(_nvml_handle) + vram_used = mem_info.used + vram_total = mem_info.total + temperature = float(_nvml_module.nvmlDeviceGetTemperature(_nvml_handle, 0)) + return (load, vram_used, vram_total, temperature) + except Exception as e: + node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}") + return (0.0, 0, 0, 0.0) + + +def _detect_jetson_gpu() -> bool: + """Detect NVIDIA Jetson GPUs, which often do not expose NVML devices.""" + if any(_path_exists(path) for path in _JETSON_GPU_LOAD_PATHS): + return True + + for path in _JETSON_DEVICE_TREE_PATHS: + try: + content = path.read_bytes().lower() + except OSError: + continue + if b"nvidia,tegra" in content or b"nvidia jetson" in content: + return True + + return False + + +def _path_exists(path: Path) -> bool: + try: + return path.exists() + except OSError: + return False -def collect_all(): - """ - use pyamdgpuinfo to get gpu metrics +def _read_float(path: Path) -> float | None: + try: + return float(path.read_text().strip()) + except (OSError, ValueError): + return None + + +def _collect_jetson_temperature() -> float: + for thermal_type_path in Path("/sys/devices/virtual/thermal").glob("thermal_zone*/type"): + try: + thermal_type = thermal_type_path.read_text().strip().lower() + except OSError: + continue + + if not any(gpu_type in thermal_type for gpu_type in _JETSON_GPU_TEMPERATURE_TYPES): + continue + + temperature = _read_float(thermal_type_path.with_name("temp")) + if temperature is None: + continue + return temperature / 1000.0 if temperature > 1000 else temperature + + return 0.0 + + +def _collect_jetson(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from NVIDIA Jetson sysfs files.""" + try: + load = 0.0 + for path in _JETSON_GPU_LOAD_PATHS: + raw_load = _read_float(path) + if raw_load is None: + continue + # Jetson reports GPU load in permille on current L4T kernels. + load = _fraction_from_per_mille(raw_load) + break + + temperature = _collect_jetson_temperature() + return (load, 0, 0, temperature) + except Exception as e: + node.get_logger().error(f"Error collecting NVIDIA Jetson GPU metrics: {e}") + return (0.0, 0, 0, 0.0) + + +def _collect_amd(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from AMD GPU using pyamdgpuinfo.""" + try: + import pyamdgpuinfo + + if pyamdgpuinfo.detect_gpus() == 0: + return (0.0, 0, 0, 0.0) + + gpu = pyamdgpuinfo.get_gpu(0) + load = _fraction_from_percent(float(gpu.query_load())) + vram_total = gpu.memory_info["vram_size"] + vram_used = gpu.query_vram_usage() + temperature = float(gpu.query_temperature()) + + return (load, vram_used, vram_total, temperature) + except Exception as e: + node.get_logger().error(f"Error collecting AMD GPU metrics: {e}") + return (0.0, 0, 0, 0.0) + + +def collect_all(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics using the auto-detected backend. + + :param node: ROS node for logging (required for backend detection and error logging) :return: (load, vram_used, vram_total, temperature) """ - if pyamdgpuinfo.detect_gpus() == 0: - return (0, 0, 0, 0) + if _gpu_backend is None: + _detect_gpu_backend(node) + if _gpu_backend is None: + return (0.0, 0, 0, 0.0) + return _gpu_backend(node) + + +def _fraction_from_percent(value: float) -> float: + """Convert percent to fraction and clamp to [0.0..1.0] defensively.""" + return min(max(value / 100.0, 0.0), 1.0) + + +def _fraction_from_per_mille(value: float) -> float: + """Convert permille to fraction and clamp to [0.0..1.0] defensively.""" + return min(max(value / 1000.0, 0.0), 1.0) - gpu = pyamdgpuinfo.get_gpu(0) - load = gpu.query_load() - vram_total = gpu.memory_info["vram_size"] - vram_used = gpu.query_vram_usage() - temperature = gpu.query_temperature() - return (load, vram_used, vram_total, temperature) +def _shutdown_nvml() -> None: + """Release NVML resources at process shutdown.""" + if _nvml_module is None: + return + try: + _nvml_module.nvmlShutdown() + except Exception: + pass diff --git a/src/bitbots_misc/system_monitor/system_monitor/memory.py b/src/bitbots_misc/system_monitor/system_monitor/memory.py index 01942ef0d..54a4a2a65 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/memory.py +++ b/src/bitbots_misc/system_monitor/system_monitor/memory.py @@ -1,7 +1,7 @@ import psutil -def collect_all(): +def collect_all() -> tuple[int, int, int]: """ :return: (memory_available, memory_used, memory_total) """ diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py index 56304400d..2ed5bf1c3 100755 --- a/src/bitbots_misc/system_monitor/system_monitor/monitor.py +++ b/src/bitbots_misc/system_monitor/system_monitor/monitor.py @@ -54,8 +54,8 @@ def main(): while rclpy.ok(): last_send_time = time.time() - running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0) - gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0) + running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0) + gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all(node) if do_gpu else (0.0, 0, 0, 0.0) memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1) interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else [] @@ -116,6 +116,6 @@ def main(): diag_array.header.stamp = node.get_clock().now().to_msg() diagnostic_pub.publish(diag_array) - # sleep to have correct rate. we dont use ROS time since we are interested in system things + # sleep to have correct rate. we don't use ROS time since we are interested in system things dt = time.time() - last_send_time time.sleep(max(0, (1 / rate) - dt)) diff --git a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py index b896bef15..fa338eab4 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py +++ b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py @@ -6,7 +6,7 @@ _prev_msg_time = None -def collect_all(clock): +def collect_all(clock) -> list[NetworkInterfaceMsg]: global _prev_msg_time if _prev_msg_time is None: _prev_msg_time = clock.now() @@ -15,7 +15,7 @@ def collect_all(clock): return list(msgs.values()) -def _get_interfaces(): +def _get_interfaces() -> dict[str, NetworkInterfaceMsg]: """@rtype: dict""" result = {} with open("/proc/net/dev") as file_obj: diff --git a/src/bitbots_msgs/CMakeLists.txt b/src/bitbots_msgs/CMakeLists.txt index 0f5ed631b..5b212e25b 100644 --- a/src/bitbots_msgs/CMakeLists.txt +++ b/src/bitbots_msgs/CMakeLists.txt @@ -42,6 +42,7 @@ rosidl_generate_interfaces( "msg/Strategy.msg" "msg/TeamData.msg" "msg/TTS.msg" + "msg/Workload.msg" "srv/AddAnimation.srv" "srv/ManualPenalize.srv" "srv/SetObjectPose.srv"