From c6fcff2d6f2923c80fea0c741fc3f35d686b020f Mon Sep 17 00:00:00 2001 From: Jan Gutsche Date: Sun, 24 May 2026 18:56:54 +0200 Subject: [PATCH 1/8] Refactor system monitor components for improved data handling and type consistency --- .../system_monitor/config/config.yaml | 2 +- .../system_monitor/system_monitor/cpus.py | 28 +++++++++++++++---- .../system_monitor/system_monitor/gpu.py | 8 +++--- .../system_monitor/system_monitor/memory.py | 2 +- .../system_monitor/system_monitor/monitor.py | 6 ++-- .../system_monitor/network_interfaces.py | 4 +-- src/bitbots_msgs/CMakeLists.txt | 1 + 7 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/bitbots_misc/system_monitor/config/config.yaml b/src/bitbots_misc/system_monitor/config/config.yaml index 4ee829080..2ebfd972b 100644 --- a/src/bitbots_misc/system_monitor/config/config.yaml +++ b/src/bitbots_misc/system_monitor/config/config.yaml @@ -1,7 +1,7 @@ system_monitor: ros__parameters: # How many times per second should the system be queried for stats - update_frequency: 10.0 + update_frequency: 2.0 # These settings are quick_switches to completely disable certain parts of statistic collection do_cpu: true diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py index 886f31af9..31868c2b9 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py +++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py @@ -7,8 +7,15 @@ _prev_total = defaultdict(int) _prev_busy = defaultdict(int) +# store last reported usage per cpu to smooth sampling noise +_prev_usage = defaultdict(float) -def collect_all(): +# smoothing factor for exponential moving average (0..1) +# higher = more responsive, lower = smoother +_EMA_ALPHA = 0.5 + + +def collect_all() -> tuple[int, list[CpuMsg], float]: """ parse /proc/stat and calculate total and busy time @@ -50,7 +57,7 @@ def _get_cpu_stats(): return timings -def _calculate_usage(cpu_num, total, busy): +def _calculate_usage(cpu_num, total, busy) -> float: """ calculate usage percentage based on busy/total time """ @@ -60,7 +67,18 @@ def _calculate_usage(cpu_num, total, busy): _prev_total[cpu_num] = total _prev_busy[cpu_num] = busy - if diff_total == 0: - return 0 + # compute raw usage and handle edge cases + if diff_total <= 0 or diff_busy <= 0: + raw_usage = 0.0 else: - return float(int(diff_busy / diff_total * 100)) + raw_usage = (diff_busy / diff_total) * 100.0 + + # smooth short-term sampling noise with exponential moving average + prev = _prev_usage[cpu_num] + if prev == 0.0: + smoothed = float(round(raw_usage, 2)) + else: + smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2)) + + _prev_usage[cpu_num] = smoothed + return smoothed diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index 2f440c031..f3abb5e40 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -1,19 +1,19 @@ import pyamdgpuinfo -def collect_all(): +def collect_all() -> tuple[float, int, int, float]: """ use pyamdgpuinfo to get gpu metrics :return: (load, vram_used, vram_total, temperature) """ if pyamdgpuinfo.detect_gpus() == 0: - return (0, 0, 0, 0) + return (0.0, 0, 0, 0.0) gpu = pyamdgpuinfo.get_gpu(0) - load = gpu.query_load() + load = float(gpu.query_load()) vram_total = gpu.memory_info["vram_size"] vram_used = gpu.query_vram_usage() - temperature = gpu.query_temperature() + temperature = float(gpu.query_temperature()) return (load, vram_used, vram_total, temperature) diff --git a/src/bitbots_misc/system_monitor/system_monitor/memory.py b/src/bitbots_misc/system_monitor/system_monitor/memory.py index 01942ef0d..54a4a2a65 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/memory.py +++ b/src/bitbots_misc/system_monitor/system_monitor/memory.py @@ -1,7 +1,7 @@ import psutil -def collect_all(): +def collect_all() -> tuple[int, int, int]: """ :return: (memory_available, memory_used, memory_total) """ diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py index 56304400d..ca859975a 100755 --- a/src/bitbots_misc/system_monitor/system_monitor/monitor.py +++ b/src/bitbots_misc/system_monitor/system_monitor/monitor.py @@ -54,8 +54,8 @@ def main(): while rclpy.ok(): last_send_time = time.time() - running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0) - gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0) + running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0) + gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0.0, 0, 0, 0.0) memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1) interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else [] @@ -116,6 +116,6 @@ def main(): diag_array.header.stamp = node.get_clock().now().to_msg() diagnostic_pub.publish(diag_array) - # sleep to have correct rate. we dont use ROS time since we are interested in system things + # sleep to have correct rate. we don't use ROS time since we are interested in system things dt = time.time() - last_send_time time.sleep(max(0, (1 / rate) - dt)) diff --git a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py index b896bef15..fa338eab4 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py +++ b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py @@ -6,7 +6,7 @@ _prev_msg_time = None -def collect_all(clock): +def collect_all(clock) -> list[NetworkInterfaceMsg]: global _prev_msg_time if _prev_msg_time is None: _prev_msg_time = clock.now() @@ -15,7 +15,7 @@ def collect_all(clock): return list(msgs.values()) -def _get_interfaces(): +def _get_interfaces() -> dict[str, NetworkInterfaceMsg]: """@rtype: dict""" result = {} with open("/proc/net/dev") as file_obj: diff --git a/src/bitbots_msgs/CMakeLists.txt b/src/bitbots_msgs/CMakeLists.txt index 0f5ed631b..5b212e25b 100644 --- a/src/bitbots_msgs/CMakeLists.txt +++ b/src/bitbots_msgs/CMakeLists.txt @@ -42,6 +42,7 @@ rosidl_generate_interfaces( "msg/Strategy.msg" "msg/TeamData.msg" "msg/TTS.msg" + "msg/Workload.msg" "srv/AddAnimation.srv" "srv/ManualPenalize.srv" "srv/SetObjectPose.srv" From 33366a84aaa70635ba0ae1e96a5b896028919447 Mon Sep 17 00:00:00 2001 From: Jan Gutsche Date: Sun, 24 May 2026 19:52:17 +0200 Subject: [PATCH 2/8] Fix system monitoring (on intel system) Enhance GPU monitoring by integrating NVIDIA and AMD detection, updating collection methods, and adding support for nvidia-ml-py package --- .vscode/settings.json | 3 + pixi.lock | 18 +++ pixi.toml | 1 + .../system_monitor/system_monitor/gpu.py | 123 ++++++++++++++++-- .../system_monitor/system_monitor/monitor.py | 2 +- 5 files changed, 133 insertions(+), 14 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 8ceee9d01..a1517b8b1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -71,6 +71,7 @@ "numpify", "numpy", "nvidia", + "nvml", "odom", "odometry", "particlefilter", @@ -85,6 +86,7 @@ "pretrained", "proto", "protos", + "pyamdgpuinfo", "pyplot", "pywrapper", "Quaterniond", @@ -127,6 +129,7 @@ "unpenalized", "urdf", "vcstool", + "vram", "walkready", "wandb", "webots", diff --git a/pixi.lock b/pixi.lock index a636e7816..00b4af32a 100644 --- a/pixi.lock +++ b/pixi.lock @@ -530,6 +530,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -1504,6 +1505,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -2618,6 +2620,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -3596,6 +3599,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda @@ -17410,6 +17414,20 @@ packages: - pkg:pypi/notify2?source=hash-mapping size: 11122 timestamp: 1647371622387 +- conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda + sha256: fb46556c423311638fbb26ed4b2a67cf598044919e42ab7571365a5b4ae3b663 + md5: dc8587ae654e96031728802016e8258c + depends: + - python >=3.10 + constrains: + - pynvml ~=13.0 + - nvidia-ml ==9999999999 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/nvidia-ml-py?source=hash-mapping + size: 48878 + timestamp: 1775592349734 - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda sha256: 3906abfb6511a3bb309e39b9b1b7bc38f50a723971de2395489fd1f379255890 md5: 4c06a92e74452cfa53623a81592e8934 diff --git a/pixi.toml b/pixi.toml index 24faec9bf..600c4ebbe 100644 --- a/pixi.toml +++ b/pixi.toml @@ -70,6 +70,7 @@ mujoco = ">=3.6.0,<4" mypy = ">=1.18.2,<2" ninja = ">=1.13.2,<2" numpy = ">=1.26.4,<2" +nvidia-ml-py = ">=13.595.45,<14" opencv = ">=4.11.0,<5" paramiko = ">=4.0.0,<5" pkg-config = ">=0.29.2,<0.30" diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index f3abb5e40..c1d764543 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -1,19 +1,116 @@ -import pyamdgpuinfo +from rclpy.node import Node +# Detect available GPU backend (deferred until we have a node for logging) +_gpu_backend = None -def collect_all() -> tuple[float, int, int, float]: - """ - use pyamdgpuinfo to get gpu metrics - :return: (load, vram_used, vram_total, temperature) - """ - if pyamdgpuinfo.detect_gpus() == 0: +def _detect_gpu_backend(node: Node): + """Auto-detect available GPU and return appropriate backend function.""" + global _gpu_backend + + # Try NVIDIA first (most common in robotics) + try: + import pynvml + + # nvmlInit can fail if the NVIDIA driver/ NVML library is not installed + try: + pynvml.nvmlInit() + except Exception as e: + node.get_logger().debug(f"pynvml present but nvmlInit failed: {e}") + else: + try: + device_count = pynvml.nvmlDeviceGetCount() + if device_count > 0: + _gpu_backend = _collect_nvidia + node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)") + return + except Exception as e: + node.get_logger().debug(f"NVIDIA GPU detection failed: {e}") + finally: + try: + pynvml.nvmlShutdown() + except Exception: + pass + except ImportError: + node.get_logger().debug("pynvml not available") + + # Try AMD next + try: + import pyamdgpuinfo + + if pyamdgpuinfo.detect_gpus() > 0: + _gpu_backend = _collect_amd + node.get_logger().info("Detected AMD GPU (pyamdgpuinfo)") + return + except ImportError: + node.get_logger().debug("pyamdgpuinfo not available") + except Exception as e: + node.get_logger().debug(f"AMD GPU detection failed: {e}") + + # No GPU detected + _gpu_backend = _collect_none + node.get_logger().info("No GPU detected; falling back to null backend") + + +def _collect_none(node: Node) -> tuple[float, int, int, float]: + """Null backend when no GPU is available.""" + return (0.0, 0, 0, 0.0) + + +def _collect_nvidia(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from NVIDIA GPU using pynvml.""" + try: + import pynvml + + pynvml.nvmlInit() + try: + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + load = float(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + vram_used = mem_info.used + vram_total = mem_info.total + temperature = float(pynvml.nvmlDeviceGetTemperature(handle, 0)) + return (load, vram_used, vram_total, temperature) + finally: + try: + pynvml.nvmlShutdown() + except Exception: + pass + except Exception as e: + node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}") + return (0.0, 0, 0, 0.0) + + +def _collect_amd(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from AMD GPU using pyamdgpuinfo.""" + try: + import pyamdgpuinfo + + if pyamdgpuinfo.detect_gpus() == 0: + return (0.0, 0, 0, 0.0) + + gpu = pyamdgpuinfo.get_gpu(0) + load = float(gpu.query_load()) + vram_total = gpu.memory_info["vram_size"] + vram_used = gpu.query_vram_usage() + temperature = float(gpu.query_temperature()) + + return (load, vram_used, vram_total, temperature) + except Exception as e: + node.get_logger().error(f"Error collecting AMD GPU metrics: {e}") return (0.0, 0, 0, 0.0) - gpu = pyamdgpuinfo.get_gpu(0) - load = float(gpu.query_load()) - vram_total = gpu.memory_info["vram_size"] - vram_used = gpu.query_vram_usage() - temperature = float(gpu.query_temperature()) - return (load, vram_used, vram_total, temperature) +def collect_all(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics using the auto-detected backend. + + If `node` is provided the ROS node's logger will be used for messages. + + node: ROS node for logging (required for backend detection and error logging) + :return: (load, vram_used, vram_total, temperature) + """ + if _gpu_backend is None: + _detect_gpu_backend(node) + if _gpu_backend is None: + return (0.0, 0, 0, 0.0) + return _gpu_backend(node) diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py index ca859975a..2ed5bf1c3 100755 --- a/src/bitbots_misc/system_monitor/system_monitor/monitor.py +++ b/src/bitbots_misc/system_monitor/system_monitor/monitor.py @@ -55,7 +55,7 @@ def main(): while rclpy.ok(): last_send_time = time.time() running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0) - gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0.0, 0, 0, 0.0) + gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all(node) if do_gpu else (0.0, 0, 0, 0.0) memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1) interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else [] From c6842cf0dfe377b5c17b78370b59b810b9b00dbd Mon Sep 17 00:00:00 2001 From: Jan Gutsche Date: Sun, 24 May 2026 22:01:47 +0200 Subject: [PATCH 3/8] Fix system_monitor GPU on jetson --- .../system_monitor/system_monitor/gpu.py | 115 +++++++++++++++--- 1 file changed, 97 insertions(+), 18 deletions(-) diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index c1d764543..bd3819053 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -1,8 +1,20 @@ +from pathlib import Path + from rclpy.node import Node # Detect available GPU backend (deferred until we have a node for logging) _gpu_backend = None +_JETSON_GPU_LOAD_PATHS = ( + Path("/sys/devices/gpu.0/load"), + Path("/sys/kernel/debug/gpu.0/load"), +) +_JETSON_DEVICE_TREE_PATHS = ( + Path("/proc/device-tree/compatible"), + Path("/proc/device-tree/model"), +) +_JETSON_GPU_TEMPERATURE_TYPES = ("gpu",) + def _detect_gpu_backend(node: Node): """Auto-detect available GPU and return appropriate backend function.""" @@ -11,28 +23,28 @@ def _detect_gpu_backend(node: Node): # Try NVIDIA first (most common in robotics) try: import pynvml - - # nvmlInit can fail if the NVIDIA driver/ NVML library is not installed + except ImportError: + node.get_logger().debug("pynvml not available") + else: try: pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + if device_count > 0: + _gpu_backend = _collect_nvidia + node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)") + return except Exception as e: - node.get_logger().debug(f"pynvml present but nvmlInit failed: {e}") - else: + node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}") + finally: try: - device_count = pynvml.nvmlDeviceGetCount() - if device_count > 0: - _gpu_backend = _collect_nvidia - node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)") - return - except Exception as e: - node.get_logger().debug(f"NVIDIA GPU detection failed: {e}") - finally: - try: - pynvml.nvmlShutdown() - except Exception: - pass - except ImportError: - node.get_logger().debug("pynvml not available") + pynvml.nvmlShutdown() + except Exception: + pass + + if _detect_jetson_gpu(): + _gpu_backend = _collect_jetson + node.get_logger().info("Detected NVIDIA Jetson GPU (sysfs)") + return # Try AMD next try: @@ -81,6 +93,73 @@ def _collect_nvidia(node: Node) -> tuple[float, int, int, float]: return (0.0, 0, 0, 0.0) +def _detect_jetson_gpu() -> bool: + """Detect NVIDIA Jetson GPUs, which often do not expose NVML devices.""" + if any(_path_exists(path) for path in _JETSON_GPU_LOAD_PATHS): + return True + + for path in _JETSON_DEVICE_TREE_PATHS: + try: + content = path.read_bytes().lower() + except OSError: + continue + if b"nvidia,tegra" in content or b"nvidia jetson" in content: + return True + + return False + + +def _path_exists(path: Path) -> bool: + try: + return path.exists() + except OSError: + return False + + +def _read_float(path: Path) -> float | None: + try: + return float(path.read_text().strip()) + except (OSError, ValueError): + return None + + +def _collect_jetson_temperature() -> float: + for thermal_type_path in Path("/sys/devices/virtual/thermal").glob("thermal_zone*/type"): + try: + thermal_type = thermal_type_path.read_text().strip().lower() + except OSError: + continue + + if not any(gpu_type in thermal_type for gpu_type in _JETSON_GPU_TEMPERATURE_TYPES): + continue + + temperature = _read_float(thermal_type_path.with_name("temp")) + if temperature is None: + continue + return temperature / 1000.0 if temperature > 1000 else temperature + + return 0.0 + + +def _collect_jetson(node: Node) -> tuple[float, int, int, float]: + """Collect GPU metrics from NVIDIA Jetson sysfs files.""" + try: + load = 0.0 + for path in _JETSON_GPU_LOAD_PATHS: + raw_load = _read_float(path) + if raw_load is None: + continue + # Jetson reports GPU load in permille on current L4T kernels. + load = raw_load / 10.0 + break + + temperature = _collect_jetson_temperature() + return (load, 0, 0, temperature) + except Exception as e: + node.get_logger().error(f"Error collecting NVIDIA Jetson GPU metrics: {e}") + return (0.0, 0, 0, 0.0) + + def _collect_amd(node: Node) -> tuple[float, int, int, float]: """Collect GPU metrics from AMD GPU using pyamdgpuinfo.""" try: From 167b9385fe5bdc1fc1882658e8d7ceba9334398f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 06:21:41 +0000 Subject: [PATCH 4/8] Fix system monitor GPU unit consistency and CPU EMA init Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742 Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com> --- .../system_monitor/system_monitor/cpus.py | 4 +- .../system_monitor/system_monitor/gpu.py | 61 +++++++++++-------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py index 31868c2b9..e53d31074 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py +++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py @@ -8,7 +8,7 @@ _prev_busy = defaultdict(int) # store last reported usage per cpu to smooth sampling noise -_prev_usage = defaultdict(float) +_prev_usage: dict[str, float | None] = defaultdict(lambda: None) # smoothing factor for exponential moving average (0..1) # higher = more responsive, lower = smoother @@ -75,7 +75,7 @@ def _calculate_usage(cpu_num, total, busy) -> float: # smooth short-term sampling noise with exponential moving average prev = _prev_usage[cpu_num] - if prev == 0.0: + if prev is None: smoothed = float(round(raw_usage, 2)) else: smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2)) diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index bd3819053..f1baffcb3 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -1,9 +1,13 @@ +import atexit from pathlib import Path from rclpy.node import Node # Detect available GPU backend (deferred until we have a node for logging) _gpu_backend = None +_nvml_module = None +_nvml_handle = None +_nvml_shutdown_registered = False _JETSON_GPU_LOAD_PATHS = ( Path("/sys/devices/gpu.0/load"), @@ -30,16 +34,17 @@ def _detect_gpu_backend(node: Node): pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() if device_count > 0: + global _nvml_module, _nvml_handle, _nvml_shutdown_registered + _nvml_module = pynvml + _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0) + if not _nvml_shutdown_registered: + atexit.register(_shutdown_nvml) + _nvml_shutdown_registered = True _gpu_backend = _collect_nvidia node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)") return except Exception as e: node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}") - finally: - try: - pynvml.nvmlShutdown() - except Exception: - pass if _detect_jetson_gpu(): _gpu_backend = _collect_jetson @@ -72,22 +77,15 @@ def _collect_none(node: Node) -> tuple[float, int, int, float]: def _collect_nvidia(node: Node) -> tuple[float, int, int, float]: """Collect GPU metrics from NVIDIA GPU using pynvml.""" try: - import pynvml + if _nvml_module is None or _nvml_handle is None: + return (0.0, 0, 0, 0.0) - pynvml.nvmlInit() - try: - handle = pynvml.nvmlDeviceGetHandleByIndex(0) - load = float(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - vram_used = mem_info.used - vram_total = mem_info.total - temperature = float(pynvml.nvmlDeviceGetTemperature(handle, 0)) - return (load, vram_used, vram_total, temperature) - finally: - try: - pynvml.nvmlShutdown() - except Exception: - pass + load = _fraction_from_percent(float(_nvml_module.nvmlDeviceGetUtilizationRates(_nvml_handle).gpu)) + mem_info = _nvml_module.nvmlDeviceGetMemoryInfo(_nvml_handle) + vram_used = mem_info.used + vram_total = mem_info.total + temperature = float(_nvml_module.nvmlDeviceGetTemperature(_nvml_handle, 0)) + return (load, vram_used, vram_total, temperature) except Exception as e: node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}") return (0.0, 0, 0, 0.0) @@ -150,7 +148,7 @@ def _collect_jetson(node: Node) -> tuple[float, int, int, float]: if raw_load is None: continue # Jetson reports GPU load in permille on current L4T kernels. - load = raw_load / 10.0 + load = _fraction_from_per_mille(raw_load) break temperature = _collect_jetson_temperature() @@ -169,7 +167,7 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]: return (0.0, 0, 0, 0.0) gpu = pyamdgpuinfo.get_gpu(0) - load = float(gpu.query_load()) + load = _fraction_from_percent(float(gpu.query_load())) vram_total = gpu.memory_info["vram_size"] vram_used = gpu.query_vram_usage() temperature = float(gpu.query_temperature()) @@ -183,8 +181,6 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]: def collect_all(node: Node) -> tuple[float, int, int, float]: """Collect GPU metrics using the auto-detected backend. - If `node` is provided the ROS node's logger will be used for messages. - node: ROS node for logging (required for backend detection and error logging) :return: (load, vram_used, vram_total, temperature) """ @@ -193,3 +189,20 @@ def collect_all(node: Node) -> tuple[float, int, int, float]: if _gpu_backend is None: return (0.0, 0, 0, 0.0) return _gpu_backend(node) + + +def _fraction_from_percent(value: float) -> float: + return min(max(value / 100.0, 0.0), 1.0) + + +def _fraction_from_per_mille(value: float) -> float: + return min(max(value / 1000.0, 0.0), 1.0) + + +def _shutdown_nvml() -> None: + if _nvml_module is None: + return + try: + _nvml_module.nvmlShutdown() + except Exception: + pass From a0fa5b3e24c1cb5983db1684976ac89a3478cd32 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 06:22:58 +0000 Subject: [PATCH 5/8] Add docstrings for GPU normalization helpers Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742 Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com> --- src/bitbots_misc/system_monitor/system_monitor/gpu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index f1baffcb3..18a8e0d84 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -192,14 +192,17 @@ def collect_all(node: Node) -> tuple[float, int, int, float]: def _fraction_from_percent(value: float) -> float: + """Convert percent [0..100] to clamped fraction [0.0..1.0].""" return min(max(value / 100.0, 0.0), 1.0) def _fraction_from_per_mille(value: float) -> float: + """Convert permille [0..1000] to clamped fraction [0.0..1.0].""" return min(max(value / 1000.0, 0.0), 1.0) def _shutdown_nvml() -> None: + """Release NVML resources at process shutdown.""" if _nvml_module is None: return try: From fbc7c96c7fe5a899d28fd547acbcdf1ba1ae2ee3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 06:23:56 +0000 Subject: [PATCH 6/8] Polish GPU backend globals and docstring parameter format Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742 Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com> --- src/bitbots_misc/system_monitor/system_monitor/gpu.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index 18a8e0d84..b90b330a9 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -22,7 +22,7 @@ def _detect_gpu_backend(node: Node): """Auto-detect available GPU and return appropriate backend function.""" - global _gpu_backend + global _gpu_backend, _nvml_module, _nvml_handle, _nvml_shutdown_registered # Try NVIDIA first (most common in robotics) try: @@ -34,7 +34,6 @@ def _detect_gpu_backend(node: Node): pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() if device_count > 0: - global _nvml_module, _nvml_handle, _nvml_shutdown_registered _nvml_module = pynvml _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0) if not _nvml_shutdown_registered: @@ -181,7 +180,7 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]: def collect_all(node: Node) -> tuple[float, int, int, float]: """Collect GPU metrics using the auto-detected backend. - node: ROS node for logging (required for backend detection and error logging) + :param node: ROS node for logging (required for backend detection and error logging) :return: (load, vram_used, vram_total, temperature) """ if _gpu_backend is None: From 66048af473ecdbefee28f117c3a46714a9117f1b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 06:24:49 +0000 Subject: [PATCH 7/8] Use explicit dict lookup for CPU EMA previous usage Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742 Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com> --- src/bitbots_misc/system_monitor/system_monitor/cpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py index e53d31074..78a8b750e 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py +++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py @@ -8,7 +8,7 @@ _prev_busy = defaultdict(int) # store last reported usage per cpu to smooth sampling noise -_prev_usage: dict[str, float | None] = defaultdict(lambda: None) +_prev_usage: dict[str, float] = {} # smoothing factor for exponential moving average (0..1) # higher = more responsive, lower = smoother @@ -74,7 +74,7 @@ def _calculate_usage(cpu_num, total, busy) -> float: raw_usage = (diff_busy / diff_total) * 100.0 # smooth short-term sampling noise with exponential moving average - prev = _prev_usage[cpu_num] + prev = _prev_usage.get(cpu_num) if prev is None: smoothed = float(round(raw_usage, 2)) else: From bf629a38de07b530990bd7539083729ba17fe2c8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 06:25:43 +0000 Subject: [PATCH 8/8] Clarify defensive clamping in GPU helper docstrings Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742 Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com> --- src/bitbots_misc/system_monitor/system_monitor/gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py index b90b330a9..ed658e921 100644 --- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py +++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py @@ -191,12 +191,12 @@ def collect_all(node: Node) -> tuple[float, int, int, float]: def _fraction_from_percent(value: float) -> float: - """Convert percent [0..100] to clamped fraction [0.0..1.0].""" + """Convert percent to fraction and clamp to [0.0..1.0] defensively.""" return min(max(value / 100.0, 0.0), 1.0) def _fraction_from_per_mille(value: float) -> float: - """Convert permille [0..1000] to clamped fraction [0.0..1.0].""" + """Convert permille to fraction and clamp to [0.0..1.0] defensively.""" return min(max(value / 1000.0, 0.0), 1.0)