From c6fcff2d6f2923c80fea0c741fc3f35d686b020f Mon Sep 17 00:00:00 2001
From: Jan Gutsche <github@jaagut.de>
Date: Sun, 24 May 2026 18:56:54 +0200
Subject: [PATCH 1/8] Refactor system monitor components for improved data
 handling and type consistency

---
 .../system_monitor/config/config.yaml         |  2 +-
 .../system_monitor/system_monitor/cpus.py     | 28 +++++++++++++++----
 .../system_monitor/system_monitor/gpu.py      |  8 +++---
 .../system_monitor/system_monitor/memory.py   |  2 +-
 .../system_monitor/system_monitor/monitor.py  |  6 ++--
 .../system_monitor/network_interfaces.py      |  4 +--
 src/bitbots_msgs/CMakeLists.txt               |  1 +
 7 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/config/config.yaml b/src/bitbots_misc/system_monitor/config/config.yaml
index 4ee829080..2ebfd972b 100644
--- a/src/bitbots_misc/system_monitor/config/config.yaml
+++ b/src/bitbots_misc/system_monitor/config/config.yaml
@@ -1,7 +1,7 @@
 system_monitor:
   ros__parameters:
     # How many times per second should the system be queried for stats
-    update_frequency: 10.0
+    update_frequency: 2.0
 
     # These settings are quick_switches to completely disable certain parts of statistic collection
     do_cpu: true
diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
index 886f31af9..31868c2b9 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
@@ -7,8 +7,15 @@
 _prev_total = defaultdict(int)
 _prev_busy = defaultdict(int)
 
+# store last reported usage per cpu to smooth sampling noise
+_prev_usage = defaultdict(float)
 
-def collect_all():
+# smoothing factor for exponential moving average (0..1)
+# higher = more responsive, lower = smoother
+_EMA_ALPHA = 0.5
+
+
+def collect_all() -> tuple[int, list[CpuMsg], float]:
     """
     parse /proc/stat and calculate total and busy time
 
@@ -50,7 +57,7 @@ def _get_cpu_stats():
     return timings
 
 
-def _calculate_usage(cpu_num, total, busy):
+def _calculate_usage(cpu_num, total, busy) -> float:
     """
     calculate usage percentage based on busy/total time
     """
@@ -60,7 +67,18 @@ def _calculate_usage(cpu_num, total, busy):
     _prev_total[cpu_num] = total
     _prev_busy[cpu_num] = busy
 
-    if diff_total == 0:
-        return 0
+    # compute raw usage and handle edge cases
+    if diff_total <= 0 or diff_busy <= 0:
+        raw_usage = 0.0
     else:
-        return float(int(diff_busy / diff_total * 100))
+        raw_usage = (diff_busy / diff_total) * 100.0
+
+    # smooth short-term sampling noise with exponential moving average
+    prev = _prev_usage[cpu_num]
+    if prev == 0.0:
+        smoothed = float(round(raw_usage, 2))
+    else:
+        smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2))
+
+    _prev_usage[cpu_num] = smoothed
+    return smoothed
diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index 2f440c031..f3abb5e40 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -1,19 +1,19 @@
 import pyamdgpuinfo
 
 
-def collect_all():
+def collect_all() -> tuple[float, int, int, float]:
     """
     use pyamdgpuinfo to get gpu metrics
 
     :return: (load, vram_used, vram_total, temperature)
     """
     if pyamdgpuinfo.detect_gpus() == 0:
-        return (0, 0, 0, 0)
+        return (0.0, 0, 0, 0.0)
 
     gpu = pyamdgpuinfo.get_gpu(0)
-    load = gpu.query_load()
+    load = float(gpu.query_load())
     vram_total = gpu.memory_info["vram_size"]
     vram_used = gpu.query_vram_usage()
-    temperature = gpu.query_temperature()
+    temperature = float(gpu.query_temperature())
 
     return (load, vram_used, vram_total, temperature)
diff --git a/src/bitbots_misc/system_monitor/system_monitor/memory.py b/src/bitbots_misc/system_monitor/system_monitor/memory.py
index 01942ef0d..54a4a2a65 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/memory.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/memory.py
@@ -1,7 +1,7 @@
 import psutil
 
 
-def collect_all():
+def collect_all() -> tuple[int, int, int]:
     """
     :return: (memory_available, memory_used, memory_total)
     """
diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py
index 56304400d..ca859975a 100755
--- a/src/bitbots_misc/system_monitor/system_monitor/monitor.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/monitor.py
@@ -54,8 +54,8 @@ def main():
 
     while rclpy.ok():
         last_send_time = time.time()
-        running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0)
-        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0)
+        running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0)
+        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0.0, 0, 0, 0.0)
         memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1)
         interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else []
 
@@ -116,6 +116,6 @@ def main():
         diag_array.header.stamp = node.get_clock().now().to_msg()
         diagnostic_pub.publish(diag_array)
 
-        # sleep to have correct rate. we dont use ROS time since we are interested in system things
+        # sleep to have correct rate. we don't use ROS time since we are interested in system things
         dt = time.time() - last_send_time
         time.sleep(max(0, (1 / rate) - dt))
diff --git a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py
index b896bef15..fa338eab4 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/network_interfaces.py
@@ -6,7 +6,7 @@
 _prev_msg_time = None
 
 
-def collect_all(clock):
+def collect_all(clock) -> list[NetworkInterfaceMsg]:
     global _prev_msg_time
     if _prev_msg_time is None:
         _prev_msg_time = clock.now()
@@ -15,7 +15,7 @@ def collect_all(clock):
     return list(msgs.values())
 
 
-def _get_interfaces():
+def _get_interfaces() -> dict[str, NetworkInterfaceMsg]:
     """@rtype: dict"""
     result = {}
     with open("/proc/net/dev") as file_obj:
diff --git a/src/bitbots_msgs/CMakeLists.txt b/src/bitbots_msgs/CMakeLists.txt
index 0f5ed631b..5b212e25b 100644
--- a/src/bitbots_msgs/CMakeLists.txt
+++ b/src/bitbots_msgs/CMakeLists.txt
@@ -42,6 +42,7 @@ rosidl_generate_interfaces(
   "msg/Strategy.msg"
   "msg/TeamData.msg"
   "msg/TTS.msg"
+  "msg/Workload.msg"
   "srv/AddAnimation.srv"
   "srv/ManualPenalize.srv"
   "srv/SetObjectPose.srv"

From 33366a84aaa70635ba0ae1e96a5b896028919447 Mon Sep 17 00:00:00 2001
From: Jan Gutsche <github@jaagut.de>
Date: Sun, 24 May 2026 19:52:17 +0200
Subject: [PATCH 2/8] Fix system monitoring (on Intel systems)

Enhance GPU monitoring by integrating NVIDIA and AMD GPU detection, updating the collection methods, and adding support for the nvidia-ml-py package.
---
 .vscode/settings.json                         |   3 +
 pixi.lock                                     |  18 +++
 pixi.toml                                     |   1 +
 .../system_monitor/system_monitor/gpu.py      | 123 ++++++++++++++++--
 .../system_monitor/system_monitor/monitor.py  |   2 +-
 5 files changed, 133 insertions(+), 14 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8ceee9d01..a1517b8b1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -71,6 +71,7 @@
         "numpify",
         "numpy",
         "nvidia",
+        "nvml",
         "odom",
         "odometry",
         "particlefilter",
@@ -85,6 +86,7 @@
         "pretrained",
         "proto",
         "protos",
+        "pyamdgpuinfo",
         "pyplot",
         "pywrapper",
         "Quaterniond",
@@ -127,6 +129,7 @@
         "unpenalized",
         "urdf",
         "vcstool",
+        "vram",
         "walkready",
         "wandb",
         "webots",
diff --git a/pixi.lock b/pixi.lock
index a636e7816..00b4af32a 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -530,6 +530,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda
@@ -1504,6 +1505,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda
@@ -2618,6 +2620,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda
@@ -3596,6 +3599,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/notify2-0.3.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/paramiko-4.0.0-pyhd8ed1ab_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.7-pyhcf101f3_0.conda
@@ -17410,6 +17414,20 @@ packages:
   - pkg:pypi/notify2?source=hash-mapping
   size: 11122
   timestamp: 1647371622387
+- conda: https://conda.anaconda.org/conda-forge/noarch/nvidia-ml-py-13.595.45-pyhd8ed1ab_1.conda
+  sha256: fb46556c423311638fbb26ed4b2a67cf598044919e42ab7571365a5b4ae3b663
+  md5: dc8587ae654e96031728802016e8258c
+  depends:
+  - python >=3.10
+  constrains:
+  - pynvml ~=13.0
+  - nvidia-ml ==9999999999
+  license: BSD-3-Clause
+  license_family: BSD
+  purls:
+  - pkg:pypi/nvidia-ml-py?source=hash-mapping
+  size: 48878
+  timestamp: 1775592349734
 - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.2-pyhc364b38_0.conda
   sha256: 3906abfb6511a3bb309e39b9b1b7bc38f50a723971de2395489fd1f379255890
   md5: 4c06a92e74452cfa53623a81592e8934
diff --git a/pixi.toml b/pixi.toml
index 24faec9bf..600c4ebbe 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -70,6 +70,7 @@ mujoco = ">=3.6.0,<4"
 mypy = ">=1.18.2,<2"
 ninja = ">=1.13.2,<2"
 numpy = ">=1.26.4,<2"
+nvidia-ml-py = ">=13.595.45,<14"
 opencv = ">=4.11.0,<5"
 paramiko =   ">=4.0.0,<5"
 pkg-config = ">=0.29.2,<0.30"
diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index f3abb5e40..c1d764543 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -1,19 +1,116 @@
-import pyamdgpuinfo
+from rclpy.node import Node
 
+# Detect available GPU backend (deferred until we have a node for logging)
+_gpu_backend = None
 
-def collect_all() -> tuple[float, int, int, float]:
-    """
-    use pyamdgpuinfo to get gpu metrics
 
-    :return: (load, vram_used, vram_total, temperature)
-    """
-    if pyamdgpuinfo.detect_gpus() == 0:
+def _detect_gpu_backend(node: Node):
+    """Auto-detect available GPU and return appropriate backend function."""
+    global _gpu_backend
+
+    # Try NVIDIA first (most common in robotics)
+    try:
+        import pynvml
+
+        # nvmlInit can fail if the NVIDIA driver/ NVML library is not installed
+        try:
+            pynvml.nvmlInit()
+        except Exception as e:
+            node.get_logger().debug(f"pynvml present but nvmlInit failed: {e}")
+        else:
+            try:
+                device_count = pynvml.nvmlDeviceGetCount()
+                if device_count > 0:
+                    _gpu_backend = _collect_nvidia
+                    node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
+                    return
+            except Exception as e:
+                node.get_logger().debug(f"NVIDIA GPU detection failed: {e}")
+            finally:
+                try:
+                    pynvml.nvmlShutdown()
+                except Exception:
+                    pass
+    except ImportError:
+        node.get_logger().debug("pynvml not available")
+
+    # Try AMD next
+    try:
+        import pyamdgpuinfo
+
+        if pyamdgpuinfo.detect_gpus() > 0:
+            _gpu_backend = _collect_amd
+            node.get_logger().info("Detected AMD GPU (pyamdgpuinfo)")
+            return
+    except ImportError:
+        node.get_logger().debug("pyamdgpuinfo not available")
+    except Exception as e:
+        node.get_logger().debug(f"AMD GPU detection failed: {e}")
+
+    # No GPU detected
+    _gpu_backend = _collect_none
+    node.get_logger().info("No GPU detected; falling back to null backend")
+
+
+def _collect_none(node: Node) -> tuple[float, int, int, float]:
+    """Null backend when no GPU is available."""
+    return (0.0, 0, 0, 0.0)
+
+
+def _collect_nvidia(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics from NVIDIA GPU using pynvml."""
+    try:
+        import pynvml
+
+        pynvml.nvmlInit()
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+            load = float(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            vram_used = mem_info.used
+            vram_total = mem_info.total
+            temperature = float(pynvml.nvmlDeviceGetTemperature(handle, 0))
+            return (load, vram_used, vram_total, temperature)
+        finally:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception:
+                pass
+    except Exception as e:
+        node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}")
+        return (0.0, 0, 0, 0.0)
+
+
+def _collect_amd(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics from AMD GPU using pyamdgpuinfo."""
+    try:
+        import pyamdgpuinfo
+
+        if pyamdgpuinfo.detect_gpus() == 0:
+            return (0.0, 0, 0, 0.0)
+
+        gpu = pyamdgpuinfo.get_gpu(0)
+        load = float(gpu.query_load())
+        vram_total = gpu.memory_info["vram_size"]
+        vram_used = gpu.query_vram_usage()
+        temperature = float(gpu.query_temperature())
+
+        return (load, vram_used, vram_total, temperature)
+    except Exception as e:
+        node.get_logger().error(f"Error collecting AMD GPU metrics: {e}")
         return (0.0, 0, 0, 0.0)
 
-    gpu = pyamdgpuinfo.get_gpu(0)
-    load = float(gpu.query_load())
-    vram_total = gpu.memory_info["vram_size"]
-    vram_used = gpu.query_vram_usage()
-    temperature = float(gpu.query_temperature())
 
-    return (load, vram_used, vram_total, temperature)
+def collect_all(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics using the auto-detected backend.
+
+    If `node` is provided the ROS node's logger will be used for messages.
+
+    node: ROS node for logging (required for backend detection and error logging)
+    :return: (load, vram_used, vram_total, temperature)
+    """
+    if _gpu_backend is None:
+        _detect_gpu_backend(node)
+    if _gpu_backend is None:
+        return (0.0, 0, 0, 0.0)
+    return _gpu_backend(node)
diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py
index ca859975a..2ed5bf1c3 100755
--- a/src/bitbots_misc/system_monitor/system_monitor/monitor.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/monitor.py
@@ -55,7 +55,7 @@ def main():
     while rclpy.ok():
         last_send_time = time.time()
         running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0)
-        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0.0, 0, 0, 0.0)
+        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all(node) if do_gpu else (0.0, 0, 0, 0.0)
         memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1)
         interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else []
 

From c6842cf0dfe377b5c17b78370b59b810b9b00dbd Mon Sep 17 00:00:00 2001
From: Jan Gutsche <github@jaagut.de>
Date: Sun, 24 May 2026 22:01:47 +0200
Subject: [PATCH 3/8] Fix system_monitor GPU on Jetson

---
 .../system_monitor/system_monitor/gpu.py      | 115 +++++++++++++++---
 1 file changed, 97 insertions(+), 18 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index c1d764543..bd3819053 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -1,8 +1,20 @@
+from pathlib import Path
+
 from rclpy.node import Node
 
 # Detect available GPU backend (deferred until we have a node for logging)
 _gpu_backend = None
 
+_JETSON_GPU_LOAD_PATHS = (
+    Path("/sys/devices/gpu.0/load"),
+    Path("/sys/kernel/debug/gpu.0/load"),
+)
+_JETSON_DEVICE_TREE_PATHS = (
+    Path("/proc/device-tree/compatible"),
+    Path("/proc/device-tree/model"),
+)
+_JETSON_GPU_TEMPERATURE_TYPES = ("gpu",)
+
 
 def _detect_gpu_backend(node: Node):
     """Auto-detect available GPU and return appropriate backend function."""
@@ -11,28 +23,28 @@ def _detect_gpu_backend(node: Node):
     # Try NVIDIA first (most common in robotics)
     try:
         import pynvml
-
-        # nvmlInit can fail if the NVIDIA driver/ NVML library is not installed
+    except ImportError:
+        node.get_logger().debug("pynvml not available")
+    else:
         try:
             pynvml.nvmlInit()
+            device_count = pynvml.nvmlDeviceGetCount()
+            if device_count > 0:
+                _gpu_backend = _collect_nvidia
+                node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
+                return
         except Exception as e:
-            node.get_logger().debug(f"pynvml present but nvmlInit failed: {e}")
-        else:
+            node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}")
+        finally:
             try:
-                device_count = pynvml.nvmlDeviceGetCount()
-                if device_count > 0:
-                    _gpu_backend = _collect_nvidia
-                    node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
-                    return
-            except Exception as e:
-                node.get_logger().debug(f"NVIDIA GPU detection failed: {e}")
-            finally:
-                try:
-                    pynvml.nvmlShutdown()
-                except Exception:
-                    pass
-    except ImportError:
-        node.get_logger().debug("pynvml not available")
+                pynvml.nvmlShutdown()
+            except Exception:
+                pass
+
+    if _detect_jetson_gpu():
+        _gpu_backend = _collect_jetson
+        node.get_logger().info("Detected NVIDIA Jetson GPU (sysfs)")
+        return
 
     # Try AMD next
     try:
@@ -81,6 +93,73 @@ def _collect_nvidia(node: Node) -> tuple[float, int, int, float]:
         return (0.0, 0, 0, 0.0)
 
 
+def _detect_jetson_gpu() -> bool:
+    """Detect NVIDIA Jetson GPUs, which often do not expose NVML devices."""
+    if any(_path_exists(path) for path in _JETSON_GPU_LOAD_PATHS):
+        return True
+
+    for path in _JETSON_DEVICE_TREE_PATHS:
+        try:
+            content = path.read_bytes().lower()
+        except OSError:
+            continue
+        if b"nvidia,tegra" in content or b"nvidia jetson" in content:
+            return True
+
+    return False
+
+
+def _path_exists(path: Path) -> bool:
+    try:
+        return path.exists()
+    except OSError:
+        return False
+
+
+def _read_float(path: Path) -> float | None:
+    try:
+        return float(path.read_text().strip())
+    except (OSError, ValueError):
+        return None
+
+
+def _collect_jetson_temperature() -> float:
+    for thermal_type_path in Path("/sys/devices/virtual/thermal").glob("thermal_zone*/type"):
+        try:
+            thermal_type = thermal_type_path.read_text().strip().lower()
+        except OSError:
+            continue
+
+        if not any(gpu_type in thermal_type for gpu_type in _JETSON_GPU_TEMPERATURE_TYPES):
+            continue
+
+        temperature = _read_float(thermal_type_path.with_name("temp"))
+        if temperature is None:
+            continue
+        return temperature / 1000.0 if temperature > 1000 else temperature
+
+    return 0.0
+
+
+def _collect_jetson(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics from NVIDIA Jetson sysfs files."""
+    try:
+        load = 0.0
+        for path in _JETSON_GPU_LOAD_PATHS:
+            raw_load = _read_float(path)
+            if raw_load is None:
+                continue
+            # Jetson reports GPU load in permille on current L4T kernels.
+            load = raw_load / 10.0
+            break
+
+        temperature = _collect_jetson_temperature()
+        return (load, 0, 0, temperature)
+    except Exception as e:
+        node.get_logger().error(f"Error collecting NVIDIA Jetson GPU metrics: {e}")
+        return (0.0, 0, 0, 0.0)
+
+
 def _collect_amd(node: Node) -> tuple[float, int, int, float]:
     """Collect GPU metrics from AMD GPU using pyamdgpuinfo."""
     try:

From 167b9385fe5bdc1fc1882658e8d7ceba9334398f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 25 May 2026 06:21:41 +0000
Subject: [PATCH 4/8] Fix system monitor GPU unit consistency and CPU EMA init

Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742

Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com>
---
 .../system_monitor/system_monitor/cpus.py     |  4 +-
 .../system_monitor/system_monitor/gpu.py      | 61 +++++++++++--------
 2 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
index 31868c2b9..e53d31074 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
@@ -8,7 +8,7 @@
 _prev_busy = defaultdict(int)
 
 # store last reported usage per cpu to smooth sampling noise
-_prev_usage = defaultdict(float)
+_prev_usage: dict[str, float | None] = defaultdict(lambda: None)
 
 # smoothing factor for exponential moving average (0..1)
 # higher = more responsive, lower = smoother
@@ -75,7 +75,7 @@ def _calculate_usage(cpu_num, total, busy) -> float:
 
     # smooth short-term sampling noise with exponential moving average
     prev = _prev_usage[cpu_num]
-    if prev == 0.0:
+    if prev is None:
         smoothed = float(round(raw_usage, 2))
     else:
         smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2))
diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index bd3819053..f1baffcb3 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -1,9 +1,13 @@
+import atexit
 from pathlib import Path
 
 from rclpy.node import Node
 
 # Detect available GPU backend (deferred until we have a node for logging)
 _gpu_backend = None
+_nvml_module = None
+_nvml_handle = None
+_nvml_shutdown_registered = False
 
 _JETSON_GPU_LOAD_PATHS = (
     Path("/sys/devices/gpu.0/load"),
@@ -30,16 +34,17 @@ def _detect_gpu_backend(node: Node):
             pynvml.nvmlInit()
             device_count = pynvml.nvmlDeviceGetCount()
             if device_count > 0:
+                global _nvml_module, _nvml_handle, _nvml_shutdown_registered
+                _nvml_module = pynvml
+                _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+                if not _nvml_shutdown_registered:
+                    atexit.register(_shutdown_nvml)
+                    _nvml_shutdown_registered = True
                 _gpu_backend = _collect_nvidia
                 node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
                 return
         except Exception as e:
             node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}")
-        finally:
-            try:
-                pynvml.nvmlShutdown()
-            except Exception:
-                pass
 
     if _detect_jetson_gpu():
         _gpu_backend = _collect_jetson
@@ -72,22 +77,15 @@ def _collect_none(node: Node) -> tuple[float, int, int, float]:
 def _collect_nvidia(node: Node) -> tuple[float, int, int, float]:
     """Collect GPU metrics from NVIDIA GPU using pynvml."""
     try:
-        import pynvml
+        if _nvml_module is None or _nvml_handle is None:
+            return (0.0, 0, 0, 0.0)
 
-        pynvml.nvmlInit()
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
-            load = float(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
-            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            vram_used = mem_info.used
-            vram_total = mem_info.total
-            temperature = float(pynvml.nvmlDeviceGetTemperature(handle, 0))
-            return (load, vram_used, vram_total, temperature)
-        finally:
-            try:
-                pynvml.nvmlShutdown()
-            except Exception:
-                pass
+        load = _fraction_from_percent(float(_nvml_module.nvmlDeviceGetUtilizationRates(_nvml_handle).gpu))
+        mem_info = _nvml_module.nvmlDeviceGetMemoryInfo(_nvml_handle)
+        vram_used = mem_info.used
+        vram_total = mem_info.total
+        temperature = float(_nvml_module.nvmlDeviceGetTemperature(_nvml_handle, 0))
+        return (load, vram_used, vram_total, temperature)
     except Exception as e:
         node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}")
         return (0.0, 0, 0, 0.0)
@@ -150,7 +148,7 @@ def _collect_jetson(node: Node) -> tuple[float, int, int, float]:
             if raw_load is None:
                 continue
             # Jetson reports GPU load in permille on current L4T kernels.
-            load = raw_load / 10.0
+            load = _fraction_from_per_mille(raw_load)
             break
 
         temperature = _collect_jetson_temperature()
@@ -169,7 +167,7 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]:
             return (0.0, 0, 0, 0.0)
 
         gpu = pyamdgpuinfo.get_gpu(0)
-        load = float(gpu.query_load())
+        load = _fraction_from_percent(float(gpu.query_load()))
         vram_total = gpu.memory_info["vram_size"]
         vram_used = gpu.query_vram_usage()
         temperature = float(gpu.query_temperature())
@@ -183,8 +181,6 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]:
 def collect_all(node: Node) -> tuple[float, int, int, float]:
     """Collect GPU metrics using the auto-detected backend.
 
-    If `node` is provided the ROS node's logger will be used for messages.
-
     node: ROS node for logging (required for backend detection and error logging)
     :return: (load, vram_used, vram_total, temperature)
     """
@@ -193,3 +189,20 @@ def collect_all(node: Node) -> tuple[float, int, int, float]:
     if _gpu_backend is None:
         return (0.0, 0, 0, 0.0)
     return _gpu_backend(node)
+
+
+def _fraction_from_percent(value: float) -> float:
+    return min(max(value / 100.0, 0.0), 1.0)
+
+
+def _fraction_from_per_mille(value: float) -> float:
+    return min(max(value / 1000.0, 0.0), 1.0)
+
+
+def _shutdown_nvml() -> None:
+    if _nvml_module is None:
+        return
+    try:
+        _nvml_module.nvmlShutdown()
+    except Exception:
+        pass

From a0fa5b3e24c1cb5983db1684976ac89a3478cd32 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 25 May 2026 06:22:58 +0000
Subject: [PATCH 5/8] Add docstrings for GPU normalization helpers

Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742

Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com>
---
 src/bitbots_misc/system_monitor/system_monitor/gpu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index f1baffcb3..18a8e0d84 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -192,14 +192,17 @@ def collect_all(node: Node) -> tuple[float, int, int, float]:
 
 
 def _fraction_from_percent(value: float) -> float:
+    """Convert percent [0..100] to clamped fraction [0.0..1.0]."""
     return min(max(value / 100.0, 0.0), 1.0)
 
 
 def _fraction_from_per_mille(value: float) -> float:
+    """Convert permille [0..1000] to clamped fraction [0.0..1.0]."""
     return min(max(value / 1000.0, 0.0), 1.0)
 
 
 def _shutdown_nvml() -> None:
+    """Release NVML resources at process shutdown."""
     if _nvml_module is None:
         return
     try:

From fbc7c96c7fe5a899d28fd547acbcdf1ba1ae2ee3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 25 May 2026 06:23:56 +0000
Subject: [PATCH 6/8] Polish GPU backend globals and docstring parameter format

Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742

Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com>
---
 src/bitbots_misc/system_monitor/system_monitor/gpu.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index 18a8e0d84..b90b330a9 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -22,7 +22,7 @@
 
 def _detect_gpu_backend(node: Node):
     """Auto-detect available GPU and return appropriate backend function."""
-    global _gpu_backend
+    global _gpu_backend, _nvml_module, _nvml_handle, _nvml_shutdown_registered
 
     # Try NVIDIA first (most common in robotics)
     try:
@@ -34,7 +34,6 @@ def _detect_gpu_backend(node: Node):
             pynvml.nvmlInit()
             device_count = pynvml.nvmlDeviceGetCount()
             if device_count > 0:
-                global _nvml_module, _nvml_handle, _nvml_shutdown_registered
                 _nvml_module = pynvml
                 _nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                 if not _nvml_shutdown_registered:
@@ -181,7 +180,7 @@ def _collect_amd(node: Node) -> tuple[float, int, int, float]:
 def collect_all(node: Node) -> tuple[float, int, int, float]:
     """Collect GPU metrics using the auto-detected backend.
 
-    node: ROS node for logging (required for backend detection and error logging)
+    :param node: ROS node for logging (required for backend detection and error logging)
     :return: (load, vram_used, vram_total, temperature)
     """
     if _gpu_backend is None:

From 66048af473ecdbefee28f117c3a46714a9117f1b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 25 May 2026 06:24:49 +0000
Subject: [PATCH 7/8] Use explicit dict lookup for CPU EMA previous usage

Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742

Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com>
---
 src/bitbots_misc/system_monitor/system_monitor/cpus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
index e53d31074..78a8b750e 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/cpus.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
@@ -8,7 +8,7 @@
 _prev_busy = defaultdict(int)
 
 # store last reported usage per cpu to smooth sampling noise
-_prev_usage: dict[str, float | None] = defaultdict(lambda: None)
+_prev_usage: dict[str, float] = {}
 
 # smoothing factor for exponential moving average (0..1)
 # higher = more responsive, lower = smoother
@@ -74,7 +74,7 @@ def _calculate_usage(cpu_num, total, busy) -> float:
         raw_usage = (diff_busy / diff_total) * 100.0
 
     # smooth short-term sampling noise with exponential moving average
-    prev = _prev_usage[cpu_num]
+    prev = _prev_usage.get(cpu_num)
     if prev is None:
         smoothed = float(round(raw_usage, 2))
     else:

From bf629a38de07b530990bd7539083729ba17fe2c8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 25 May 2026 06:25:43 +0000
Subject: [PATCH 8/8] Clarify defensive clamping in GPU helper docstrings

Agent-Logs-Url: https://github.com/bit-bots/bitbots_main/sessions/797bd202-b15d-4688-b2a5-48352421a742

Co-authored-by: jaagut <34797331+jaagut@users.noreply.github.com>
---
 src/bitbots_misc/system_monitor/system_monitor/gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
index b90b330a9..ed658e921 100644
--- a/src/bitbots_misc/system_monitor/system_monitor/gpu.py
+++ b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -191,12 +191,12 @@ def collect_all(node: Node) -> tuple[float, int, int, float]:
 
 
 def _fraction_from_percent(value: float) -> float:
-    """Convert percent [0..100] to clamped fraction [0.0..1.0]."""
+    """Convert percent to fraction and clamp to [0.0..1.0] defensively."""
     return min(max(value / 100.0, 0.0), 1.0)
 
 
 def _fraction_from_per_mille(value: float) -> float:
-    """Convert permille [0..1000] to clamped fraction [0.0..1.0]."""
+    """Convert permille to fraction and clamp to [0.0..1.0] defensively."""
     return min(max(value / 1000.0, 0.0), 1.0)