bit-bots · jaagut · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 25, 2026
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -71,6 +71,7 @@
         "numpify",
         "numpy",
         "nvidia",
+        "nvml",
         "odom",
         "odometry",
         "particlefilter",
@@ -85,6 +86,7 @@
         "pretrained",
         "proto",
         "protos",
+        "pyamdgpuinfo",
         "pyplot",
         "pywrapper",
         "Quaterniond",
@@ -127,6 +129,7 @@
         "unpenalized",
         "urdf",
         "vcstool",
+        "vram",
         "walkready",
         "wandb",
         "webots",

diff --git a/pixi.lock b/pixi.lock
diff --git a/pixi.toml b/pixi.toml
@@ -70,6 +70,7 @@ mujoco = ">=3.6.0,<4"
 mypy = ">=1.18.2,<2"
 ninja = ">=1.13.2,<2"
 numpy = ">=1.26.4,<2"
+nvidia-ml-py = ">=13.595.45,<14"
 opencv = ">=4.11.0,<5"
 paramiko =   ">=4.0.0,<5"
 pkg-config = ">=0.29.2,<0.30"

diff --git a/src/bitbots_misc/system_monitor/config/config.yaml b/src/bitbots_misc/system_monitor/config/config.yaml
@@ -1,7 +1,7 @@
 system_monitor:
   ros__parameters:
     # How many times per second should the system be queried for stats
-    update_frequency: 10.0
+    update_frequency: 2.0
 
     # These settings are quick_switches to completely disable certain parts of statistic collection
     do_cpu: true

diff --git a/src/bitbots_misc/system_monitor/system_monitor/cpus.py b/src/bitbots_misc/system_monitor/system_monitor/cpus.py
@@ -7,8 +7,15 @@
 _prev_total = defaultdict(int)
 _prev_busy = defaultdict(int)
 
+# Last smoothed usage value computed for each cpu, keyed by the cpu identifier
+# passed to _calculate_usage; it seeds the next moving-average step.
+_prev_usage: dict[str, float] = {}
 
-def collect_all():
+# Smoothing factor for the exponential moving average (0..1): the weight given
+# to the newest raw sample; the previous smoothed value gets (1 - alpha).
+# higher = more responsive, lower = smoother
+_EMA_ALPHA = 0.5
+
+
+def collect_all() -> tuple[int, list[CpuMsg], float]:
     """
     parse /proc/stat and calculate total and busy time
 
@@ -50,7 +57,7 @@ def _get_cpu_stats():
     return timings
 
 
-def _calculate_usage(cpu_num, total, busy):
+def _calculate_usage(cpu_num, total, busy) -> float:
     """
     calculate usage percentage based on busy/total time
     """
@@ -60,7 +67,18 @@ def _calculate_usage(cpu_num, total, busy):
     _prev_total[cpu_num] = total
     _prev_busy[cpu_num] = busy
 
-    if diff_total == 0:
-        return 0
+    # compute raw usage and handle edge cases
+    if diff_total <= 0 or diff_busy <= 0:
+        raw_usage = 0.0
     else:
-        return float(int(diff_busy / diff_total * 100))
+        raw_usage = (diff_busy / diff_total) * 100.0
+
+    # Smooth short-term sampling noise with an exponential moving average.
+    # The running average is stored unrounded: feeding the rounded value back
+    # in makes the filter stall one rounding step short of its target (an idle
+    # core can stay at 0.01 and a saturated one at 99.99 indefinitely).
+    prev = _prev_usage.get(cpu_num)
+    if prev is None:
+        smoothed = raw_usage
+    else:
+        smoothed = (raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA))
+
+    _prev_usage[cpu_num] = smoothed
+    # Round only the value that is reported to the caller.
+    return float(round(smoothed, 2))
diff --git a/src/bitbots_misc/system_monitor/system_monitor/gpu.py b/src/bitbots_misc/system_monitor/system_monitor/gpu.py
@@ -1,19 +1,210 @@
-import pyamdgpuinfo
+import atexit
+from pathlib import Path
 
+from rclpy.node import Node
+
+# Detect available GPU backend (deferred until we have a node for logging)
+# Collector function selected by _detect_gpu_backend; None until detection ran.
+_gpu_backend = None
+# pynvml module and the handle of device index 0, stored once NVML detection succeeds.
+_nvml_module = None
+_nvml_handle = None
+# Ensures the NVML shutdown hook is registered with atexit only once.
+_nvml_shutdown_registered = False
+
+# Files probed for the Jetson GPU load; they are tried in this order and their
+# presence alone also counts as a Jetson detection.
+_JETSON_GPU_LOAD_PATHS = (
+    Path("/sys/devices/gpu.0/load"),
+    Path("/sys/kernel/debug/gpu.0/load"),
+)
+# Device-tree files searched for the "nvidia,tegra" / "nvidia jetson" markers.
+_JETSON_DEVICE_TREE_PATHS = (
+    Path("/proc/device-tree/compatible"),
+    Path("/proc/device-tree/model"),
+)
+# Substrings that mark a thermal zone type as belonging to the GPU
+# (compared against the lower-cased content of the zone's "type" file).
+_JETSON_GPU_TEMPERATURE_TYPES = ("gpu",)
+
+
+def _detect_gpu_backend(node: Node):
+    """Auto-detect the available GPU and store the matching collector in ``_gpu_backend``.
+
+    Backends are probed in this order: NVIDIA through NVML (pynvml), NVIDIA
+    Jetson through sysfs, AMD through pyamdgpuinfo. When none of them matches,
+    the null backend is selected, so ``_gpu_backend`` is always set on return.
+    Nothing is returned; the result lives in the module-level state.
+
+    :param node: ROS node whose logger reports the detection outcome
+    """
+    global _gpu_backend, _nvml_module, _nvml_handle, _nvml_shutdown_registered
+
+    # Try NVIDIA first (most common in robotics)
+    try:
+        import pynvml
+    except ImportError:
+        node.get_logger().debug("pynvml not available")
+    else:
+        nvml_initialized = False
+        try:
+            pynvml.nvmlInit()
+            nvml_initialized = True
+            device_count = pynvml.nvmlDeviceGetCount()
+            if device_count > 0:
+                # Fetch the handle before touching module state so a failure
+                # here cannot leave a module stored without a handle.
+                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+                _nvml_module = pynvml
+                _nvml_handle = handle
+                if not _nvml_shutdown_registered:
+                    atexit.register(_shutdown_nvml)
+                    _nvml_shutdown_registered = True
+                _gpu_backend = _collect_nvidia
+                node.get_logger().info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
+                return
+        except Exception as e:
+            node.get_logger().debug(f"NVIDIA GPU detection failed (pynvml): {e}")
+
+        # NVML was initialised but is not going to be used (no device, or a
+        # later call failed): release it again instead of leaking the session.
+        if nvml_initialized:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception as e:
+                node.get_logger().debug(f"NVML shutdown after unsuccessful detection failed: {e}")
+
+    if _detect_jetson_gpu():
+        _gpu_backend = _collect_jetson
+        node.get_logger().info("Detected NVIDIA Jetson GPU (sysfs)")
+        return
+
+    # Try AMD next
+    try:
+        import pyamdgpuinfo
+
+        if pyamdgpuinfo.detect_gpus() > 0:
+            _gpu_backend = _collect_amd
+            node.get_logger().info("Detected AMD GPU (pyamdgpuinfo)")
+            return
+    except ImportError:
+        node.get_logger().debug("pyamdgpuinfo not available")
+    except Exception as e:
+        node.get_logger().debug(f"AMD GPU detection failed: {e}")
+
+    # No GPU detected
+    _gpu_backend = _collect_none
+    node.get_logger().info("No GPU detected; falling back to null backend")
+
+
+def _collect_none(node: Node) -> tuple[float, int, int, float]:
+    """Stand-in collector used when no supported GPU was found.
+
+    :param node: unused; accepted so every backend can be called the same way
+    :return: (load, vram_used, vram_total, temperature), all zero
+    """
+    load, vram_used, vram_total, temperature = 0.0, 0, 0, 0.0
+    return (load, vram_used, vram_total, temperature)
+
+
+def _collect_nvidia(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics from NVIDIA GPU using pynvml.
+
+    :param node: ROS node used to log collection errors
+    :return: (load, vram_used, vram_total, temperature); load is a fraction in
+        [0.0..1.0], the VRAM values are taken as-is from NVML's memory info, and
+        all four values are zero when NVML is not set up or a query fails
+    """
+    try:
+        if _nvml_module is None or _nvml_handle is None:
+            return (0.0, 0, 0, 0.0)
+
+        load = _fraction_from_percent(float(_nvml_module.nvmlDeviceGetUtilizationRates(_nvml_handle).gpu))
+        mem_info = _nvml_module.nvmlDeviceGetMemoryInfo(_nvml_handle)
+        vram_used = mem_info.used
+        vram_total = mem_info.total
+        # Name the sensor explicitly instead of passing the bare value 0.
+        temperature = float(
+            _nvml_module.nvmlDeviceGetTemperature(_nvml_handle, _nvml_module.NVML_TEMPERATURE_GPU)
+        )
+        return (load, vram_used, vram_total, temperature)
+    except Exception as e:
+        node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}")
+        return (0.0, 0, 0, 0.0)
+
+
+def _detect_jetson_gpu() -> bool:
+    """Tell whether this machine looks like an NVIDIA Jetson.
+
+    Jetson GPUs often do not show up as NVML devices, so the check relies on
+    the GPU load files and, failing that, on the device-tree identification.
+
+    :return: True when a load file exists or a device-tree file names a Jetson
+    """
+    for load_path in _JETSON_GPU_LOAD_PATHS:
+        if _path_exists(load_path):
+            return True
+
+    markers = (b"nvidia,tegra", b"nvidia jetson")
+    for tree_path in _JETSON_DEVICE_TREE_PATHS:
+        try:
+            raw = tree_path.read_bytes()
+        except OSError:
+            # Unreadable or missing file: try the next candidate.
+            continue
+        lowered = raw.lower()
+        if any(marker in lowered for marker in markers):
+            return True
+
+    return False
+
+
+def _path_exists(path: Path) -> bool:
+    """Check for *path*, treating an ``OSError`` during the check as "absent"."""
+    try:
+        found = path.exists()
+    except OSError:
+        found = False
+    return found
 
-def collect_all():
-    """
-    use pyamdgpuinfo to get gpu metrics
 
+def _read_float(path: Path) -> float | None:
+    """Parse the stripped text content of *path* as a float.
+
+    :return: the parsed number, or None when the file cannot be read or its
+        content is not a valid float
+    """
+    try:
+        raw_text = path.read_text()
+        parsed = float(raw_text.strip())
+    except (OSError, ValueError):
+        return None
+    return parsed
+
+
+def _collect_jetson_temperature() -> float:
+    """Return the GPU temperature from the first matching thermal zone.
+
+    Scans ``thermal_zone*/type`` below ``/sys/devices/virtual/thermal`` for a zone
+    whose type contains one of ``_JETSON_GPU_TEMPERATURE_TYPES`` and reads the
+    ``temp`` file next to it.
+
+    :return: temperature in degrees Celsius, or 0.0 when no zone could be read
+    """
+    for thermal_type_path in Path("/sys/devices/virtual/thermal").glob("thermal_zone*/type"):
+        try:
+            thermal_type = thermal_type_path.read_text().strip().lower()
+        except OSError:
+            continue
+
+        if not any(gpu_type in thermal_type for gpu_type in _JETSON_GPU_TEMPERATURE_TYPES):
+            continue
+
+        temperature = _read_float(thermal_type_path.with_name("temp"))
+        if temperature is None:
+            continue
+        # The kernel's thermal sysfs interface reports millidegree Celsius, so
+        # convert unconditionally; guessing the unit from the magnitude
+        # misreports readings at or below 1 degree C, including negative ones.
+        return temperature / 1000.0
+
+    return 0.0
+
+
+def _collect_jetson(node: Node) -> tuple[float, int, int, float]:
+    """Read GPU load and temperature from the NVIDIA Jetson sysfs files.
+
+    :param node: ROS node used to log collection errors
+    :return: (load, vram_used, vram_total, temperature); both VRAM values are
+        always 0 for this backend, and everything is zero when collection fails
+    """
+    try:
+        # Use the first load file that yields a number; the lazy generator stops
+        # reading further candidates as soon as one succeeds.
+        readings = (_read_float(load_path) for load_path in _JETSON_GPU_LOAD_PATHS)
+        first_reading = next((value for value in readings if value is not None), None)
+        # Jetson reports GPU load in permille on current L4T kernels.
+        load = 0.0 if first_reading is None else _fraction_from_per_mille(first_reading)
+        return (load, 0, 0, _collect_jetson_temperature())
+    except Exception as error:
+        node.get_logger().error(f"Error collecting NVIDIA Jetson GPU metrics: {error}")
+        return (0.0, 0, 0, 0.0)
+
+
+def _collect_amd(node: Node) -> tuple[float, int, int, float]:
+    """Query the first AMD GPU through pyamdgpuinfo.
+
+    :param node: ROS node used to log collection errors
+    :return: (load, vram_used, vram_total, temperature); all zero when no AMD
+        GPU is detected or a query fails
+    """
+    no_data = (0.0, 0, 0, 0.0)
+    try:
+        import pyamdgpuinfo
+
+        if pyamdgpuinfo.detect_gpus() != 0:
+            device = pyamdgpuinfo.get_gpu(0)
+            # NOTE(review): confirm query_load() reports percent; if it already
+            # returns a 0..1 fraction, this conversion divides by 100 twice.
+            load_fraction = _fraction_from_percent(float(device.query_load()))
+            total_vram = device.memory_info["vram_size"]
+            used_vram = device.query_vram_usage()
+            return (load_fraction, used_vram, total_vram, float(device.query_temperature()))
+        return no_data
+    except Exception as error:
+        node.get_logger().error(f"Error collecting AMD GPU metrics: {error}")
+        return no_data
+
+
+def collect_all(node: Node) -> tuple[float, int, int, float]:
+    """Collect GPU metrics using the auto-detected backend.
+
+    The backend is detected on the first call and reused on later calls.
+
+    :param node: ROS node for logging (required for backend detection and error logging)
     :return: (load, vram_used, vram_total, temperature)
     """
-    if pyamdgpuinfo.detect_gpus() == 0:
-        return (0, 0, 0, 0)
+    # Detection is deferred to the first call because it logs through the node.
+    if _gpu_backend is None:
+        _detect_gpu_backend(node)
+    # Defensive fallback in case detection left no backend selected.
+    if _gpu_backend is None:
+        return (0.0, 0, 0, 0.0)
+    return _gpu_backend(node)
+
+
+def _fraction_from_percent(value: float) -> float:
+    """Scale a percentage to a fraction, limited to the range [0.0..1.0]."""
+    fraction = value / 100.0
+    if fraction < 0.0:
+        return 0.0
+    if fraction > 1.0:
+        return 1.0
+    return fraction
+
+
+def _fraction_from_per_mille(value: float) -> float:
+    """Scale a permille value to a fraction, limited to the range [0.0..1.0]."""
+    fraction = value / 1000.0
+    if fraction < 0.0:
+        return 0.0
+    if fraction > 1.0:
+        return 1.0
+    return fraction
 
-    gpu = pyamdgpuinfo.get_gpu(0)
-    load = gpu.query_load()
-    vram_total = gpu.memory_info["vram_size"]
-    vram_used = gpu.query_vram_usage()
-    temperature = gpu.query_temperature()
 
-    return (load, vram_used, vram_total, temperature)
+def _shutdown_nvml() -> None:
+    """Shut NVML down when the process exits; registered through ``atexit``.
+
+    Does nothing when no NVML module was stored during detection.
+    """
+    nvml = _nvml_module
+    if nvml is not None:
+        try:
+            nvml.nvmlShutdown()
+        except Exception:
+            # Best effort: a failure this late is deliberately ignored.
+            pass
diff --git a/src/bitbots_misc/system_monitor/system_monitor/memory.py b/src/bitbots_misc/system_monitor/system_monitor/memory.py
@@ -1,7 +1,7 @@
 import psutil
 
 
-def collect_all():
+def collect_all() -> tuple[int, int, int]:
     """
     :return: (memory_available, memory_used, memory_total)
     """

diff --git a/src/bitbots_misc/system_monitor/system_monitor/monitor.py b/src/bitbots_misc/system_monitor/system_monitor/monitor.py
@@ -54,8 +54,8 @@ def main():
 
     while rclpy.ok():
         last_send_time = time.time()
-        running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0)
-        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0)
+        running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0)
+        gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all(node) if do_gpu else (0.0, 0, 0, 0.0)
         memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1)
         interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else []
 
@@ -116,6 +116,6 @@ def main():
         diag_array.header.stamp = node.get_clock().now().to_msg()
         diagnostic_pub.publish(diag_array)
 
-        # sleep to have correct rate. we dont use ROS time since we are interested in system things
+        # sleep to have correct rate. we don't use ROS time since we are interested in system things
         dt = time.time() - last_send_time
         time.sleep(max(0, (1 / rate) - dt))