Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
"numpify",
"numpy",
"nvidia",
"nvml",
"odom",
"odometry",
"particlefilter",
Expand All @@ -85,6 +86,7 @@
"pretrained",
"proto",
"protos",
"pyamdgpuinfo",
"pyplot",
"pywrapper",
"Quaterniond",
Expand Down Expand Up @@ -127,6 +129,7 @@
"unpenalized",
"urdf",
"vcstool",
"vram",
"walkready",
"wandb",
"webots",
Expand Down
18 changes: 18 additions & 0 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ mujoco = ">=3.6.0,<4"
mypy = ">=1.18.2,<2"
ninja = ">=1.13.2,<2"
numpy = ">=1.26.4,<2"
nvidia-ml-py = ">=13.595.45,<14"
opencv = ">=4.11.0,<5"
paramiko = ">=4.0.0,<5"
pkg-config = ">=0.29.2,<0.30"
Expand Down
2 changes: 1 addition & 1 deletion src/bitbots_misc/system_monitor/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
system_monitor:
ros__parameters:
# How many times per second should the system be queried for stats
update_frequency: 10.0
update_frequency: 2.0

# These settings are quick_switches to completely disable certain parts of statistic collection
do_cpu: true
Expand Down
28 changes: 23 additions & 5 deletions src/bitbots_misc/system_monitor/system_monitor/cpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,15 @@
_prev_total = defaultdict(int)
_prev_busy = defaultdict(int)

# store last reported usage per cpu to smooth sampling noise
_prev_usage: dict[str, float] = {}

def collect_all():
# smoothing factor for exponential moving average (0..1)
# higher = more responsive, lower = smoother
_EMA_ALPHA = 0.5


def collect_all() -> tuple[int, list[CpuMsg], float]:
"""
parse /proc/stat and calculate total and busy time

Expand Down Expand Up @@ -50,7 +57,7 @@ def _get_cpu_stats():
return timings


def _calculate_usage(cpu_num, total, busy):
def _calculate_usage(cpu_num, total, busy) -> float:
"""
calculate usage percentage based on busy/total time
"""
Expand All @@ -60,7 +67,18 @@ def _calculate_usage(cpu_num, total, busy):
_prev_total[cpu_num] = total
_prev_busy[cpu_num] = busy

if diff_total == 0:
return 0
# compute raw usage and handle edge cases
if diff_total <= 0 or diff_busy <= 0:
raw_usage = 0.0
else:
return float(int(diff_busy / diff_total * 100))
raw_usage = (diff_busy / diff_total) * 100.0

# smooth short-term sampling noise with exponential moving average
prev = _prev_usage.get(cpu_num)
if prev is None:
smoothed = float(round(raw_usage, 2))
else:
smoothed = float(round((raw_usage * _EMA_ALPHA) + (prev * (1.0 - _EMA_ALPHA)), 2))

Comment on lines +76 to +82
_prev_usage[cpu_num] = smoothed
return smoothed
215 changes: 203 additions & 12 deletions src/bitbots_misc/system_monitor/system_monitor/gpu.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,210 @@
import pyamdgpuinfo
import atexit
from pathlib import Path

from rclpy.node import Node

# Detect available GPU backend (deferred until we have a node for logging)
# Collector function chosen by _detect_gpu_backend(); None until detection has run.
_gpu_backend = None
# The imported pynvml module and the handle of device index 0; both are set
# only when the NVML backend was selected.
_nvml_module = None
_nvml_handle = None
# True once _shutdown_nvml has been registered with atexit, so that it is
# registered at most once.
_nvml_shutdown_registered = False

# Files probed for the Jetson GPU load, in this order; the existence of any of
# them is also treated as evidence of a Jetson GPU.
_JETSON_GPU_LOAD_PATHS = (
Path("/sys/devices/gpu.0/load"),
Path("/sys/kernel/debug/gpu.0/load"),
)
# Device-tree files searched for the "nvidia,tegra" / "nvidia jetson" markers.
_JETSON_DEVICE_TREE_PATHS = (
Path("/proc/device-tree/compatible"),
Path("/proc/device-tree/model"),
)
# Substrings that mark a thermal zone type as belonging to the GPU.
_JETSON_GPU_TEMPERATURE_TYPES = ("gpu",)


def _detect_gpu_backend(node: Node):
    """Probe the available GPU backends and remember the matching collector.

    Backends are tried in this order: NVIDIA via NVML (``pynvml``), NVIDIA
    Jetson via sysfs, AMD via ``pyamdgpuinfo``. The collector of the first
    backend that reports a device is stored in the module-level
    ``_gpu_backend``. If none matches, ``_collect_none`` is stored instead, so
    ``_gpu_backend`` is never ``None`` after this function returns.

    :param node: ROS node whose logger is used to report the detection result
    """
    global _gpu_backend, _nvml_module, _nvml_handle, _nvml_shutdown_registered

    logger = node.get_logger()

    # Try NVIDIA (NVML) first
    try:
        import pynvml
    except ImportError:
        logger.debug("pynvml not available")
    else:
        nvml_initialized = False
        try:
            pynvml.nvmlInit()
            nvml_initialized = True
            device_count = pynvml.nvmlDeviceGetCount()
            if device_count > 0:
                # Resolve the handle before touching module state, so a failure
                # here cannot leave a half-configured NVML backend behind.
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                _nvml_module = pynvml
                _nvml_handle = handle
                if not _nvml_shutdown_registered:
                    atexit.register(_shutdown_nvml)
                    _nvml_shutdown_registered = True
                _gpu_backend = _collect_nvidia
                logger.info(f"Detected NVIDIA GPU (pynvml): {device_count} device(s)")
                return
        except Exception as e:
            logger.debug(f"NVIDIA GPU detection failed (pynvml): {e}")

        # NVML was initialized but will not be used (no device, or a later call
        # failed): release it again instead of keeping it open for the whole
        # process lifetime.
        if nvml_initialized:
            _nvml_module = None
            _nvml_handle = None
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.debug(f"NVML shutdown after failed detection raised: {e}")

    if _detect_jetson_gpu():
        _gpu_backend = _collect_jetson
        logger.info("Detected NVIDIA Jetson GPU (sysfs)")
        return

    # Try AMD next
    try:
        import pyamdgpuinfo

        if pyamdgpuinfo.detect_gpus() > 0:
            _gpu_backend = _collect_amd
            logger.info("Detected AMD GPU (pyamdgpuinfo)")
            return
    except ImportError:
        logger.debug("pyamdgpuinfo not available")
    except Exception as e:
        logger.debug(f"AMD GPU detection failed: {e}")

    # No GPU detected
    _gpu_backend = _collect_none
    logger.info("No GPU detected; falling back to null backend")


def _collect_none(node: Node) -> tuple[float, int, int, float]:
    """Report all-zero metrics; used as the backend when no GPU was detected.

    :param node: unused; accepted so that every backend has the same signature
    :return: (load, vram_used, vram_total, temperature), all zero
    """
    load, vram_used, vram_total, temperature = 0.0, 0, 0, 0.0
    return (load, vram_used, vram_total, temperature)


def _collect_nvidia(node: Node) -> tuple[float, int, int, float]:
    """Collect metrics of the NVIDIA GPU selected during detection, via NVML.

    :param node: ROS node whose logger is used to report query failures
    :return: (load, vram_used, vram_total, temperature). ``load`` is a fraction
        in [0.0, 1.0]; the VRAM values are the ``used`` / ``total`` fields of
        NVML's memory info. All values are zero if NVML has not been set up or
        if any query fails.
    """
    nvml = _nvml_module
    handle = _nvml_handle
    if nvml is None or handle is None:
        return (0.0, 0, 0, 0.0)

    try:
        utilization = nvml.nvmlDeviceGetUtilizationRates(handle)
        load = _fraction_from_percent(float(utilization.gpu))
        mem_info = nvml.nvmlDeviceGetMemoryInfo(handle)
        # Name the sensor explicitly instead of passing a bare 0.
        temperature = float(nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU))
        return (load, mem_info.used, mem_info.total, temperature)
    except Exception as e:
        node.get_logger().error(f"Error collecting NVIDIA GPU metrics: {e}")
        return (0.0, 0, 0, 0.0)


def _detect_jetson_gpu() -> bool:
    """Tell whether this machine looks like an NVIDIA Jetson.

    A Jetson is assumed when one of the GPU load files exists, or when one of
    the device-tree files contains a Tegra / Jetson marker (compared
    case-insensitively). Device-tree files that cannot be read are skipped.

    :return: True if a Jetson GPU was recognized, False otherwise
    """
    for load_path in _JETSON_GPU_LOAD_PATHS:
        if _path_exists(load_path):
            return True

    markers = (b"nvidia,tegra", b"nvidia jetson")
    for tree_path in _JETSON_DEVICE_TREE_PATHS:
        try:
            identity = tree_path.read_bytes().lower()
        except OSError:
            continue
        if any(marker in identity for marker in markers):
            return True

    return False


def _path_exists(path: Path) -> bool:
    """Check whether ``path`` exists, treating any OSError as "does not exist".

    :param path: filesystem path to probe
    :return: True if the path exists, False if it does not or cannot be checked
    """
    try:
        found = path.exists()
    except OSError:
        found = False
    return found

def collect_all():
"""
use pyamdgpuinfo to get gpu metrics

def _read_float(path: Path) -> float | None:
    """Read a file and parse its whitespace-stripped content as a float.

    :param path: file to read
    :return: the parsed value, or None if the file cannot be read or its
        content is not a valid float
    """
    try:
        parsed = float(path.read_text().strip())
    except (OSError, ValueError):
        parsed = None
    return parsed


def _collect_jetson_temperature() -> float:
    """Read the GPU temperature from the first matching thermal zone.

    Every ``thermal_zone*/type`` file below ``/sys/devices/virtual/thermal`` is
    inspected; a zone matches when its lowercased type contains one of
    ``_JETSON_GPU_TEMPERATURE_TYPES``. Zones whose type or temperature cannot
    be read are skipped.

    :return: temperature in degrees Celsius, or 0.0 if no matching zone could
        be read
    """
    thermal_root = Path("/sys/devices/virtual/thermal")
    # sorted() keeps the chosen zone stable when several zones match; glob()
    # itself yields entries in arbitrary directory order.
    for type_path in sorted(thermal_root.glob("thermal_zone*/type")):
        try:
            zone_type = type_path.read_text().strip().lower()
        except OSError:
            continue

        if not any(gpu_type in zone_type for gpu_type in _JETSON_GPU_TEMPERATURE_TYPES):
            continue

        millidegrees = _read_float(type_path.with_name("temp"))
        if millidegrees is None:
            continue
        # The thermal sysfs "temp" attribute is specified in millidegrees
        # Celsius, so always scale it. Guessing the unit from the magnitude
        # misreports readings at or below 1 degree C and negative readings.
        return millidegrees / 1000.0

    return 0.0


def _collect_jetson(node: Node) -> tuple[float, int, int, float]:
    """Gather GPU metrics on an NVIDIA Jetson from sysfs files.

    The load comes from the first load file that yields a number; if none
    does, the load is 0.0. This backend reports both VRAM values as 0.

    :param node: ROS node whose logger is used to report failures
    :return: (load, vram_used, vram_total, temperature); all zero on error
    """
    try:
        # Jetson reports GPU load in permille on current L4T kernels.
        # map() is lazy, so reading stops at the first usable file.
        first_reading = next(
            (reading for reading in map(_read_float, _JETSON_GPU_LOAD_PATHS) if reading is not None),
            None,
        )
        load = 0.0 if first_reading is None else _fraction_from_per_mille(first_reading)
        return (load, 0, 0, _collect_jetson_temperature())
    except Exception as e:
        node.get_logger().error(f"Error collecting NVIDIA Jetson GPU metrics: {e}")
        return (0.0, 0, 0, 0.0)


def _collect_amd(node: Node) -> tuple[float, int, int, float]:
    """Collect metrics of the first AMD GPU using pyamdgpuinfo.

    :param node: ROS node whose logger is used to report query failures
    :return: (load, vram_used, vram_total, temperature). ``load`` is a fraction
        in [0.0, 1.0]. All values are zero if no AMD GPU is present or if any
        query fails.
    """
    try:
        import pyamdgpuinfo

        if pyamdgpuinfo.detect_gpus() == 0:
            return (0.0, 0, 0, 0.0)

        gpu = pyamdgpuinfo.get_gpu(0)
        # pyamdgpuinfo documents query_load() as a fraction between 0 and 1
        # already, so it is only clamped here. Dividing by 100 as for a percent
        # value would under-report the load by a factor of 100.
        load = min(max(float(gpu.query_load()), 0.0), 1.0)
        vram_total = gpu.memory_info["vram_size"]
        vram_used = gpu.query_vram_usage()
        temperature = float(gpu.query_temperature())

        return (load, vram_used, vram_total, temperature)
    except Exception as e:
        node.get_logger().error(f"Error collecting AMD GPU metrics: {e}")
        return (0.0, 0, 0, 0.0)


def collect_all(node: Node) -> tuple[float, int, int, float]:
    """Collect GPU metrics using the auto-detected backend.

    The backend is detected on the first call and reused afterwards. No
    backend-specific library is touched here, so the function also works on
    machines where only one of the supported GPU libraries is installed.

    :param node: ROS node for logging (required for backend detection and error logging)
    :return: (load, vram_used, vram_total, temperature)
    """
    if _gpu_backend is None:
        _detect_gpu_backend(node)

    backend = _gpu_backend
    if backend is None:
        # Defensive: detection is expected to always select a backend.
        return (0.0, 0, 0, 0.0)
    return backend(node)


def _fraction_from_percent(value: float) -> float:
    """Convert percent to fraction and clamp to [0.0..1.0] defensively.

    :param value: a percentage, nominally in [0, 100]
    :return: ``value / 100`` limited to [0.0, 1.0]; NaN is mapped to 0.0
    """
    import math

    fraction = value / 100.0
    # NaN compares false against everything, so min()/max() alone would pass
    # it through unclamped.
    if math.isnan(fraction):
        return 0.0
    return min(max(fraction, 0.0), 1.0)


def _fraction_from_per_mille(value: float) -> float:
    """Convert permille to fraction and clamp to [0.0..1.0] defensively.

    :param value: a permille value, nominally in [0, 1000]
    :return: ``value / 1000`` limited to [0.0, 1.0]; NaN is mapped to 0.0
    """
    import math

    fraction = value / 1000.0
    # NaN compares false against everything, so min()/max() alone would pass
    # it through unclamped.
    if math.isnan(fraction):
        return 0.0
    return min(max(fraction, 0.0), 1.0)

gpu = pyamdgpuinfo.get_gpu(0)
load = gpu.query_load()
vram_total = gpu.memory_info["vram_size"]
vram_used = gpu.query_vram_usage()
temperature = gpu.query_temperature()

return (load, vram_used, vram_total, temperature)
def _shutdown_nvml() -> None:
    """Release NVML at process exit; does nothing if NVML was never set up."""
    nvml = _nvml_module
    if nvml is not None:
        try:
            nvml.nvmlShutdown()
        except Exception:
            # Best effort only: a failing shutdown is deliberately ignored
            # rather than reported or raised.
            pass
2 changes: 1 addition & 1 deletion src/bitbots_misc/system_monitor/system_monitor/memory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import psutil


def collect_all():
def collect_all() -> tuple[int, int, int]:
"""
:return: (memory_available, memory_used, memory_total)
"""
Expand Down
6 changes: 3 additions & 3 deletions src/bitbots_misc/system_monitor/system_monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def main():

while rclpy.ok():
last_send_time = time.time()
running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0)
gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all() if do_gpu else (0, 0, 0, 0)
running_processes, cpu_usages, overall_usage_percentage = cpus.collect_all() if do_cpu else (-1, [], 0.0)
gpu_load, gpu_vram_used, gpu_vram_total, gpu_temperature = gpu.collect_all(node) if do_gpu else (0.0, 0, 0, 0.0)
memory_available, memory_used, memory_total = memory.collect_all() if do_memory else (-1, -1, -1)
interfaces = network_interfaces.collect_all(node.get_clock()) if do_network else []

Expand Down Expand Up @@ -116,6 +116,6 @@ def main():
diag_array.header.stamp = node.get_clock().now().to_msg()
diagnostic_pub.publish(diag_array)

# sleep to have correct rate. we dont use ROS time since we are interested in system things
# sleep to have correct rate. we don't use ROS time since we are interested in system things
dt = time.time() - last_send_time
time.sleep(max(0, (1 / rate) - dt))
Loading
Loading