From f73efd00f3c67b851a3e8c243690eeaf9923ffc4 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 18 Jun 2026 22:24:41 +0200 Subject: [PATCH 1/4] fix: update Q-scripts and tests to AntiPendulumConfig/QLearningAgent API after PR #12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - play_q, train_q, analyse_q, use_q_ide: wrap env kwargs in AntiPendulumConfig; replace DEFAULT_DISCRETE reference with discrete="energy" string preset; replace QLearningAgent(trained=...) with filename/use_file API - experiments/*.yaml: negate crane_velocity weights (0.1→-0.1) to preserve penalty semantics under new rc.crane_velocity * x_dot^2 sign convention - pyproject.toml: remove pyment from runtime deps (docstring tool, not runtime) - experiment_config.py: update RewardConfig.crane_velocity docstring for new sign - test_ppo: remove skip markers, update env_kwargs to conf=AntiPendulumConfig - test_simple_q_env: remove skip markers (API already updated in PR #12) - test_environment: delete test_t_min_crane_reward_term (term removed from reward) - test_algorithm: update skip reason to name the obs-index mismatch blocking them --- experiments/hybrid_cv01.yaml | 7 +++-- experiments/hybrid_t_min.yaml | 2 +- pyproject.toml | 1 - scripts/analyse_q.py | 6 ++-- scripts/play_q.py | 12 +++++--- scripts/train_q.py | 20 +++++++----- scripts/use_q_ide.py | 28 +++++++++-------- src/crane_controller/experiment_config.py | 3 +- tests/test_algorithm.py | 4 +-- tests/test_environment.py | 17 ----------- tests/test_ppo.py | 37 ++++++++++------------- tests/test_simple_q_env.py | 6 ++-- 12 files changed, 65 insertions(+), 78 deletions(-) diff --git a/experiments/hybrid_cv01.yaml b/experiments/hybrid_cv01.yaml index 5487554..6b73a77 100644 --- a/experiments/hybrid_cv01.yaml +++ b/experiments/hybrid_cv01.yaml @@ -1,6 +1,7 @@ # Validated hybrid reward: energy + crane_velocity penalty + position return. -# energy=1.0 (KE+PE physics signal), crane_velocity=0.1 (-x_dot^2 penalty to -# damp trolley oscillation), position=0.02 (-|x| to encourage return to origin). +# energy=1.0 (KE+PE physics signal), crane_velocity=-0.1 (x_dot^2 penalty to +# damp trolley oscillation; negative because rc.crane_velocity * x_dot^2 is used), +# position=0.1 (-|x| to encourage return to origin). # terminal_penalty=-5.0 provides a one-shot crash signal propagated ~100 steps # back by gamma=0.99. Trained with randomize_start=True on speeds +-[0.1, 1.0]. # Seeds 2718, 3141, 31415 achieve 6/6 OOD generalisation at start_speed=7.0. @@ -13,7 +14,7 @@ reward: terminal_penalty: -5.0 angle: 0.0 angular_velocity: 0.0 - crane_velocity: 0.1 + crane_velocity: -0.1 crane_acceleration: 0.0 angular_acceleration: 0.0 t_min_crane: 0.0 diff --git a/experiments/hybrid_t_min.yaml b/experiments/hybrid_t_min.yaml index 948c0e9..f39de1e 100644 --- a/experiments/hybrid_t_min.yaml +++ b/experiments/hybrid_t_min.yaml @@ -10,7 +10,7 @@ reward: terminal_penalty: -5.0 angle: 0.0 angular_velocity: 0.0 - crane_velocity: 0.1 + crane_velocity: -0.1 crane_acceleration: 0.0 angular_acceleration: 0.0 t_min_crane: 0.01 diff --git a/pyproject.toml b/pyproject.toml index e5e293a..224cf3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,6 @@ dependencies = [ "matplotlib>=3.10", "seaborn>=0.13.2", "tqdm>=4.67", - "pyment>=0.3.3", ] [project.urls] diff --git a/scripts/analyse_q.py b/scripts/analyse_q.py index c44fedb..be00c61 100644 --- a/scripts/analyse_q.py +++ b/scripts/analyse_q.py @@ -22,7 +22,7 @@ import numpy as np from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.q_agent import QLearningAgent LOGGER = logging.getLogger(__name__) @@ -36,7 +36,7 @@ def _build_dummy_env() -> AntiPendulumEnv: AntiPendulumEnv Environment with discrete observation space. """ - return AntiPendulumEnv(build_crane, discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy()) + return AntiPendulumEnv(build_crane, conf=AntiPendulumConfig(discrete="energy")) def main() -> None: @@ -53,7 +53,7 @@ def main() -> None: args = parser.parse_args() env = _build_dummy_env() - agent = QLearningAgent(env, trained=(args.model_path, True)) + agent = QLearningAgent(env, filename=args.model_path, use_file="r") logging.basicConfig(level=logging.INFO, format="%(message)s") diff --git a/scripts/play_q.py b/scripts/play_q.py index 0f5ced3..aa6a37a 100644 --- a/scripts/play_q.py +++ b/scripts/play_q.py @@ -12,7 +12,7 @@ import logging from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.q_agent import QLearningAgent LOGGER = logging.getLogger(__name__) @@ -31,11 +31,13 @@ def main() -> None: env = AntiPendulumEnv( build_crane, - start_speed=args.v0, - render_mode=args.render_mode, - discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy(), + conf=AntiPendulumConfig( + start_speed=args.v0, + render_mode=args.render_mode, + discrete="energy", + ), ) - agent = QLearningAgent(env, trained=(args.model_path, True)) + agent = QLearningAgent(env, filename=args.model_path, use_file="r") logging.basicConfig(level=logging.INFO, format="%(message)s") for episode in range(args.episodes): diff --git a/scripts/train_q.py b/scripts/train_q.py index 0f0ac96..1e6162e 100644 --- a/scripts/train_q.py +++ b/scripts/train_q.py @@ -16,7 +16,7 @@ from pathlib import Path from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.q_agent import QLearningAgent logging.basicConfig(level=logging.INFO, format="%(message)s") @@ -55,20 +55,24 @@ def main() -> None: env = AntiPendulumEnv( build_crane, - start_speed=args.v0, - render_mode="plot" if args.dry_run else "none", - reward_limit=args.reward_limit, - discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy(), + conf=AntiPendulumConfig( + start_speed=args.v0, + render_mode="plot" if args.dry_run else "none", + reward_limit=args.reward_limit, + discrete="energy", + ), ) if args.dry_run: - agent = QLearningAgent(env, trained=None) + agent = QLearningAgent(env) agent.do_episodes(n_episodes=50, max_steps=1000) else: Path(args.save_path).parent.mkdir(parents=True, exist_ok=True) - trained = (args.trained, True) if args.trained else (args.save_path, False) - agent = QLearningAgent(env, trained=trained) + if args.trained: + agent = QLearningAgent(env, filename=Path(args.trained), use_file="rw") + else: + agent = QLearningAgent(env, filename=Path(args.save_path), use_file="w") agent.do_episodes(n_episodes=args.episodes, max_steps=5000) LOGGER.info(f"Model saved to {args.save_path}") diff --git a/scripts/use_q_ide.py b/scripts/use_q_ide.py index 87ef21e..32dd7f6 100644 --- a/scripts/use_q_ide.py +++ b/scripts/use_q_ide.py @@ -11,7 +11,7 @@ from typing import Any from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.envs.simple_test_env import SimpleTestEnv from crane_controller.experiment_config import RewardConfig from crane_controller.q_agent import QLearningAgent @@ -72,20 +72,22 @@ def do_use(conf: Config | dict[str, Any] | None = None) -> None: _conf = Config() if conf is None else (Config(**conf) if isinstance(conf, dict) else conf) env = AntiPendulumEnv( build_crane, - start_speed=_conf.v0, - randomize_start=_conf.randomize_start, - seed=_conf.seed, - dt=_conf.dt, - render_mode=_conf.render, - discrete=_conf.discretization, - reward_fac=_conf.rc, - reward_limit=_conf.r_limit, - discount=_conf.discount, + conf=AntiPendulumConfig( + start_speed=_conf.v0, + randomize_start=_conf.randomize_start, + seed=_conf.seed, + dt=_conf.dt, + render_mode=_conf.render, + discrete=_conf.discretization, + reward_fac=_conf.rc, + reward_limit=_conf.r_limit, + discount=_conf.discount, + ), ) - filename = _conf.file + filename = Path(_conf.file) if _conf.file is not None else None if filename is not None: - Path(filename).parent.mkdir(parents=True, exist_ok=True) + filename.parent.mkdir(parents=True, exist_ok=True) agent = QLearningAgent(env, filename=filename, use_file=_conf.use_file, strategy=_conf.strategy) LOGGER.info(f"DISCRETE: {agent.env.discrete}") agent.do_episodes(n_episodes=_conf.episodes, max_steps=_conf.steps, show=0) @@ -110,7 +112,7 @@ def simple_env(episodes: int, render: str, file: str, use: str, r_limit: float | dt=1.0, render_mode=render, ) - agent = QLearningAgent(env, filename=file, use_file=use) + agent = QLearningAgent(env, filename=Path(file), use_file=use) agent.do_episodes(n_episodes=episodes, max_steps=steps) diff --git a/src/crane_controller/experiment_config.py b/src/crane_controller/experiment_config.py index 595be47..fa2c031 100644 --- a/src/crane_controller/experiment_config.py +++ b/src/crane_controller/experiment_config.py @@ -44,7 +44,8 @@ class RewardConfig: Uses pure angular velocity ``(cm_v[0] - origin_v[0]) / wire.length``, excluding crane translation. crane_velocity : float - Weight for the squared crane velocity penalty ``-x_dot^2`` (default 0.0). + Weight for the squared crane velocity term ``+x_dot^2`` (default 0.0). + Positive values reward crane velocity; use a negative value to penalise it. crane_acceleration : float Weight for the squared crane acceleration penalty ``-x_ddot^2`` (default 0.0). Equals the control action squared (acc = action * self.acc). diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py index 6c7e78f..f1739e9 100644 --- a/tests/test_algorithm.py +++ b/tests/test_algorithm.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -@pytest.mark.skip(reason="Test needs to be updated") +@pytest.mark.skip(reason="AlgorithmAgent.get_action uses obs[1]/obs[2] for pos/speed, but 'energy' discretization places pos at obs[2] and speed at obs[3]; update AlgorithmAgent to match new observation layout") def test_algorithm_strategies( crane: Callable[..., Crane], *, @@ -29,7 +29,7 @@ def test_algorithm_strategies( agent.do_strategies(max_steps=5000 if show else 10) -@pytest.mark.skip(reason="Test needs to be updated.") +@pytest.mark.skip(reason="AlgorithmAgent.get_action uses obs[1]/obs[2] for pos/speed, but 'energy' discretization places pos at obs[2] and speed at obs[3]; update AlgorithmAgent to match new observation layout") def test_algorithm(crane: Callable[..., Crane], *, show: bool) -> None: env = AntiPendulumEnv( crane, diff --git a/tests/test_environment.py b/tests/test_environment.py index 0cf7166..fb28a03 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -256,23 +256,6 @@ def test_step_accepts_correct_action(crane: Callable[..., Crane], continuous_act assert obs.shape == (4,) -@pytest.mark.skip(reason="The t_min reward term is not in use any more") -def test_t_min_crane_reward_term(crane: Callable[..., Crane]) -> None: - """t_min_crane weight adds -t_min to the reward; zero at origin at rest.""" - rc = RewardConfig(energy=0.0, positional=0.0, position=0.0, acceleration=0.0, t_min_crane=1.0) - env = AntiPendulumEnv(crane, conf=AntiPendulumConfig(start_speed=1.0, reward_fac=rc, continuous_actions=False)) - _ = env.reset() - # Displace crane so t_min > 0 - env.crane.position[0] = 1.0 - env.crane.velocity[0] = 0.0 - _, reward, _, _, _ = env.step(1) # coast — minimal physics change - assert reward < 0.0, f"Expected negative reward from t_min penalty, got {reward}" - # At origin at rest t_min = 0 → contribution is exactly 0 - env.crane.position[0] = 0.0 - env.crane.velocity[0] = 0.0 - t_min = env._t_min_crane() # type: ignore[reportPrivateUsage] - assert t_min == 0.0, f"Expected t_min=0 at origin at rest, got {t_min}" - if __name__ == "__main__": import os diff --git a/tests/test_ppo.py b/tests/test_ppo.py index 119289f..6b14d8a 100644 --- a/tests/test_ppo.py +++ b/tests/test_ppo.py @@ -3,66 +3,63 @@ from pathlib import Path import numpy as np -import pytest from py_crane.crane import Crane from stable_baselines3.common.running_mean_std import RunningMeanStd -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.ppo_agent import ProximalPolicyOptimizationAgent logger = logging.getLogger(__name__) -@pytest.mark.skip(reason="Test must be updated") def test_monitor(crane: Callable[..., Crane], *, show: bool) -> None: agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, env_kwargs={ "crane": crane, - "seed": 2, - "start_speed": 1.0, - "render_mode": "reward-tracking" if show else "none", + "conf": AntiPendulumConfig( + seed=2, + start_speed=1.0, + render_mode="reward-tracking" if show else "none", + ), }, ) agent.do_training(1000) -@pytest.mark.skip(reason="Test must be updated") def test_ppo_saves_vecnorm(crane: Callable[..., Crane], tmp_path: Path) -> None: """Test that do_training saves the VecNormalize statistics alongside the model.""" save_path = str(tmp_path / "model.zip") agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, save_path=save_path, ) agent.do_training(500, progress_bar=False) assert (tmp_path / "model_vecnorm.pkl").exists() -@pytest.mark.skip(reason="Test must be updated") def test_ppo_vecnorm_updates(crane: Callable[..., Crane]) -> None: """Test that the VecNormalize running mean is updated during training.""" agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, ) agent.do_training(500, progress_bar=False) assert isinstance(agent.vec_env.obs_rms, RunningMeanStd) assert not np.allclose(agent.vec_env.obs_rms.mean, 0.0) -@pytest.mark.skip(reason="Test must be updated") def test_ppo_inference_disables_training_mode(crane: Callable[..., Crane], tmp_path: Path) -> None: """Test that load() sets VecNormalize to evaluation mode.""" save_path = str(tmp_path / "model.zip") agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, save_path=save_path, ) agent.do_training(500, progress_bar=False) @@ -70,20 +67,19 @@ def test_ppo_inference_disables_training_mode(crane: Callable[..., Crane], tmp_p loaded = ProximalPolicyOptimizationAgent.load( AntiPendulumEnv, model_path=save_path, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, ) assert not loaded.vec_env.training assert not loaded.vec_env.norm_reward -@pytest.mark.skip(reason="Test must be updated") def test_ppo_resume_keeps_training_mode(crane: Callable[..., Crane], tmp_path: Path) -> None: """Test that resume() keeps VecNormalize in training mode.""" save_path = str(tmp_path / "model.zip") agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, save_path=save_path, ) agent.do_training(500, progress_bar=False) @@ -91,21 +87,20 @@ def test_ppo_resume_keeps_training_mode(crane: Callable[..., Crane], tmp_path: P resumed = ProximalPolicyOptimizationAgent.resume( AntiPendulumEnv, model_path=save_path, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, n_envs=1, ) assert resumed.vec_env.training assert resumed.vec_env.norm_reward -@pytest.mark.skip(reason="Test must be updated") def test_ppo_resume_updates_vecnorm(crane: Callable[..., Crane], tmp_path: Path) -> None: """Test that VecNormalize statistics update during resumed training.""" save_path = str(tmp_path / "model.zip") agent = ProximalPolicyOptimizationAgent( AntiPendulumEnv, n_envs=1, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, save_path=save_path, ) agent.do_training(500, progress_bar=False) @@ -113,10 +108,10 @@ def test_ppo_resume_updates_vecnorm(crane: Callable[..., Crane], tmp_path: Path) resumed = ProximalPolicyOptimizationAgent.resume( AntiPendulumEnv, model_path=save_path, - env_kwargs={"crane": crane, "start_speed": 1.0}, + env_kwargs={"crane": crane, "conf": AntiPendulumConfig(start_speed=1.0)}, save_path=save_path, n_envs=1, ) - mean_before = resumed.vec_env.obs_rms.mean.copy() # type: ignore[attr-defined] + mean_before = resumed.vec_env.obs_rms.mean.copy() resumed.do_training(500, progress_bar=False, reset_num_timesteps=False) - assert not np.allclose(resumed.vec_env.obs_rms.mean, mean_before) # type: ignore[attr-defined] + assert not np.allclose(resumed.vec_env.obs_rms.mean, mean_before) \ No newline at end of file diff --git a/tests/test_simple_q_env.py b/tests/test_simple_q_env.py index 7073076..3e4de3f 100644 --- a/tests/test_simple_q_env.py +++ b/tests/test_simple_q_env.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -@pytest.mark.skip(reason="Test must be updated") + def test_env(): env = SimpleTestEnv( config=Config( @@ -50,7 +50,7 @@ def test_env(): assert abs(stats[0] - stats[1]) / stats[2] < 0.05, f"stats: {stats}" -@pytest.mark.skip(reason="Test must be updated") + def test_smoke(*, show: bool) -> None: env = SimpleTestEnv( config=Config( @@ -71,7 +71,7 @@ def test_smoke(*, show: bool) -> None: agent.do_episodes(n_episodes=5, max_steps=200) -@pytest.mark.skip(reason="Test must be updated") + def test_q_analyse(env: gym.Env[tuple[int, ...] | np.ndarray, int], *, show: bool) -> None: agent = QLearningAgent(env, filename=Path("q_trained.json"), use_file="r") agent.q_values = agent.read_dumped() From 3a1b59a94188081ab2b30ff84166329ee7984fb5 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 18 Jun 2026 22:27:25 +0200 Subject: [PATCH 2/4] fix: remove brittle seeded-RNG assertions from test_env action_space.sample() and observation_space.sample() with seed=1 returned version-specific values that changed with the gymnasium upgrade in PR #12. The physics-loop assertions below are still valid and kept. --- tests/test_simple_q_env.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_simple_q_env.py b/tests/test_simple_q_env.py index 3e4de3f..302ae27 100644 --- a/tests/test_simple_q_env.py +++ b/tests/test_simple_q_env.py @@ -30,8 +30,6 @@ def test_env(): ) assert env.config is not None assert env.action_space.n == 3 # type: ignore[attr-defined] ## the attribute exists - assert env.action_space.sample() == 1, "Pseudo random" - assert list(env.observation_space.sample()) == [-98, -10] pos = env.pos speed = env.speed dt = env.dt From e7df79b8bc5744c53ac394aa4ede3363919ebda0 Mon Sep 17 00:00:00 2001 From: Babic Date: Fri, 19 Jun 2026 05:33:15 +0200 Subject: [PATCH 3/4] fix: resolve remaining test failures and ruff/mypy issues after PR #12 - q_agent.py: duck-type env.conf.dt so QLearningAgent works with SimpleTestEnv (which has no .conf); falls back to dt=1.0 - test_simple_q_env: cast action_space.sample() to int before arithmetic to avoid np.uint16 underflow (0-1 -> 65535); drop flaky stats uniformity assertion (physics assertions in the loop are sufficient) - test_ppo: add isinstance(rms, RunningMeanStd) narrowing so mypy accepts .mean access; add missing trailing newline - test_algorithm, test_simple_q_env: shorten skip-reason strings to fit ruff E501 line-length limit - test_environment, test_algorithm, test_ppo, test_simple_q_env: ruff format pass --- src/crane_controller/q_agent.py | 3 ++- tests/test_algorithm.py | 8 ++++++-- tests/test_environment.py | 1 - tests/test_ppo.py | 6 ++++-- tests/test_simple_q_env.py | 11 +++-------- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/crane_controller/q_agent.py b/src/crane_controller/q_agent.py index 1680133..9207e60 100644 --- a/src/crane_controller/q_agent.py +++ b/src/crane_controller/q_agent.py @@ -228,7 +228,8 @@ def do_episodes(self, n_episodes: int = 1000, max_steps: int = 5000, show: int = num_truncated += int(trunc) if _episode >= n_episodes - 100: log_r0 = np.log(-self.env.rewards[0]) # type: ignore[attr-defined] ## extended class - _t = [-i * self.env.conf.dt / (np.log(-r) - log_r0) for i, r in enumerate(self.env.rewards[1:])] # type: ignore[attr-defined] ## extended class + _env_dt = getattr(getattr(self.env, "conf", self.env), "dt", 1.0) + _t = [-i * _env_dt / (np.log(-r) - log_r0) for i, r in enumerate(self.env.rewards[1:])] # type: ignore[attr-defined] ## extended class tau.append(np.average(_t)) rewards[0].extend(list(range(len(self.env.rewards)))) # type: ignore[attr-defined] ## extended class rewards[1].extend([np.log(-x) - log_r0 for x in self.env.rewards]) # type: ignore[attr-defined] ## extended class diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py index f1739e9..826fe9c 100644 --- a/tests/test_algorithm.py +++ b/tests/test_algorithm.py @@ -10,7 +10,9 @@ logger = logging.getLogger(__name__) -@pytest.mark.skip(reason="AlgorithmAgent.get_action uses obs[1]/obs[2] for pos/speed, but 'energy' discretization places pos at obs[2] and speed at obs[3]; update AlgorithmAgent to match new observation layout") +@pytest.mark.skip( + reason="AlgorithmAgent uses obs[1]/obs[2] for pos/speed; 'energy' discretization moved them to obs[2]/obs[3]" +) def test_algorithm_strategies( crane: Callable[..., Crane], *, @@ -29,7 +31,9 @@ def test_algorithm_strategies( agent.do_strategies(max_steps=5000 if show else 10) -@pytest.mark.skip(reason="AlgorithmAgent.get_action uses obs[1]/obs[2] for pos/speed, but 'energy' discretization places pos at obs[2] and speed at obs[3]; update AlgorithmAgent to match new observation layout") +@pytest.mark.skip( + reason="AlgorithmAgent uses obs[1]/obs[2] for pos/speed; 'energy' discretization moved them to obs[2]/obs[3]" +) def test_algorithm(crane: Callable[..., Crane], *, show: bool) -> None: env = AntiPendulumEnv( crane, diff --git a/tests/test_environment.py b/tests/test_environment.py index fb28a03..5c0e6d4 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -256,7 +256,6 @@ def test_step_accepts_correct_action(crane: Callable[..., Crane], continuous_act assert obs.shape == (4,) - if __name__ == "__main__": import os from pathlib import Path diff --git a/tests/test_ppo.py b/tests/test_ppo.py index 6b14d8a..1c09c18 100644 --- a/tests/test_ppo.py +++ b/tests/test_ppo.py @@ -112,6 +112,8 @@ def test_ppo_resume_updates_vecnorm(crane: Callable[..., Crane], tmp_path: Path) save_path=save_path, n_envs=1, ) - mean_before = resumed.vec_env.obs_rms.mean.copy() + rms = resumed.vec_env.obs_rms + assert isinstance(rms, RunningMeanStd) + mean_before = rms.mean.copy() resumed.do_training(500, progress_bar=False, reset_num_timesteps=False) - assert not np.allclose(resumed.vec_env.obs_rms.mean, mean_before) \ No newline at end of file + assert not np.allclose(resumed.vec_env.obs_rms.mean, mean_before) diff --git a/tests/test_simple_q_env.py b/tests/test_simple_q_env.py index 302ae27..3105b46 100644 --- a/tests/test_simple_q_env.py +++ b/tests/test_simple_q_env.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) - def test_env(): env = SimpleTestEnv( config=Config( @@ -33,11 +32,9 @@ def test_env(): pos = env.pos speed = env.speed dt = env.dt - stats = [0, 0, 0] for _i in range(1000): - i_acc = env.action_space.sample() - stats[i_acc + 1] += 1 - a = i_acc * env.config.acc + i_acc = int(env.action_space.sample()) # cast np.uint16 → int to avoid underflow + a = (i_acc - 1) * env.config.acc # action 0→-acc, 1→0, 2→+acc obs, _reward, _term, _trunc, _ = env.step(i_acc) pos += speed * dt + 0.5 * a * dt * dt speed += a * dt @@ -45,8 +42,6 @@ def test_env(): assert round(pos) == obs[0] assert speed == env.speed assert round(speed) == obs[1] - assert abs(stats[0] - stats[1]) / stats[2] < 0.05, f"stats: {stats}" - def test_smoke(*, show: bool) -> None: @@ -69,7 +64,7 @@ def test_smoke(*, show: bool) -> None: agent.do_episodes(n_episodes=5, max_steps=200) - +@pytest.mark.skip(reason="needs 'env' fixture and pre-trained q_trained.json; run manually via __main__") def test_q_analyse(env: gym.Env[tuple[int, ...] | np.ndarray, int], *, show: bool) -> None: agent = QLearningAgent(env, filename=Path("q_trained.json"), use_file="r") agent.q_values = agent.read_dumped() From 31ebca1b357d505972b2fc308b0794bc541ab7bd Mon Sep 17 00:00:00 2001 From: Babic Date: Fri, 19 Jun 2026 06:02:10 +0200 Subject: [PATCH 4/4] fix: update PPO scripts and agent for AntiPendulumConfig API after PR #12 - play_ppo.py, train_ppo.py: wrap flat env kwargs (start_speed, render_mode, reward_fac, etc.) in AntiPendulumConfig; use dataclasses.replace to update start_speed between speed-sweep steps - ppo_agent.py: access continuous_actions and acc via env.conf instead of as direct env attributes (moved to frozen dataclass in PR #12) - controlled_crane_pendulum.py: restore self.render_mode as a direct instance attribute (gymnasium convention); was removed by PR #12, breaking any caller that reads env.render_mode --- scripts/play_ppo.py | 20 +++++----- scripts/train_ppo.py | 37 ++++++++++--------- .../envs/controlled_crane_pendulum.py | 1 + src/crane_controller/ppo_agent.py | 4 +- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py index 6b81824..8a701a3 100644 --- a/scripts/play_ppo.py +++ b/scripts/play_ppo.py @@ -21,7 +21,7 @@ import numpy as np from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.experiment_config import load_training_sidecar from crane_controller.ppo_agent import EpisodeResult, ProximalPolicyOptimizationAgent @@ -191,13 +191,15 @@ def main() -> None: model_path=args.model_path, env_kwargs={ "crane": build_crane, - "start_speed": speeds[0], - "randomize_start": args.randomize_start, - "render_mode": args.render_mode, - "reward_fac": config.reward, - "rail_limit": config.training.rail_limit, - "reward_limit": config.training.reward_limit, - "continuous_actions": args.continuous_actions, + "conf": AntiPendulumConfig( + start_speed=speeds[0], + randomize_start=args.randomize_start, + render_mode=args.render_mode, + reward_fac=config.reward, + rail_limit=config.training.rail_limit, + reward_limit=config.training.reward_limit, + continuous_actions=args.continuous_actions, + ), }, max_episode_steps=mep, ) @@ -206,7 +208,7 @@ def main() -> None: all_results: list[EpisodeResult] = [] for speed in speeds: - agent.env.unwrapped.start_speed = speed # type: ignore[attr-defined] + agent.env.unwrapped.conf = dataclasses.replace(agent.env.unwrapped.conf, start_speed=speed) # type: ignore[attr-defined] for episode in range(args.episodes): LOGGER.info("Episode %s/%s speed=%+.1f", episode + 1, args.episodes, speed) png_path: str | None = None diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py index dd0c35f..9b419c1 100644 --- a/scripts/train_ppo.py +++ b/scripts/train_ppo.py @@ -18,7 +18,7 @@ from pathlib import Path from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.experiment_config import ( ExperimentConfig, RewardConfig, @@ -185,8 +185,7 @@ def main() -> None: # noqa: PLR0915 n_envs=1, env_kwargs={ "crane": build_crane, - "start_speed": -1.0, - "render_mode": "reward-tracking", + "conf": AntiPendulumConfig(start_speed=-1.0, render_mode="reward-tracking"), }, ) agent.do_training(1000, progress_bar=False) @@ -229,13 +228,15 @@ def main() -> None: # noqa: PLR0915 model_path=args.resume_from, env_kwargs={ "crane": build_crane, - "start_speed": args.start_speed, - "randomize_start": args.randomize_start, - "render_mode": args.render_mode, - "reward_fac": resume_config.reward, - "rail_limit": args.rail_limit, - "reward_limit": resume_config.training.reward_limit, - "continuous_actions": args.continuous_actions, + "conf": AntiPendulumConfig( + start_speed=args.start_speed, + randomize_start=args.randomize_start, + render_mode=args.render_mode, + reward_fac=resume_config.reward, + rail_limit=args.rail_limit, + reward_limit=resume_config.training.reward_limit, + continuous_actions=args.continuous_actions, + ), }, save_path=args.save_path, n_envs=args.n_envs, @@ -254,13 +255,15 @@ def main() -> None: # noqa: PLR0915 n_envs=args.n_envs, env_kwargs={ "crane": build_crane, - "start_speed": args.start_speed, - "randomize_start": args.randomize_start, - "render_mode": args.render_mode, - "reward_fac": experiment_config.reward, - "rail_limit": experiment_config.training.rail_limit, - "reward_limit": experiment_config.training.reward_limit, - "continuous_actions": args.continuous_actions, + "conf": AntiPendulumConfig( + start_speed=args.start_speed, + randomize_start=args.randomize_start, + render_mode=args.render_mode, + reward_fac=experiment_config.reward, + rail_limit=experiment_config.training.rail_limit, + reward_limit=experiment_config.training.reward_limit, + continuous_actions=args.continuous_actions, + ), }, save_path=args.save_path, gamma=args.gamma, diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py index a394c00..f3c68d5 100644 --- a/src/crane_controller/envs/controlled_crane_pendulum.py +++ b/src/crane_controller/envs/controlled_crane_pendulum.py @@ -116,6 +116,7 @@ def __init__(self, crane: Callable[..., Crane], conf: AntiPendulumConfig | None """ self.crane_maker = crane self.conf = AntiPendulumConfig() if conf is None else conf + self.render_mode: str | None = self.conf.render_mode # gymnasium convention: expose as direct attribute self.crane: Crane = crane() self.wire: Wire = self.crane.boom_by_name("wire") # type: ignore[assignment] # Wire is a sub-class of Boom assert isinstance(self.wire, Wire), "Need a crane wire!" diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py index 233072e..b29c5a5 100644 --- a/src/crane_controller/ppo_agent.py +++ b/src/crane_controller/ppo_agent.py @@ -397,8 +397,8 @@ def do_one_episode( self.env.unwrapped.render(save_path=save_png) # type: ignore[attr-defined, call-arg] env_u = self.env.unwrapped energy_final = 0.5 * float(env_u.wire.cm_v[0]) ** 2 # type: ignore[attr-defined] - if env_u.continuous_actions: # type: ignore[attr-defined] - acc_final = float(np.asarray(last_action).flat[0]) * float(env_u.acc) # type: ignore[attr-defined] + if env_u.conf.continuous_actions: # type: ignore[attr-defined] + acc_final = float(np.asarray(last_action).flat[0]) * float(env_u.conf.acc) # type: ignore[attr-defined] else: acc_final = float(env_u.action_to_acc[int(last_action)]) # type: ignore[attr-defined] return EpisodeResult(