dnv-opensource · eisDNV · Jun 19, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 19, 2026
diff --git a/experiments/hybrid_cv01.yaml b/experiments/hybrid_cv01.yaml
@@ -1,6 +1,7 @@
 # Validated hybrid reward: energy + crane_velocity penalty + position return.
-# energy=1.0 (KE+PE physics signal), crane_velocity=0.1 (-x_dot^2 penalty to
-# damp trolley oscillation), position=0.02 (-|x| to encourage return to origin).
+# energy=1.0 (KE+PE physics signal), crane_velocity=-0.1 (x_dot^2 penalty to
+# damp trolley oscillation; the weight is negative because the reward term is
+# rc.crane_velocity * x_dot^2), position=0.1 (-|x| to encourage return to origin).
 # terminal_penalty=-5.0 provides a one-shot crash signal propagated ~100 steps
 # back by gamma=0.99. Trained with randomize_start=True on speeds +-[0.1, 1.0].
 # Seeds 2718, 3141, 31415 achieve 6/6 OOD generalisation at start_speed=7.0.
@@ -13,7 +14,7 @@ reward:
   terminal_penalty: -5.0
   angle: 0.0
   angular_velocity: 0.0
-  crane_velocity: 0.1
+  crane_velocity: -0.1
   crane_acceleration: 0.0
   angular_acceleration: 0.0
   t_min_crane: 0.0

diff --git a/experiments/hybrid_t_min.yaml b/experiments/hybrid_t_min.yaml
@@ -10,7 +10,7 @@ reward:
   terminal_penalty: -5.0
   angle: 0.0
   angular_velocity: 0.0
-  crane_velocity: 0.1
+  crane_velocity: -0.1
   crane_acceleration: 0.0
   angular_acceleration: 0.0
   t_min_crane: 0.01

diff --git a/pyproject.toml b/pyproject.toml
@@ -77,7 +77,6 @@ dependencies = [
     "matplotlib>=3.10",
     "seaborn>=0.13.2",
     "tqdm>=4.67",
-    "pyment>=0.3.3",
 ]
 
 [project.urls]

diff --git a/scripts/analyse_q.py b/scripts/analyse_q.py
@@ -22,7 +22,7 @@
 import numpy as np
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.q_agent import QLearningAgent
 
 LOGGER = logging.getLogger(__name__)
@@ -36,7 +36,7 @@ def _build_dummy_env() -> AntiPendulumEnv:
     AntiPendulumEnv
         Environment with discrete observation space.
     """
-    return AntiPendulumEnv(build_crane, discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy())
+    return AntiPendulumEnv(build_crane, conf=AntiPendulumConfig(discrete="energy"))
 
 
 def main() -> None:
@@ -53,7 +53,7 @@ def main() -> None:
     args = parser.parse_args()
 
     env = _build_dummy_env()
-    agent = QLearningAgent(env, trained=(args.model_path, True))
+    agent = QLearningAgent(env, filename=args.model_path, use_file="r")
 
     logging.basicConfig(level=logging.INFO, format="%(message)s")
 

diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.experiment_config import load_training_sidecar
 from crane_controller.ppo_agent import EpisodeResult, ProximalPolicyOptimizationAgent
 
@@ -191,13 +191,15 @@ def main() -> None:
         model_path=args.model_path,
         env_kwargs={
             "crane": build_crane,
-            "start_speed": speeds[0],
-            "randomize_start": args.randomize_start,
-            "render_mode": args.render_mode,
-            "reward_fac": config.reward,
-            "rail_limit": config.training.rail_limit,
-            "reward_limit": config.training.reward_limit,
-            "continuous_actions": args.continuous_actions,
+            "conf": AntiPendulumConfig(
+                start_speed=speeds[0],
+                randomize_start=args.randomize_start,
+                render_mode=args.render_mode,
+                reward_fac=config.reward,
+                rail_limit=config.training.rail_limit,
+                reward_limit=config.training.reward_limit,
+                continuous_actions=args.continuous_actions,
+            ),
         },
         max_episode_steps=mep,
     )
@@ -206,7 +208,7 @@ def main() -> None:
     all_results: list[EpisodeResult] = []
 
     for speed in speeds:
-        agent.env.unwrapped.start_speed = speed  # type: ignore[attr-defined]
+        agent.env.unwrapped.conf = dataclasses.replace(agent.env.unwrapped.conf, start_speed=speed)  # type: ignore[attr-defined]
         for episode in range(args.episodes):
             LOGGER.info("Episode %s/%s  speed=%+.1f", episode + 1, args.episodes, speed)
             png_path: str | None = None

diff --git a/scripts/play_q.py b/scripts/play_q.py
@@ -12,7 +12,7 @@
 import logging
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.q_agent import QLearningAgent
 
 LOGGER = logging.getLogger(__name__)
@@ -31,11 +31,13 @@ def main() -> None:
 
     env = AntiPendulumEnv(
         build_crane,
-        start_speed=args.v0,
-        render_mode=args.render_mode,
-        discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy(),
+        conf=AntiPendulumConfig(
+            start_speed=args.v0,
+            render_mode=args.render_mode,
+            discrete="energy",
+        ),
     )
-    agent = QLearningAgent(env, trained=(args.model_path, True))
+    agent = QLearningAgent(env, filename=args.model_path, use_file="r")
 
     logging.basicConfig(level=logging.INFO, format="%(message)s")
     for episode in range(args.episodes):

diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py
@@ -18,7 +18,7 @@
 from pathlib import Path
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.experiment_config import (
     ExperimentConfig,
     RewardConfig,
@@ -185,8 +185,7 @@ def main() -> None:  # noqa: PLR0915
             n_envs=1,
             env_kwargs={
                 "crane": build_crane,
-                "start_speed": -1.0,
-                "render_mode": "reward-tracking",
+                "conf": AntiPendulumConfig(start_speed=-1.0, render_mode="reward-tracking"),
             },
         )
         agent.do_training(1000, progress_bar=False)
@@ -229,13 +228,15 @@ def main() -> None:  # noqa: PLR0915
             model_path=args.resume_from,
             env_kwargs={
                 "crane": build_crane,
-                "start_speed": args.start_speed,
-                "randomize_start": args.randomize_start,
-                "render_mode": args.render_mode,
-                "reward_fac": resume_config.reward,
-                "rail_limit": args.rail_limit,
-                "reward_limit": resume_config.training.reward_limit,
-                "continuous_actions": args.continuous_actions,
+                "conf": AntiPendulumConfig(
+                    start_speed=args.start_speed,
+                    randomize_start=args.randomize_start,
+                    render_mode=args.render_mode,
+                    reward_fac=resume_config.reward,
+                    rail_limit=args.rail_limit,
+                    reward_limit=resume_config.training.reward_limit,
+                    continuous_actions=args.continuous_actions,
+                ),
             },
             save_path=args.save_path,
             n_envs=args.n_envs,
@@ -254,13 +255,15 @@ def main() -> None:  # noqa: PLR0915
             n_envs=args.n_envs,
             env_kwargs={
                 "crane": build_crane,
-                "start_speed": args.start_speed,
-                "randomize_start": args.randomize_start,
-                "render_mode": args.render_mode,
-                "reward_fac": experiment_config.reward,
-                "rail_limit": experiment_config.training.rail_limit,
-                "reward_limit": experiment_config.training.reward_limit,
-                "continuous_actions": args.continuous_actions,
+                "conf": AntiPendulumConfig(
+                    start_speed=args.start_speed,
+                    randomize_start=args.randomize_start,
+                    render_mode=args.render_mode,
+                    reward_fac=experiment_config.reward,
+                    rail_limit=experiment_config.training.rail_limit,
+                    reward_limit=experiment_config.training.reward_limit,
+                    continuous_actions=args.continuous_actions,
+                ),
             },
             save_path=args.save_path,
             gamma=args.gamma,

diff --git a/scripts/train_q.py b/scripts/train_q.py
@@ -16,7 +16,7 @@
 from pathlib import Path
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.q_agent import QLearningAgent
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
@@ -55,20 +55,24 @@ def main() -> None:
 
     env = AntiPendulumEnv(
         build_crane,
-        start_speed=args.v0,
-        render_mode="plot" if args.dry_run else "none",
-        reward_limit=args.reward_limit,
-        discrete=AntiPendulumEnv.DEFAULT_DISCRETE.copy(),
+        conf=AntiPendulumConfig(
+            start_speed=args.v0,
+            render_mode="plot" if args.dry_run else "none",
+            reward_limit=args.reward_limit,
+            discrete="energy",
+        ),
     )
 
     if args.dry_run:
-        agent = QLearningAgent(env, trained=None)
+        agent = QLearningAgent(env)
         agent.do_episodes(n_episodes=50, max_steps=1000)
 
     else:
         Path(args.save_path).parent.mkdir(parents=True, exist_ok=True)
-        trained = (args.trained, True) if args.trained else (args.save_path, False)
-        agent = QLearningAgent(env, trained=trained)
+        if args.trained:
+            agent = QLearningAgent(env, filename=Path(args.trained), use_file="rw")
+        else:
+            agent = QLearningAgent(env, filename=Path(args.save_path), use_file="w")
         agent.do_episodes(n_episodes=args.episodes, max_steps=5000)
         LOGGER.info(f"Model saved to {args.save_path}")
 

diff --git a/scripts/use_q_ide.py b/scripts/use_q_ide.py
@@ -11,7 +11,7 @@
 from typing import Any
 
 from crane_controller.crane_factory import build_crane
-from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv
+from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv
 from crane_controller.envs.simple_test_env import SimpleTestEnv
 from crane_controller.experiment_config import RewardConfig
 from crane_controller.q_agent import QLearningAgent
@@ -72,20 +72,22 @@ def do_use(conf: Config | dict[str, Any] | None = None) -> None:
     _conf = Config() if conf is None else (Config(**conf) if isinstance(conf, dict) else conf)
     env = AntiPendulumEnv(
         build_crane,
-        start_speed=_conf.v0,
-        randomize_start=_conf.randomize_start,
-        seed=_conf.seed,
-        dt=_conf.dt,
-        render_mode=_conf.render,
-        discrete=_conf.discretization,
-        reward_fac=_conf.rc,
-        reward_limit=_conf.r_limit,
-        discount=_conf.discount,
+        conf=AntiPendulumConfig(
+            start_speed=_conf.v0,
+            randomize_start=_conf.randomize_start,
+            seed=_conf.seed,
+            dt=_conf.dt,
+            render_mode=_conf.render,
+            discrete=_conf.discretization,
+            reward_fac=_conf.rc,
+            reward_limit=_conf.r_limit,
+            discount=_conf.discount,
+        ),
     )
 
-    filename = _conf.file
+    filename = Path(_conf.file) if _conf.file is not None else None
     if filename is not None:
-        Path(filename).parent.mkdir(parents=True, exist_ok=True)
+        filename.parent.mkdir(parents=True, exist_ok=True)
     agent = QLearningAgent(env, filename=filename, use_file=_conf.use_file, strategy=_conf.strategy)
     LOGGER.info(f"DISCRETE: {agent.env.discrete}")
     agent.do_episodes(n_episodes=_conf.episodes, max_steps=_conf.steps, show=0)
@@ -110,7 +112,7 @@ def simple_env(episodes: int, render: str, file: str, use: str, r_limit: float |
         dt=1.0,
         render_mode=render,
     )
-    agent = QLearningAgent(env, filename=file, use_file=use)
+    agent = QLearningAgent(env, filename=Path(file), use_file=use)
     agent.do_episodes(n_episodes=episodes, max_steps=steps)
 
 

diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py
@@ -116,6 +116,7 @@ def __init__(self, crane: Callable[..., Crane], conf: AntiPendulumConfig | None
         """
         self.crane_maker = crane
         self.conf = AntiPendulumConfig() if conf is None else conf
+        self.render_mode: str | None = self.conf.render_mode  # gymnasium convention: expose as direct attribute
         self.crane: Crane = crane()
         self.wire: Wire = self.crane.boom_by_name("wire")  # type: ignore[assignment]  # Wire is a sub-class of Boom
         assert isinstance(self.wire, Wire), "Need a crane wire!"

diff --git a/src/crane_controller/experiment_config.py b/src/crane_controller/experiment_config.py
@@ -44,7 +44,8 @@ class RewardConfig:
         Uses pure angular velocity ``(cm_v[0] - origin_v[0]) / wire.length``,
         excluding crane translation.
     crane_velocity : float
-        Weight for the squared crane velocity penalty ``-x_dot^2`` (default 0.0).
+        Weight for the squared crane velocity term ``+x_dot^2`` (default 0.0).
+        Positive values reward crane velocity; use a negative value to penalise it.
     crane_acceleration : float
         Weight for the squared crane acceleration penalty ``-x_ddot^2`` (default 0.0).
         Equals the control action squared (acc = action * self.acc).

diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py
@@ -397,8 +397,8 @@ def do_one_episode(
         self.env.unwrapped.render(save_path=save_png)  # type: ignore[attr-defined, call-arg]
         env_u = self.env.unwrapped
         energy_final = 0.5 * float(env_u.wire.cm_v[0]) ** 2  # type: ignore[attr-defined]
-        if env_u.continuous_actions:  # type: ignore[attr-defined]
-            acc_final = float(np.asarray(last_action).flat[0]) * float(env_u.acc)  # type: ignore[attr-defined]
+        if env_u.conf.continuous_actions:  # type: ignore[attr-defined]
+            acc_final = float(np.asarray(last_action).flat[0]) * float(env_u.conf.acc)  # type: ignore[attr-defined]
         else:
             acc_final = float(env_u.action_to_acc[int(last_action)])  # type: ignore[attr-defined]
         return EpisodeResult(

diff --git a/src/crane_controller/q_agent.py b/src/crane_controller/q_agent.py
@@ -228,7 +228,8 @@ def do_episodes(self, n_episodes: int = 1000, max_steps: int = 5000, show: int =
             num_truncated += int(trunc)
             if _episode >= n_episodes - 100:
                 log_r0 = np.log(-self.env.rewards[0])  # type: ignore[attr-defined] ## extended class
-                _t = [-i * self.env.conf.dt / (np.log(-r) - log_r0) for i, r in enumerate(self.env.rewards[1:])]  # type: ignore[attr-defined] ## extended class
+                _env_dt = getattr(getattr(self.env, "conf", self.env), "dt", 1.0)
+                _t = [-i * _env_dt / (np.log(-r) - log_r0) for i, r in enumerate(self.env.rewards[1:])]  # type: ignore[attr-defined] ## extended class
                 tau.append(np.average(_t))
                 rewards[0].extend(list(range(len(self.env.rewards))))  # type: ignore[attr-defined] ## extended class
                 rewards[1].extend([np.log(-x) - log_r0 for x in self.env.rewards])  # type: ignore[attr-defined] ## extended class

diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py
@@ -10,7 +10,9 @@
 logger = logging.getLogger(__name__)
 
 
-@pytest.mark.skip(reason="Test needs to be updated")
+@pytest.mark.skip(
+    reason="AlgorithmAgent uses obs[1]/obs[2] for pos/speed; 'energy' discretization moved them to obs[2]/obs[3]"
+)
 def test_algorithm_strategies(
     crane: Callable[..., Crane],
     *,
@@ -29,7 +31,9 @@ def test_algorithm_strategies(
     agent.do_strategies(max_steps=5000 if show else 10)
 
 
-@pytest.mark.skip(reason="Test needs to be updated.")
+@pytest.mark.skip(
+    reason="AlgorithmAgent uses obs[1]/obs[2] for pos/speed; 'energy' discretization moved them to obs[2]/obs[3]"
+)
 def test_algorithm(crane: Callable[..., Crane], *, show: bool) -> None:
     env = AntiPendulumEnv(
         crane,

diff --git a/tests/test_environment.py b/tests/test_environment.py
@@ -256,24 +256,6 @@ def test_step_accepts_correct_action(crane: Callable[..., Crane], continuous_act
     assert obs.shape == (4,)
 
 
-@pytest.mark.skip(reason="The t_min reward term is not in use any more")
-def test_t_min_crane_reward_term(crane: Callable[..., Crane]) -> None:
-    """t_min_crane weight adds -t_min to the reward; zero at origin at rest."""
-    rc = RewardConfig(energy=0.0, positional=0.0, position=0.0, acceleration=0.0, t_min_crane=1.0)
-    env = AntiPendulumEnv(crane, conf=AntiPendulumConfig(start_speed=1.0, reward_fac=rc, continuous_actions=False))
-    _ = env.reset()
-    # Displace crane so t_min > 0
-    env.crane.position[0] = 1.0
-    env.crane.velocity[0] = 0.0
-    _, reward, _, _, _ = env.step(1)  # coast — minimal physics change
-    assert reward < 0.0, f"Expected negative reward from t_min penalty, got {reward}"
-    # At origin at rest t_min = 0 → contribution is exactly 0
-    env.crane.position[0] = 0.0
-    env.crane.velocity[0] = 0.0
-    t_min = env._t_min_crane()  # type: ignore[reportPrivateUsage]
-    assert t_min == 0.0, f"Expected t_min=0 at origin at rest, got {t_min}"
-
-
 if __name__ == "__main__":
     import os
     from pathlib import Path