From 2a4d55d3a3bde58f1b95e8975a125a2fe0cb4488 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Sun, 3 May 2026 21:53:53 -0400
Subject: [PATCH 1/8] Port Four Rooms from old fork

Copy the Four Rooms native environment sources and config from the old pufferlib-v3 fork into the v4 layout. This is an intentionally unadapted checkpoint; follow-up commits will update it for the current v4 APIs and build system.
---
 config/four_rooms.ini          |  26 +++
 ocean/four_rooms/binding.c     |  20 ++
 ocean/four_rooms/four_rooms.c  |  34 +++
 ocean/four_rooms/four_rooms.h  | 412 +++++++++++++++++++++++++++++++++
 ocean/four_rooms/four_rooms.py |  63 +++++
 5 files changed, 555 insertions(+)
 create mode 100644 config/four_rooms.ini
 create mode 100644 ocean/four_rooms/binding.c
 create mode 100644 ocean/four_rooms/four_rooms.c
 create mode 100644 ocean/four_rooms/four_rooms.h
 create mode 100644 ocean/four_rooms/four_rooms.py

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
new file mode 100644
index 0000000000..115b9a6b09
--- /dev/null
+++ b/config/four_rooms.ini
@@ -0,0 +1,26 @@
+[base]
+package = ocean
+env_name = puffer_four_rooms
+policy_name = Policy
+rnn_name = Recurrent
+#policy_name = FourRooms
+#rnn_name = FourRoomsLSTM
+
+[vec]
+num_workers = 12
+num_envs = 12
+
+[env]
+size = 27
+num_envs = 256
+
+[train]
+device = mps
+total_timesteps = 50_000_000
+gamma = 0.99
+learning_rate = 0.01
+minibatch_size = 32768
+clip_coef = 0.1
+ent_coef = 0.01
+vf_coef = 0.5
+max_grad_norm = 0.5
diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c
new file mode 100644
index 0000000000..e4f68a7eba
--- /dev/null
+++ b/ocean/four_rooms/binding.c
@@ -0,0 +1,20 @@
+#include "four_rooms.h"
+
+#define Env FourRooms
+#include "../env_binding.h"
+
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    env->size = unpack(kwargs, "size");
+    env->see_through_walls = 0;
+    // Allocate grid memory for full state (stores OBJECT_IDX values)
+    env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char));
+    return 0;
+}
+
+static int my_log(PyObject* dict, Log* log) {
+    assign_to_dict(dict, "perf", log->perf);
+    assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "episode_return", log->episode_return);
+    assign_to_dict(dict, "episode_length", log->episode_length);
+    return 0;
+}
diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c
new file mode 100644
index 0000000000..5681084950
--- /dev/null
+++ b/ocean/four_rooms/four_rooms.c
@@ -0,0 +1,34 @@
+#include "four_rooms.h"
+
+int main() {
+    FourRooms env = {};
+    env.size = 19;
+    env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding
+    env.actions = (int*)calloc(1, sizeof(int));
+    env.rewards = (float*)calloc(1, sizeof(float));
+    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+    env.grid = (unsigned char*)calloc(env.size * env.size, sizeof(unsigned char));
+
+    c_reset(&env);
+    c_render(&env);
+    while (!WindowShouldClose()) {
+        if (IsKeyDown(KEY_LEFT_SHIFT)) {
+            env.actions[0] = 7; // Invalid action = no-op
+            if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)) env.actions[0] = FORWARD;
+            if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) env.actions[0] = LEFT;
+            if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) env.actions[0] = RIGHT;
+        } else {
+            env.actions[0] = rand() % 3; // Only use left, right, forward
+        }
+        c_step(&env);
+        c_render(&env);
+    }
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    free(env.grid);
+    c_close(&env);
+    return 0;
+}
+
diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h
new file mode 100644
index 0000000000..9ef334f0c1
--- /dev/null
+++ b/ocean/four_rooms/four_rooms.h
@@ -0,0 +1,412 @@
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+// Action space
+const unsigned char LEFT = 0;
+const unsigned char RIGHT = 1; 
+const unsigned char FORWARD = 2;
+const unsigned char PICKUP = 3; // Unused
+const unsigned char DROP = 4; // Unused
+const unsigned char TOGGLE = 5; // Unused
+const unsigned char DONE = 6; // Unused
+
+// Observation: Objects
+const unsigned char UNSEEN = 0;
+const unsigned char EMPTY = 1;
+const unsigned char WALL = 2;
+const unsigned char FLOOR = 3; // Unused
+const unsigned char DOOR = 4; // Unused
+const unsigned char KEY = 5; // Unused
+const unsigned char BALL = 6; // Unused
+const unsigned char BOX = 7; // Unused
+const unsigned char GOAL = 8;
+const unsigned char LAVA = 9; // Unused
+const unsigned char AGENT = 10;
+
+// Observation: Colors
+const unsigned char COLOR_BLACK = 0;
+const unsigned char COLOR_GREEN = 1;
+const unsigned char COLOR_BLUE = 2;
+const unsigned char COLOR_PURPLE = 3;
+const unsigned char COLOR_YELLOW = 4;
+const unsigned char COLOR_GREY = 5;
+
+// PufferLib standard colors for rendering
+const Color PUFF_RED = (Color){187, 0, 0, 255};
+const Color PUFF_CYAN = (Color){0, 187, 187, 255};
+const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255};
+const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255};
+
+typedef struct { // Running totals accumulated by add_log()
+    float perf; // +1 for each logged episode whose final-step reward was positive
+    float score; // Sum of the final-step reward over logged episodes
+    float episode_return; // Accumulates the same final-step reward as score
+    float episode_length; // Sum of env->tick at the moment each episode is logged
+    float n; // Number of add_log() calls, i.e. episodes logged
+} Log;
+
+typedef struct {
+    Log log;
+    unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell
+    int* actions;
+    float* rewards;
+    unsigned char* terminals;
+    int size; // default 19
+    int tick;
+    int agent_x, agent_y;
+    int agent_dir; // 0=East, 1=South, 2=West, 3=North
+    int goal_x, goal_y;
+    unsigned char* grid; // Stores OBJECT_IDX values
+    int see_through_walls;
+    Texture2D puffers;
+} FourRooms;
+
+void add_log(FourRooms* env) {
+    env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0;
+    env->log.score += env->rewards[0];
+    env->log.episode_length += env->tick;
+    env->log.episode_return += env->rewards[0];
+    env->log.n++;
+}
+
+// Line-of-sight test from (agent_x, agent_y) to (target_x, target_y) along a
+// Bresenham line. Returns 0 if an in-bounds WALL cell lies strictly between the
+// two points, 1 otherwise; the target cell itself never blocks, and line cells
+// outside the grid are skipped. Always returns 1 when see_through_walls is set.
+int can_see_cell(FourRooms* env, int agent_x, int agent_y, int target_x, int target_y) {
+    if (env->see_through_walls) {
+        return 1;
+    }
+
+    // Bresenham line walk from the agent toward the target
+    int dx = abs(target_x - agent_x);
+    int dy = abs(target_y - agent_y);
+    int x = agent_x;
+    int y = agent_y;
+    int x_inc = (target_x > agent_x) ? 1 : -1;
+    int y_inc = (target_y > agent_y) ? 1 : -1;
+    int error = dx - dy;
+
+    // The loop condition already stops at the target, so no in-body check is needed
+    while (x != target_x || y != target_y) {
+        int error2 = 2 * error;
+        if (error2 > -dy) {
+            error -= dy;
+            x += x_inc;
+        }
+        if (error2 < dx) {
+            error += dx;
+            y += y_inc;
+        }
+
+        // An intermediate wall hides everything behind it; the target is exempt
+        if ((x != target_x || y != target_y) &&
+            x >= 0 && x < env->size && y >= 0 && y < env->size &&
+            env->grid[y * env->size + x] == WALL) {
+            return 0;
+        }
+    }
+    return 1; // Reached the target without crossing a wall
+}
+
+void generate_observation(FourRooms* env) {
+    // Generate 7x7x3 observation centered on agent's view direction
+    int view_size = 7;
+    int half_view = view_size / 2;
+    
+    // Calculate the center of the view based on agent's direction
+    int center_x = env->agent_x;
+    int center_y = env->agent_y;
+    
+    // Shift center forward in the direction the agent is facing
+    if (env->agent_dir == 0) center_x += half_view; // East
+    else if (env->agent_dir == 1) center_y += half_view; // South
+    else if (env->agent_dir == 2) center_x -= half_view; // West
+    else if (env->agent_dir == 3) center_y -= half_view; // North
+    
+    for (int i = 0; i < view_size; i++) {
+        for (int j = 0; j < view_size; j++) {
+            int world_x = center_x - half_view + j;
+            int world_y = center_y - half_view + i;
+            
+            // Calculate flat index for this cell in the 7x7x3 observation
+            int base_idx = (i * view_size + j) * 3;
+            
+            unsigned char object_idx, color_idx, state;
+            
+            // Check bounds, out of bounds is treated as wall
+            if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) {
+                object_idx = WALL;
+                color_idx = COLOR_GREY;
+                state = 0;
+            } else if (!can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) {
+                object_idx = UNSEEN; // Cell is blocked by walls
+                color_idx = COLOR_BLACK;
+                state = 0;
+            } else {
+                int grid_idx = world_y * env->size + world_x;
+                unsigned char grid_cell = env->grid[grid_idx];
+                
+                // Map grid cell to MiniGrid encoding
+                switch (grid_cell) {
+                    case EMPTY:
+                        object_idx = EMPTY;
+                        color_idx = COLOR_BLACK;
+                        state = 0;
+                        break;
+                    case WALL:
+                        object_idx = WALL;
+                        color_idx = COLOR_GREY;
+                        state = 0;
+                        break;
+                    case AGENT:
+                        object_idx = AGENT;
+                        color_idx = COLOR_BLUE;
+                        state = 0;
+                        break;
+                    case GOAL:
+                        object_idx = GOAL;
+                        color_idx = COLOR_GREEN;
+                        state = 0;
+                        break;
+                    default:
+                        object_idx = EMPTY;
+                        color_idx = 0;
+                        state = 0;
+                        break;
+                }
+            }
+            
+            env->observations[base_idx] = object_idx;
+            env->observations[base_idx + 1] = color_idx;
+            env->observations[base_idx + 2] = state;
+        }
+    }
+}
+
+void create_four_rooms_grid(FourRooms* env) {
+    int size = env->size;
+    
+    // Clear grid
+    memset(env->grid, EMPTY, size * size * sizeof(unsigned char));
+    
+    // Create outer walls
+    for (int i = 0; i < size; i++) {
+        env->grid[0 * size + i] = WALL; // Top
+        env->grid[(size-1) * size + i] = WALL; // Bottom
+        env->grid[i * size + 0] = WALL; // Left
+        env->grid[i * size + (size-1)] = WALL; // Right
+    }
+    
+    int room_w = size / 2;
+    int room_h = size / 2;
+    
+    // Create vertical separating wall
+    for (int y = 0; y < size; y++) {
+        env->grid[y * size + room_w] = WALL;
+    }
+    
+    // Create horizontal separating wall
+    for (int x = 0; x < size; x++) {
+        env->grid[room_h * size + x] = WALL;
+    }
+    
+    // Create 4 gaps in the separating walls
+    // Gap in vertical wall (top half)
+    int gap_y1 = 1 + rand() % (room_h - 2);
+    env->grid[gap_y1 * size + room_w] = EMPTY;
+    
+    // Gap in vertical wall (bottom half)
+    int gap_y2 = room_h + 1 + rand() % (room_h - 2);
+    env->grid[gap_y2 * size + room_w] = EMPTY;
+    
+    // Gap in horizontal wall (left half)
+    int gap_x1 = 1 + rand() % (room_w - 2);
+    env->grid[room_h * size + gap_x1] = EMPTY;
+    
+    // Gap in horizontal wall (right half)
+    int gap_x2 = room_w + 1 + rand() % (room_w - 2);
+    env->grid[room_h * size + gap_x2] = EMPTY;
+}
+
+void c_reset(FourRooms* env) {
+
+    create_four_rooms_grid(env);
+    
+    // Place agent randomly in valid position
+    do {
+        env->agent_x = 1 + rand() % (env->size - 2);
+        env->agent_y = 1 + rand() % (env->size - 2);
+    } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY);
+    
+    // Place goal randomly in valid position (different from agent)
+    do {
+        env->goal_x = 1 + rand() % (env->size - 2);
+        env->goal_y = 1 + rand() % (env->size - 2);
+    } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY ||
+             (env->goal_x == env->agent_x && env->goal_y == env->agent_y));
+    
+    // Set agent and goal on grid
+    env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
+    env->grid[env->goal_y * env->size + env->goal_x] = GOAL;
+    
+    // Random initial direction
+    env->agent_dir = rand() % 4;
+    env->tick = 0;
+    
+    generate_observation(env);
+}
+
+void c_step(FourRooms* env) {
+    env->tick += 1;
+    
+    int action = env->actions[0];
+    env->terminals[0] = 0;
+    env->rewards[0] = 0.0;
+    
+    // Clear agent from current position
+    env->grid[env->agent_y * env->size + env->agent_x] = EMPTY;
+    
+    int new_x = env->agent_x;
+    int new_y = env->agent_y;
+    int new_dir = env->agent_dir;
+    
+    if (action == LEFT) {
+        new_dir = (env->agent_dir + 3) % 4;
+    } else if (action == RIGHT) {
+        new_dir = (env->agent_dir + 1) % 4;
+    } else if (action == FORWARD) {
+        if (env->agent_dir == 0) new_x += 1;
+        else if (env->agent_dir == 1) new_y += 1;
+        else if (env->agent_dir == 2) new_x -= 1;
+        else if (env->agent_dir == 3) new_y -= 1;
+
+        // Check if move is valid
+        if (new_x >= 0 && new_x < env->size && new_y >= 0 && new_y < env->size &&
+            env->grid[new_y * env->size + new_x] != WALL) {
+            env->agent_x = new_x;
+            env->agent_y = new_y;
+        }
+    }
+    
+    env->agent_dir = new_dir;
+    
+    // Check if agent reached goal
+    if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) {
+        env->terminals[0] = 1;
+        env->rewards[0] = 1.0;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+    
+    // Place agent back on grid
+    env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
+    
+    // Check timeout
+    if (env->tick >= 4 * env->size) {
+        env->terminals[0] = 1;
+        env->rewards[0] = 0.0;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+    
+    generate_observation(env);
+}
+
+void c_render(FourRooms* env) {
+    if (!IsWindowReady()) {
+        InitWindow(32*env->size, 32*env->size, "PufferLib FourRooms");
+        SetTargetFPS(10);
+        env->puffers = LoadTexture("resources/shared/puffers_128.png");
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        exit(0);
+    }
+
+    BeginDrawing();
+    ClearBackground(PUFF_BACKGROUND);
+
+    int px = 32;
+    
+    // Draw the main grid
+    for (int y = 0; y < env->size; y++) {
+        for (int x = 0; x < env->size; x++) {
+            int cell = env->grid[y * env->size + x];
+            Color color = PUFF_BACKGROUND;
+            
+            if (cell == WALL) color = PUFF_BACKGROUND2;
+            else if (cell == GOAL) color = PUFF_RED;
+
+            if (cell != EMPTY && cell != AGENT) {
+                DrawRectangle(x*px, y*px, px, px, color);
+            }
+        }
+    }
+    
+    // Draw agent's 7x7 observation window
+    int view_size = 7;
+    int half_view = view_size / 2;
+    
+    // Calculate the center of the view based on agent's direction
+    int center_x = env->agent_x;
+    int center_y = env->agent_y;
+    
+    // Shift center forward in the direction the agent is facing
+    if (env->agent_dir == 0) center_x += half_view; // East
+    else if (env->agent_dir == 1) center_y += half_view; // South
+    else if (env->agent_dir == 2) center_x -= half_view; // West
+    else if (env->agent_dir == 3) center_y -= half_view; // North
+    
+    // Draw semi-transparent overlay for observation window
+    Color obs_overlay = (Color){180, 180, 180, 80};
+    for (int i = 0; i < view_size; i++) {
+        for (int j = 0; j < view_size; j++) {
+            int world_x = center_x - half_view + j;
+            int world_y = center_y - half_view + i;
+            
+            // Only draw overlay for cells within grid bounds and visible to agent
+            if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size &&
+                can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) {
+                DrawRectangle(world_x*px, world_y*px, px, px, obs_overlay);
+            }
+        }
+    }
+    
+    // Draw agent
+    int starting_sprite_x = 0;
+    int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°)
+    if (rotation == 180) {
+        starting_sprite_x = 128; // Use flipped sprite for 180° rotation
+        rotation = 0;
+    }
+    
+    DrawTexturePro(
+        env->puffers,
+        (Rectangle){starting_sprite_x, 0, 128, 128},
+        (Rectangle){
+            env->agent_x * px + px/2,
+            env->agent_y * px + px/2,
+            px,
+            px
+        },
+        (Vector2){px/2, px/2},
+        rotation,
+        WHITE
+    );
+
+    EndDrawing();
+}
+
+void c_close(FourRooms* env) {
+    if (IsWindowReady()) {
+        UnloadTexture(env->puffers);
+        CloseWindow();
+    }
+    if (env->grid) {
+        free(env->grid);
+    }
+}
diff --git a/ocean/four_rooms/four_rooms.py b/ocean/four_rooms/four_rooms.py
new file mode 100644
index 0000000000..b530fd8573
--- /dev/null
+++ b/ocean/four_rooms/four_rooms.py
@@ -0,0 +1,63 @@
+
+import gymnasium
+import numpy as np
+
+import pufferlib
+from pufferlib.ocean.four_rooms import binding
+
+class FourRooms(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=19, buf=None, seed=0):
+        self.single_observation_space = gymnasium.spaces.Box(low=0, high=10,
+            shape=(7*7*3,), dtype=np.uint8)
+        self.single_action_space = gymnasium.spaces.Discrete(7)
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+
+        super().__init__(buf)
+        self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed, size=size)
+ 
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.log_interval == 0:
+            info.append(binding.vec_log(self.c_envs))
+
+        return (self.observations, self.rewards,
+            self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
+
+if __name__ == '__main__':
+    N = 4096
+
+    env = FourRooms(num_envs=N)
+    env.reset()
+    steps = 0
+
+    CACHE = 1024
+    actions = np.random.randint(0, 7, (CACHE, N))  # 7 actions: left, right, forward, pickup, drop, toggle, done
+
+    i = 0
+    import time
+    start = time.time()
+    while time.time() - start < 10:
+        env.step(actions[i % CACHE])
+        steps += N
+        i += 1
+
+    print('FourRooms SPS:', int(steps / (time.time() - start)))

From 0fe7aacef7fade6c28a2f0418af6d47bc7340bfa Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Sun, 3 May 2026 22:04:54 -0400
Subject: [PATCH 2/8] Adapt Four Rooms to Ocean v4

Update Four Rooms for the v4 static vecenv interface: binding metadata, float action/terminal buffers, per-env RNG, and v4 config keys. Remove the old v3 Python wrapper because v4 integrates native Ocean environments through build.sh and config files.
---
 config/four_rooms.ini          |  30 ++++-----
 ocean/four_rooms/binding.c     |  25 +++----
 ocean/four_rooms/four_rooms.c  |  10 +--
 ocean/four_rooms/four_rooms.h  | 117 ++++++++++++++++++---------------
 ocean/four_rooms/four_rooms.py |  63 ------------------
 5 files changed, 96 insertions(+), 149 deletions(-)
 delete mode 100644 ocean/four_rooms/four_rooms.py

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index 115b9a6b09..816dbb7640 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -1,26 +1,24 @@
 [base]
-package = ocean
-env_name = puffer_four_rooms
-policy_name = Policy
-rnn_name = Recurrent
-#policy_name = FourRooms
-#rnn_name = FourRoomsLSTM
+env_name = four_rooms
 
 [vec]
-num_workers = 12
-num_envs = 12
+total_agents = 4096
+num_buffers = 2
+num_threads = 8
 
 [env]
-size = 27
-num_envs = 256
+size = 19
+
+[policy]
+hidden_size = 256
+num_layers = 2
+expansion_factor = 1
 
 [train]
-device = mps
-total_timesteps = 50_000_000
+total_timesteps = 100_000_000
 gamma = 0.99
-learning_rate = 0.01
+gae_lambda = 0.95
+learning_rate = 0.005
 minibatch_size = 32768
-clip_coef = 0.1
+horizon = 64
 ent_coef = 0.01
-vf_coef = 0.5
-max_grad_norm = 0.5
diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c
index e4f68a7eba..98d52b67fc 100644
--- a/ocean/four_rooms/binding.c
+++ b/ocean/four_rooms/binding.c
@@ -1,20 +1,23 @@
 #include "four_rooms.h"
 
+#define OBS_SIZE (7 * 7 * 3)
+#define NUM_ATNS 1
+#define ACT_SIZES {7}
+#define OBS_TENSOR_T ByteTensor
+
 #define Env FourRooms
-#include "../env_binding.h"
+#include "vecenv.h"
 
-static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
-    env->size = unpack(kwargs, "size");
+void my_init(Env* env, Dict* kwargs) {
+    env->num_agents = 1;
+    env->size = (int)dict_get(kwargs, "size")->value;
     env->see_through_walls = 0;
-    // Allocate grid memory for full state (stores OBJECT_IDX values)
     env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char));
-    return 0;
 }
 
-static int my_log(PyObject* dict, Log* log) {
-    assign_to_dict(dict, "perf", log->perf);
-    assign_to_dict(dict, "score", log->score);
-    assign_to_dict(dict, "episode_return", log->episode_return);
-    assign_to_dict(dict, "episode_length", log->episode_length);
-    return 0;
+void my_log(Log* log, Dict* out) {
+    dict_set(out, "perf", log->perf);
+    dict_set(out, "score", log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
 }
diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c
index 5681084950..7425e3229b 100644
--- a/ocean/four_rooms/four_rooms.c
+++ b/ocean/four_rooms/four_rooms.c
@@ -3,10 +3,12 @@
 int main() {
     FourRooms env = {};
     env.size = 19;
+    env.num_agents = 1;
+    env.rng = 0;
     env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding
-    env.actions = (int*)calloc(1, sizeof(int));
+    env.actions = (float*)calloc(1, sizeof(float));
     env.rewards = (float*)calloc(1, sizeof(float));
-    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+    env.terminals = (float*)calloc(1, sizeof(float));
     env.grid = (unsigned char*)calloc(env.size * env.size, sizeof(unsigned char));
 
     c_reset(&env);
@@ -18,7 +20,7 @@ int main() {
             if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) env.actions[0] = LEFT;
             if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) env.actions[0] = RIGHT;
         } else {
-            env.actions[0] = rand() % 3; // Only use left, right, forward
+            env.actions[0] = four_rooms_rand(&env, 3); // Only use left, right, forward
         }
         c_step(&env);
         c_render(&env);
@@ -27,8 +29,6 @@ int main() {
     free(env.actions);
     free(env.rewards);
     free(env.terminals);
-    free(env.grid);
     c_close(&env);
     return 0;
 }
-
diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h
index 9ef334f0c1..148d302b95 100644
--- a/ocean/four_rooms/four_rooms.h
+++ b/ocean/four_rooms/four_rooms.h
@@ -4,7 +4,7 @@
 
 // Action space
 const unsigned char LEFT = 0;
-const unsigned char RIGHT = 1; 
+const unsigned char RIGHT = 1;
 const unsigned char FORWARD = 2;
 const unsigned char PICKUP = 3; // Unused
 const unsigned char DROP = 4; // Unused
@@ -49,9 +49,10 @@ typedef struct {
 typedef struct {
     Log log;
     unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell
-    int* actions;
+    float* actions;
     float* rewards;
-    unsigned char* terminals;
+    float* terminals;
+    int num_agents;
     int size; // default 19
     int tick;
     int agent_x, agent_y;
@@ -59,9 +60,15 @@ typedef struct {
     int goal_x, goal_y;
     unsigned char* grid; // Stores OBJECT_IDX values
     int see_through_walls;
+    unsigned int rng;
+    int texture_loaded;
     Texture2D puffers;
 } FourRooms;
 
+static inline int four_rooms_rand(FourRooms* env, int n) {
+    return rand_r(&env->rng) % n;
+}
+
 void add_log(FourRooms* env) {
     env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0;
     env->log.score += env->rewards[0];
@@ -114,27 +121,27 @@ void generate_observation(FourRooms* env) {
     // Generate 7x7x3 observation centered on agent's view direction
     int view_size = 7;
     int half_view = view_size / 2;
-    
+
     // Calculate the center of the view based on agent's direction
     int center_x = env->agent_x;
     int center_y = env->agent_y;
-    
+
     // Shift center forward in the direction the agent is facing
     if (env->agent_dir == 0) center_x += half_view; // East
     else if (env->agent_dir == 1) center_y += half_view; // South
     else if (env->agent_dir == 2) center_x -= half_view; // West
     else if (env->agent_dir == 3) center_y -= half_view; // North
-    
+
     for (int i = 0; i < view_size; i++) {
         for (int j = 0; j < view_size; j++) {
             int world_x = center_x - half_view + j;
             int world_y = center_y - half_view + i;
-            
+
             // Calculate flat index for this cell in the 7x7x3 observation
             int base_idx = (i * view_size + j) * 3;
-            
+
             unsigned char object_idx, color_idx, state;
-            
+
             // Check bounds, out of bounds is treated as wall
             if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) {
                 object_idx = WALL;
@@ -147,7 +154,7 @@ void generate_observation(FourRooms* env) {
             } else {
                 int grid_idx = world_y * env->size + world_x;
                 unsigned char grid_cell = env->grid[grid_idx];
-                
+
                 // Map grid cell to MiniGrid encoding
                 switch (grid_cell) {
                     case EMPTY:
@@ -177,7 +184,7 @@ void generate_observation(FourRooms* env) {
                         break;
                 }
             }
-            
+
             env->observations[base_idx] = object_idx;
             env->observations[base_idx + 1] = color_idx;
             env->observations[base_idx + 2] = state;
@@ -187,10 +194,10 @@ void generate_observation(FourRooms* env) {
 
 void create_four_rooms_grid(FourRooms* env) {
     int size = env->size;
-    
+
     // Clear grid
     memset(env->grid, EMPTY, size * size * sizeof(unsigned char));
-    
+
     // Create outer walls
     for (int i = 0; i < size; i++) {
         env->grid[0 * size + i] = WALL; // Top
@@ -198,80 +205,80 @@ void create_four_rooms_grid(FourRooms* env) {
         env->grid[i * size + 0] = WALL; // Left
         env->grid[i * size + (size-1)] = WALL; // Right
     }
-    
+
     int room_w = size / 2;
     int room_h = size / 2;
-    
+
     // Create vertical separating wall
     for (int y = 0; y < size; y++) {
         env->grid[y * size + room_w] = WALL;
     }
-    
+
     // Create horizontal separating wall
     for (int x = 0; x < size; x++) {
         env->grid[room_h * size + x] = WALL;
     }
-    
+
     // Create 4 gaps in the separating walls
     // Gap in vertical wall (top half)
-    int gap_y1 = 1 + rand() % (room_h - 2);
+    int gap_y1 = 1 + four_rooms_rand(env, room_h - 2);
     env->grid[gap_y1 * size + room_w] = EMPTY;
-    
+
     // Gap in vertical wall (bottom half)
-    int gap_y2 = room_h + 1 + rand() % (room_h - 2);
+    int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 2);
     env->grid[gap_y2 * size + room_w] = EMPTY;
-    
+
     // Gap in horizontal wall (left half)
-    int gap_x1 = 1 + rand() % (room_w - 2);
+    int gap_x1 = 1 + four_rooms_rand(env, room_w - 2);
     env->grid[room_h * size + gap_x1] = EMPTY;
-    
+
     // Gap in horizontal wall (right half)
-    int gap_x2 = room_w + 1 + rand() % (room_w - 2);
+    int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 2);
     env->grid[room_h * size + gap_x2] = EMPTY;
 }
 
 void c_reset(FourRooms* env) {
 
     create_four_rooms_grid(env);
-    
+
     // Place agent randomly in valid position
     do {
-        env->agent_x = 1 + rand() % (env->size - 2);
-        env->agent_y = 1 + rand() % (env->size - 2);
+        env->agent_x = 1 + four_rooms_rand(env, env->size - 2);
+        env->agent_y = 1 + four_rooms_rand(env, env->size - 2);
     } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY);
-    
+
     // Place goal randomly in valid position (different from agent)
     do {
-        env->goal_x = 1 + rand() % (env->size - 2);
-        env->goal_y = 1 + rand() % (env->size - 2);
+        env->goal_x = 1 + four_rooms_rand(env, env->size - 2);
+        env->goal_y = 1 + four_rooms_rand(env, env->size - 2);
     } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY ||
              (env->goal_x == env->agent_x && env->goal_y == env->agent_y));
-    
+
     // Set agent and goal on grid
     env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
     env->grid[env->goal_y * env->size + env->goal_x] = GOAL;
-    
+
     // Random initial direction
-    env->agent_dir = rand() % 4;
+    env->agent_dir = four_rooms_rand(env, 4);
     env->tick = 0;
-    
+
     generate_observation(env);
 }
 
 void c_step(FourRooms* env) {
     env->tick += 1;
-    
-    int action = env->actions[0];
+
+    int action = (int)env->actions[0];
     env->terminals[0] = 0;
     env->rewards[0] = 0.0;
-    
+
     // Clear agent from current position
     env->grid[env->agent_y * env->size + env->agent_x] = EMPTY;
-    
+
     int new_x = env->agent_x;
     int new_y = env->agent_y;
     int new_dir = env->agent_dir;
-    
+
     if (action == LEFT) {
         new_dir = (env->agent_dir + 3) % 4;
     } else if (action == RIGHT) {
@@ -289,9 +296,9 @@ void c_step(FourRooms* env) {
             env->agent_y = new_y;
         }
     }
-    
+
     env->agent_dir = new_dir;
-    
+
     // Check if agent reached goal
     if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) {
         env->terminals[0] = 1;
@@ -300,10 +307,10 @@ void c_step(FourRooms* env) {
         c_reset(env);
         return;
     }
-    
+
     // Place agent back on grid
     env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
-    
+
     // Check timeout
     if (env->tick >= 4 * env->size) {
         env->terminals[0] = 1;
@@ -312,7 +319,7 @@ void c_step(FourRooms* env) {
         c_reset(env);
         return;
     }
-    
+
     generate_observation(env);
 }
 
@@ -321,6 +328,7 @@ void c_render(FourRooms* env) {
         InitWindow(32*env->size, 32*env->size, "PufferLib FourRooms");
         SetTargetFPS(10);
         env->puffers = LoadTexture("resources/shared/puffers_128.png");
+        env->texture_loaded = 1;
     }
 
     if (IsKeyDown(KEY_ESCAPE)) {
@@ -331,13 +339,13 @@ void c_render(FourRooms* env) {
     ClearBackground(PUFF_BACKGROUND);
 
     int px = 32;
-    
+
     // Draw the main grid
     for (int y = 0; y < env->size; y++) {
         for (int x = 0; x < env->size; x++) {
             int cell = env->grid[y * env->size + x];
             Color color = PUFF_BACKGROUND;
-            
+
             if (cell == WALL) color = PUFF_BACKGROUND2;
             else if (cell == GOAL) color = PUFF_RED;
 
@@ -346,28 +354,28 @@ void c_render(FourRooms* env) {
             }
         }
     }
-    
+
     // Draw agent's 7x7 observation window
     int view_size = 7;
     int half_view = view_size / 2;
-    
+
     // Calculate the center of the view based on agent's direction
     int center_x = env->agent_x;
     int center_y = env->agent_y;
-    
+
     // Shift center forward in the direction the agent is facing
     if (env->agent_dir == 0) center_x += half_view; // East
     else if (env->agent_dir == 1) center_y += half_view; // South
     else if (env->agent_dir == 2) center_x -= half_view; // West
     else if (env->agent_dir == 3) center_y -= half_view; // North
-    
+
     // Draw semi-transparent overlay for observation window
     Color obs_overlay = (Color){180, 180, 180, 80};
     for (int i = 0; i < view_size; i++) {
         for (int j = 0; j < view_size; j++) {
             int world_x = center_x - half_view + j;
             int world_y = center_y - half_view + i;
-            
+
             // Only draw overlay for cells within grid bounds and visible to agent
             if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size &&
                 can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) {
@@ -375,7 +383,7 @@ void c_render(FourRooms* env) {
             }
         }
     }
-    
+
     // Draw agent
     int starting_sprite_x = 0;
     int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°)
@@ -383,7 +391,7 @@ void c_render(FourRooms* env) {
         starting_sprite_x = 128; // Use flipped sprite for 180° rotation
         rotation = 0;
     }
-    
+
     DrawTexturePro(
         env->puffers,
         (Rectangle){starting_sprite_x, 0, 128, 128},
@@ -402,9 +410,10 @@ void c_render(FourRooms* env) {
 }
 
 void c_close(FourRooms* env) {
-    if (IsWindowReady()) {
+    if (env->texture_loaded) {
         UnloadTexture(env->puffers);
         CloseWindow();
+        env->texture_loaded = 0;
     }
     if (env->grid) {
         free(env->grid);
diff --git a/ocean/four_rooms/four_rooms.py b/ocean/four_rooms/four_rooms.py
deleted file mode 100644
index b530fd8573..0000000000
--- a/ocean/four_rooms/four_rooms.py
+++ /dev/null
@@ -1,63 +0,0 @@
-
-import gymnasium
-import numpy as np
-
-import pufferlib
-from pufferlib.ocean.four_rooms import binding
-
-class FourRooms(pufferlib.PufferEnv):
-    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=19, buf=None, seed=0):
-        self.single_observation_space = gymnasium.spaces.Box(low=0, high=10,
-            shape=(7*7*3,), dtype=np.uint8)
-        self.single_action_space = gymnasium.spaces.Discrete(7)
-        self.render_mode = render_mode
-        self.num_agents = num_envs
-        self.log_interval = log_interval
-
-        super().__init__(buf)
-        self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards,
-            self.terminals, self.truncations, num_envs, seed, size=size)
- 
-    def reset(self, seed=0):
-        binding.vec_reset(self.c_envs, seed)
-        self.tick = 0
-        return self.observations, []
-
-    def step(self, actions):
-        self.tick += 1
-
-        self.actions[:] = actions
-        binding.vec_step(self.c_envs)
-
-        info = []
-        if self.tick % self.log_interval == 0:
-            info.append(binding.vec_log(self.c_envs))
-
-        return (self.observations, self.rewards,
-            self.terminals, self.truncations, info)
-
-    def render(self):
-        binding.vec_render(self.c_envs, 0)
-
-    def close(self):
-        binding.vec_close(self.c_envs)
-
-if __name__ == '__main__':
-    N = 4096
-
-    env = FourRooms(num_envs=N)
-    env.reset()
-    steps = 0
-
-    CACHE = 1024
-    actions = np.random.randint(0, 7, (CACHE, N))  # 7 actions: left, right, forward, pickup, drop, toggle, done
-
-    i = 0
-    import time
-    start = time.time()
-    while time.time() - start < 10:
-        env.step(actions[i % CACHE])
-        steps += N
-        i += 1
-
-    print('FourRooms SPS:', int(steps / (time.time() - start)))

From 3b77f2c7c7a247279a1c4e30be14c6f907b2d703 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Tue, 5 May 2026 18:30:47 -0400
Subject: [PATCH 3/8] Refine Four Rooms metrics and CPU stepping

---
 ocean/four_rooms/binding.c    | 19 +++++++++++++++++++
 ocean/four_rooms/four_rooms.h | 11 ++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c
index 98d52b67fc..4406434146 100644
--- a/ocean/four_rooms/binding.c
+++ b/ocean/four_rooms/binding.c
@@ -5,9 +5,28 @@
 #define ACT_SIZES {7}
 #define OBS_TENSOR_T ByteTensor
 
+#define MY_VEC_STEP four_rooms_vec_step
+#define MY_VEC_STEP_RANGE four_rooms_vec_step_range
 #define Env FourRooms
 #include "vecenv.h"
 
+void four_rooms_vec_step(StaticVec* vec) {
+    memset(vec->rewards, 0, vec->total_agents * sizeof(float));
+    memset(vec->terminals, 0, vec->total_agents * sizeof(float));
+    FourRooms* envs = (FourRooms*)vec->envs;
+    for (int i = 0; i < vec->size; i++) {
+        c_step(&envs[i]);
+    }
+}
+
+void four_rooms_vec_step_range(StaticVec* vec, int env_start, int env_count, int num_workers) {
+    (void)num_workers;
+    FourRooms* envs = (FourRooms*)vec->envs;
+    for (int i = env_start; i < env_start + env_count; i++) {
+        c_step(&envs[i]);
+    }
+}
+
 void my_init(Env* env, Dict* kwargs) {
     env->num_agents = 1;
     env->size = (int)dict_get(kwargs, "size")->value;
diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h
index 148d302b95..b10c5214e9 100644
--- a/ocean/four_rooms/four_rooms.h
+++ b/ocean/four_rooms/four_rooms.h
@@ -55,6 +55,7 @@ typedef struct {
     int num_agents;
     int size; // default 19
     int tick;
+    float episode_return;
     int agent_x, agent_y;
     int agent_dir; // 0=East, 1=South, 2=West, 3=North
     int goal_x, goal_y;
@@ -70,10 +71,10 @@ static inline int four_rooms_rand(FourRooms* env, int n) {
 }
 
 void add_log(FourRooms* env) {
-    env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0;
+    env->log.perf += (env->rewards[0] > 0) ? 1.0f : 0.0f;
     env->log.score += env->rewards[0];
     env->log.episode_length += env->tick;
-    env->log.episode_return += env->rewards[0];
+    env->log.episode_return += env->episode_return;
     env->log.n++;
 }
 
@@ -261,6 +262,7 @@ void c_reset(FourRooms* env) {
     // Random initial direction
     env->agent_dir = four_rooms_rand(env, 4);
     env->tick = 0;
+    env->episode_return = 0.0f;
 
     generate_observation(env);
 }
@@ -302,7 +304,8 @@ void c_step(FourRooms* env) {
     // Check if agent reached goal
     if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) {
         env->terminals[0] = 1;
-        env->rewards[0] = 1.0;
+        env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (4.0f * (float)env->size);
+        env->episode_return += env->rewards[0];
         add_log(env);
         c_reset(env);
         return;
@@ -315,11 +318,13 @@ void c_step(FourRooms* env) {
     if (env->tick >= 4 * env->size) {
         env->terminals[0] = 1;
         env->rewards[0] = 0.0;
+        env->episode_return += env->rewards[0];
         add_log(env);
         c_reset(env);
         return;
     }
 
+    env->episode_return += env->rewards[0];
     generate_observation(env);
 }
 

From e3c6a11a59dd653be74898d94d5feeb0e1a582dd Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Tue, 5 May 2026 18:31:19 -0400
Subject: [PATCH 4/8] Add Four Rooms sweep config

---
 config/four_rooms.ini | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index 816dbb7640..b72a8f8b34 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -22,3 +22,36 @@ learning_rate = 0.005
 minibatch_size = 32768
 horizon = 64
 ent_coef = 0.01
+
+[sweep]
+metric = score
+metric_distribution = linear
+goal = maximize
+max_runs = 40
+gpus = 1
+downsample = 5
+sweep_only = hidden_size,num_layers,total_timesteps,learning_rate
+
+[sweep.policy.hidden_size]
+distribution = uniform_pow2
+min = 128
+max = 512
+scale = auto
+
+[sweep.policy.num_layers]
+distribution = int_uniform
+min = 1
+max = 4
+scale = auto
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 30_000_000
+max = 300_000_000
+scale = time
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.0005
+max = 0.01
+scale = auto

From 6c17cb750d6075c06ce9617e03eb1ef7d7c5ef12 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Tue, 5 May 2026 21:03:44 -0400
Subject: [PATCH 5/8] Tune Four Rooms sweep ranges

---
 config/four_rooms.ini | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index b72a8f8b34..9f61a8cb02 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -27,28 +27,31 @@ ent_coef = 0.01
 metric = score
 metric_distribution = linear
 goal = maximize
-max_runs = 40
+max_runs = 100
 gpus = 1
 downsample = 5
 sweep_only = hidden_size,num_layers,total_timesteps,learning_rate
 
 [sweep.policy.hidden_size]
 distribution = uniform_pow2
-min = 128
-max = 512
+min = 64
+max = 1024
+mean = 256
 scale = auto
 
 [sweep.policy.num_layers]
 distribution = int_uniform
 min = 1
 max = 4
+mean = 2
 scale = auto
 
 [sweep.train.total_timesteps]
 distribution = log_normal
-min = 30_000_000
-max = 300_000_000
-scale = time
+min = 20_000_000
+max = 500_000_000
+mean = 100_000_000
+scale = auto
 
 [sweep.train.learning_rate]
 distribution = log_normal

From fe9ae73f45cde3fa98f1b88f656a038e059041e1 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Wed, 6 May 2026 21:54:48 -0400
Subject: [PATCH 6/8] Increase default Four Rooms num_layers and total_timesteps

---
 config/four_rooms.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index 9f61a8cb02..29d9a74803 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -11,11 +11,11 @@ size = 19
 
 [policy]
 hidden_size = 256
-num_layers = 2
+num_layers = 6
 expansion_factor = 1
 
 [train]
-total_timesteps = 100_000_000
+total_timesteps = 300_000_000
 gamma = 0.99
 gae_lambda = 0.95
 learning_rate = 0.005

From 607448260f916721174f5c3b256a0728adc20820 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Thu, 7 May 2026 16:43:24 -0400
Subject: [PATCH 7/8] Harden Four Rooms environment

---
 config/four_rooms.ini         |   2 +
 ocean/four_rooms/binding.c    |   8 +-
 ocean/four_rooms/four_rooms.c |   6 +-
 ocean/four_rooms/four_rooms.h | 315 ++++++++++++++++------------------
 4 files changed, 161 insertions(+), 170 deletions(-)

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index 29d9a74803..32dc91f09e 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -8,6 +8,8 @@ num_threads = 8
 
 [env]
 size = 19
+# 0 derives the timeout from the map size: 4 * size. Positive values override it.
+max_steps = 0
 
 [policy]
 hidden_size = 256
diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c
index 4406434146..3c9c95c1c7 100644
--- a/ocean/four_rooms/binding.c
+++ b/ocean/four_rooms/binding.c
@@ -1,8 +1,8 @@
 #include "four_rooms.h"
 
-#define OBS_SIZE (7 * 7 * 3)
+#define OBS_SIZE (FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS)
 #define NUM_ATNS 1
-#define ACT_SIZES {7}
+#define ACT_SIZES {FOUR_ROOMS_NUM_ACTIONS}
 #define OBS_TENSOR_T ByteTensor
 
 #define MY_VEC_STEP four_rooms_vec_step
@@ -30,6 +30,10 @@ void four_rooms_vec_step_range(StaticVec* vec, int env_start, int env_count, int
 void my_init(Env* env, Dict* kwargs) {
     env->num_agents = 1;
     env->size = (int)dict_get(kwargs, "size")->value;
+    env->max_steps = (int)dict_get(kwargs, "max_steps")->value;
+    if (env->max_steps <= 0) {
+        env->max_steps = 4 * env->size;
+    }
     env->see_through_walls = 0;
     env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char));
 }
diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c
index 7425e3229b..b73ad5989f 100644
--- a/ocean/four_rooms/four_rooms.c
+++ b/ocean/four_rooms/four_rooms.c
@@ -3,9 +3,13 @@
 int main() {
     FourRooms env = {};
     env.size = 19;
+    env.max_steps = 0;
     env.num_agents = 1;
     env.rng = 0;
-    env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding
+    env.observations = (unsigned char*)calloc(
+        FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS,
+        sizeof(unsigned char)
+    );
     env.actions = (float*)calloc(1, sizeof(float));
     env.rewards = (float*)calloc(1, sizeof(float));
     env.terminals = (float*)calloc(1, sizeof(float));
diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h
index b10c5214e9..98c9e0469e 100644
--- a/ocean/four_rooms/four_rooms.h
+++ b/ocean/four_rooms/four_rooms.h
@@ -2,41 +2,41 @@
 #include <string.h>
 #include "raylib.h"
 
+#define FOUR_ROOMS_VIEW_SIZE 7
+#define FOUR_ROOMS_OBS_CHANNELS 3
+#define FOUR_ROOMS_NUM_ACTIONS 7
+
 // Action space
-const unsigned char LEFT = 0;
-const unsigned char RIGHT = 1;
-const unsigned char FORWARD = 2;
-const unsigned char PICKUP = 3; // Unused
-const unsigned char DROP = 4; // Unused
-const unsigned char TOGGLE = 5; // Unused
-const unsigned char DONE = 6; // Unused
+enum {
+    LEFT = 0,
+    RIGHT = 1,
+    FORWARD = 2,
+    PICKUP = 3,
+    DROP = 4,
+    TOGGLE = 5,
+    DONE = 6,
+};
 
 // Observation: Objects
-const unsigned char UNSEEN = 0;
-const unsigned char EMPTY = 1;
-const unsigned char WALL = 2;
-const unsigned char FLOOR = 3; // Unused
-const unsigned char DOOR = 4; // Unused
-const unsigned char KEY = 5; // Unused
-const unsigned char BALL = 6; // Unused
-const unsigned char BOX = 7; // Unused
-const unsigned char GOAL = 8;
-const unsigned char LAVA = 9; // Unused
-const unsigned char AGENT = 10;
+enum {
+    UNSEEN = 0,
+    EMPTY = 1,
+    WALL = 2,
+    GOAL = 8,
+    AGENT = 10,
+};
 
 // Observation: Colors
-const unsigned char COLOR_BLACK = 0;
-const unsigned char COLOR_GREEN = 1;
-const unsigned char COLOR_BLUE = 2;
-const unsigned char COLOR_PURPLE = 3;
-const unsigned char COLOR_YELLOW = 4;
-const unsigned char COLOR_GREY = 5;
+enum {
+    COLOR_BLACK = 0,
+    COLOR_GREEN = 1,
+    COLOR_GREY = 5,
+};
 
 // PufferLib standard colors for rendering
-const Color PUFF_RED = (Color){187, 0, 0, 255};
-const Color PUFF_CYAN = (Color){0, 187, 187, 255};
-const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255};
-const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255};
+static const Color PUFF_RED = (Color){187, 0, 0, 255};
+static const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255};
+static const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255};
 
 typedef struct {
     float perf;
@@ -48,18 +48,19 @@ typedef struct {
 
 typedef struct {
     Log log;
-    unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell
+    unsigned char* observations;
     float* actions;
     float* rewards;
     float* terminals;
     int num_agents;
-    int size; // default 19
+    int size;
+    int max_steps;
     int tick;
     float episode_return;
     int agent_x, agent_y;
-    int agent_dir; // 0=East, 1=South, 2=West, 3=North
+    int agent_dir;
     int goal_x, goal_y;
-    unsigned char* grid; // Stores OBJECT_IDX values
+    unsigned char* grid;
     int see_through_walls;
     unsigned int rng;
     int texture_loaded;
@@ -70,6 +71,10 @@ static inline int four_rooms_rand(FourRooms* env, int n) {
     return rand_r(&env->rng) % n;
 }
 
+static inline int four_rooms_grid_idx(FourRooms* env, int x, int y) {
+    return y * env->size + x;
+}
+
 void add_log(FourRooms* env) {
     env->log.perf += (env->rewards[0] > 0) ? 1.0f : 0.0f;
     env->log.score += env->rewards[0];
@@ -78,117 +83,108 @@ void add_log(FourRooms* env) {
     env->log.n++;
 }
 
-int can_see_cell(FourRooms* env, int agent_x, int agent_y, int target_x, int target_y) {
-    if (env->see_through_walls) {
-        return 1;
+void encode_cell(unsigned char object, unsigned char* object_idx, unsigned char* color_idx, unsigned char* state) {
+    *state = 0;
+    if (object == WALL) {
+        *object_idx = WALL;
+        *color_idx = COLOR_GREY;
+    } else if (object == GOAL) {
+        *object_idx = GOAL;
+        *color_idx = COLOR_GREEN;
+    } else {
+        *object_idx = EMPTY;
+        *color_idx = COLOR_BLACK;
     }
+}
 
-    // Use Bresenham's line algorithm to check line of sight
-    int dx = abs(target_x - agent_x);
-    int dy = abs(target_y - agent_y);
-    int x = agent_x;
-    int y = agent_y;
-    int x_inc = (target_x > agent_x) ? 1 : -1;
-    int y_inc = (target_y > agent_y) ? 1 : -1;
-    int error = dx - dy;
-
-    while (x != target_x || y != target_y) {
-        // If we've reached the target cell, stop (target cell should always be visible)
-        if (x == target_x && y == target_y) {
-            break;
-        }
+void observation_to_world(FourRooms* env, int obs_x, int obs_y, int* world_x, int* world_y) {
+    int forward_x = 0;
+    int forward_y = 0;
+    if (env->agent_dir == 0) forward_x = 1;
+    else if (env->agent_dir == 1) forward_y = 1;
+    else if (env->agent_dir == 2) forward_x = -1;
+    else forward_y = -1;
+
+    int right_x = -forward_y;
+    int right_y = forward_x;
+    int right_offset = obs_x - FOUR_ROOMS_VIEW_SIZE / 2;
+    int forward_offset = FOUR_ROOMS_VIEW_SIZE - 1 - obs_y;
+
+    *world_x = env->agent_x + forward_x * forward_offset + right_x * right_offset;
+    *world_y = env->agent_y + forward_y * forward_offset + right_y * right_offset;
+}
 
-        int error2 = 2 * error;
-        if (error2 > -dy) {
-            error -= dy;
-            x += x_inc;
-        }
-        if (error2 < dx) {
-            error += dx;
-            y += y_inc;
+void compute_visibility(unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE],
+        unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE]) {
+    memset(visible, 0, FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * sizeof(unsigned char));
+    visible[FOUR_ROOMS_VIEW_SIZE - 1][FOUR_ROOMS_VIEW_SIZE / 2] = 1;
+
+    // MiniGrid propagates visibility from the agent at bottom-center after rotating the view.
+    for (int y = FOUR_ROOMS_VIEW_SIZE - 1; y >= 0; y--) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE - 1; x++) {
+            if (!visible[y][x] || view[y][x] == WALL) {
+                continue;
+            }
+            visible[y][x + 1] = 1;
+            if (y > 0) {
+                visible[y - 1][x] = 1;
+                visible[y - 1][x + 1] = 1;
+            }
         }
 
-        // If the next cell (not the target) is a wall, block vision beyond but allow seeing the wall itself
-        if ((x != target_x || y != target_y) &&
-            x >= 0 && x < env->size && y >= 0 && y < env->size &&
-            env->grid[y * env->size + x] == WALL) {
-            return 0; // Wall blocks the view beyond, but wall itself is visible
+        for (int x = FOUR_ROOMS_VIEW_SIZE - 1; x > 0; x--) {
+            if (!visible[y][x] || view[y][x] == WALL) {
+                continue;
+            }
+            visible[y][x - 1] = 1;
+            if (y > 0) {
+                visible[y - 1][x] = 1;
+                visible[y - 1][x - 1] = 1;
+            }
         }
     }
-    return 1; // Target cell is visible
 }
 
 void generate_observation(FourRooms* env) {
-    // Generate 7x7x3 observation centered on agent's view direction
-    int view_size = 7;
-    int half_view = view_size / 2;
-
-    // Calculate the center of the view based on agent's direction
-    int center_x = env->agent_x;
-    int center_y = env->agent_y;
-
-    // Shift center forward in the direction the agent is facing
-    if (env->agent_dir == 0) center_x += half_view; // East
-    else if (env->agent_dir == 1) center_y += half_view; // South
-    else if (env->agent_dir == 2) center_x -= half_view; // West
-    else if (env->agent_dir == 3) center_y -= half_view; // North
+    unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE];
+    unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE];
 
-    for (int i = 0; i < view_size; i++) {
-        for (int j = 0; j < view_size; j++) {
-            int world_x = center_x - half_view + j;
-            int world_y = center_y - half_view + i;
-
-            // Calculate flat index for this cell in the 7x7x3 observation
-            int base_idx = (i * view_size + j) * 3;
-
-            unsigned char object_idx, color_idx, state;
-
-            // Check bounds, out of bounds is treated as wall
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int world_x, world_y;
+            observation_to_world(env, x, y, &world_x, &world_y);
             if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) {
-                object_idx = WALL;
-                color_idx = COLOR_GREY;
-                state = 0;
-            } else if (!can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) {
-                object_idx = UNSEEN; // Cell is blocked by walls
-                color_idx = COLOR_BLACK;
-                state = 0;
+                view[y][x] = WALL;
+            } else if (world_x == env->agent_x && world_y == env->agent_y) {
+                view[y][x] = EMPTY;
             } else {
-                int grid_idx = world_y * env->size + world_x;
-                unsigned char grid_cell = env->grid[grid_idx];
-
-                // Map grid cell to MiniGrid encoding
-                switch (grid_cell) {
-                    case EMPTY:
-                        object_idx = EMPTY;
-                        color_idx = COLOR_BLACK;
-                        state = 0;
-                        break;
-                    case WALL:
-                        object_idx = WALL;
-                        color_idx = COLOR_GREY;
-                        state = 0;
-                        break;
-                    case AGENT:
-                        object_idx = AGENT;
-                        color_idx = COLOR_BLUE;
-                        state = 0;
-                        break;
-                    case GOAL:
-                        object_idx = GOAL;
-                        color_idx = COLOR_GREEN;
-                        state = 0;
-                        break;
-                    default:
-                        object_idx = EMPTY;
-                        color_idx = 0;
-                        state = 0;
-                        break;
-                }
+                view[y][x] = env->grid[four_rooms_grid_idx(env, world_x, world_y)];
+            }
+        }
+    }
+
+    if (env->see_through_walls) {
+        memset(visible, 1, FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * sizeof(unsigned char));
+    } else {
+        compute_visibility(view, visible);
+    }
+
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int base_idx = (y * FOUR_ROOMS_VIEW_SIZE + x) * FOUR_ROOMS_OBS_CHANNELS;
+            if (!visible[y][x]) {
+                env->observations[base_idx] = UNSEEN;
+                env->observations[base_idx + 1] = COLOR_BLACK;
+                env->observations[base_idx + 2] = 0;
+                continue;
             }
 
-            env->observations[base_idx] = object_idx;
-            env->observations[base_idx + 1] = color_idx;
-            env->observations[base_idx + 2] = state;
+            encode_cell(
+                view[y][x],
+                &env->observations[base_idx],
+                &env->observations[base_idx + 1],
+                &env->observations[base_idx + 2]
+            );
         }
     }
 }
@@ -222,23 +218,26 @@ void create_four_rooms_grid(FourRooms* env) {
 
     // Create 4 gaps in the separating walls
     // Gap in vertical wall (top half)
-    int gap_y1 = 1 + four_rooms_rand(env, room_h - 2);
+    int gap_y1 = 1 + four_rooms_rand(env, room_h - 1);
     env->grid[gap_y1 * size + room_w] = EMPTY;
 
     // Gap in vertical wall (bottom half)
-    int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 2);
+    int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 1);
     env->grid[gap_y2 * size + room_w] = EMPTY;
 
     // Gap in horizontal wall (left half)
-    int gap_x1 = 1 + four_rooms_rand(env, room_w - 2);
+    int gap_x1 = 1 + four_rooms_rand(env, room_w - 1);
     env->grid[room_h * size + gap_x1] = EMPTY;
 
     // Gap in horizontal wall (right half)
-    int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 2);
+    int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 1);
     env->grid[room_h * size + gap_x2] = EMPTY;
 }
 
 void c_reset(FourRooms* env) {
+    if (env->max_steps <= 0) {
+        env->max_steps = 4 * env->size;
+    }
 
     create_four_rooms_grid(env);
 
@@ -246,18 +245,18 @@ void c_reset(FourRooms* env) {
     do {
         env->agent_x = 1 + four_rooms_rand(env, env->size - 2);
         env->agent_y = 1 + four_rooms_rand(env, env->size - 2);
-    } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY);
+    } while (env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] != EMPTY);
 
     // Place goal randomly in valid position (different from agent)
     do {
         env->goal_x = 1 + four_rooms_rand(env, env->size - 2);
         env->goal_y = 1 + four_rooms_rand(env, env->size - 2);
-    } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY ||
+    } while (env->grid[four_rooms_grid_idx(env, env->goal_x, env->goal_y)] != EMPTY ||
              (env->goal_x == env->agent_x && env->goal_y == env->agent_y));
 
     // Set agent and goal on grid
-    env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
-    env->grid[env->goal_y * env->size + env->goal_x] = GOAL;
+    env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] = AGENT;
+    env->grid[four_rooms_grid_idx(env, env->goal_x, env->goal_y)] = GOAL;
 
     // Random initial direction
     env->agent_dir = four_rooms_rand(env, 4);
@@ -275,7 +274,7 @@ void c_step(FourRooms* env) {
     env->rewards[0] = 0.0;
 
     // Clear agent from current position
-    env->grid[env->agent_y * env->size + env->agent_x] = EMPTY;
+    env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] = EMPTY;
 
     int new_x = env->agent_x;
     int new_y = env->agent_y;
@@ -293,7 +292,7 @@ void c_step(FourRooms* env) {
 
         // Check if move is valid
         if (new_x >= 0 && new_x < env->size && new_y >= 0 && new_y < env->size &&
-            env->grid[new_y * env->size + new_x] != WALL) {
+            env->grid[four_rooms_grid_idx(env, new_x, new_y)] != WALL) {
             env->agent_x = new_x;
             env->agent_y = new_y;
         }
@@ -304,7 +303,7 @@ void c_step(FourRooms* env) {
     // Check if agent reached goal
     if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) {
         env->terminals[0] = 1;
-        env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (4.0f * (float)env->size);
+        env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (float)env->max_steps;
         env->episode_return += env->rewards[0];
         add_log(env);
         c_reset(env);
@@ -312,10 +311,10 @@ void c_step(FourRooms* env) {
     }
 
     // Place agent back on grid
-    env->grid[env->agent_y * env->size + env->agent_x] = AGENT;
+    env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] = AGENT;
 
     // Check timeout
-    if (env->tick >= 4 * env->size) {
+    if (env->tick >= env->max_steps) {
         env->terminals[0] = 1;
         env->rewards[0] = 0.0;
         env->episode_return += env->rewards[0];
@@ -360,40 +359,21 @@ void c_render(FourRooms* env) {
         }
     }
 
-    // Draw agent's 7x7 observation window
-    int view_size = 7;
-    int half_view = view_size / 2;
-
-    // Calculate the center of the view based on agent's direction
-    int center_x = env->agent_x;
-    int center_y = env->agent_y;
-
-    // Shift center forward in the direction the agent is facing
-    if (env->agent_dir == 0) center_x += half_view; // East
-    else if (env->agent_dir == 1) center_y += half_view; // South
-    else if (env->agent_dir == 2) center_x -= half_view; // West
-    else if (env->agent_dir == 3) center_y -= half_view; // North
-
-    // Draw semi-transparent overlay for observation window
     Color obs_overlay = (Color){180, 180, 180, 80};
-    for (int i = 0; i < view_size; i++) {
-        for (int j = 0; j < view_size; j++) {
-            int world_x = center_x - half_view + j;
-            int world_y = center_y - half_view + i;
-
-            // Only draw overlay for cells within grid bounds and visible to agent
-            if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size &&
-                can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) {
+    for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) {
+        for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) {
+            int world_x, world_y;
+            observation_to_world(env, x, y, &world_x, &world_y);
+            if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size) {
                 DrawRectangle(world_x*px, world_y*px, px, px, obs_overlay);
             }
         }
     }
 
-    // Draw agent
     int starting_sprite_x = 0;
-    int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°)
+    int rotation = 90 * env->agent_dir;
     if (rotation == 180) {
-        starting_sprite_x = 128; // Use flipped sprite for 180° rotation
+        starting_sprite_x = 128;
         rotation = 0;
     }
 
@@ -422,5 +402,6 @@ void c_close(FourRooms* env) {
     }
     if (env->grid) {
         free(env->grid);
+        env->grid = NULL;
     }
 }

From de05e0b472c02c8f340f3a363016e0e4f225f2c3 Mon Sep 17 00:00:00 2001
From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com>
Date: Thu, 7 May 2026 20:41:20 -0400
Subject: [PATCH 8/8] Lighten default Four Rooms config

---
 config/four_rooms.ini | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/four_rooms.ini b/config/four_rooms.ini
index 32dc91f09e..0a931a2ff7 100644
--- a/config/four_rooms.ini
+++ b/config/four_rooms.ini
@@ -8,16 +8,16 @@ num_threads = 8
 
 [env]
 size = 19
-# 0 derives the timeout from the map size: 4 * size. Positive values override it.
+# if 0, max_steps = 4 * size. Positive values override it.
 max_steps = 0
 
 [policy]
-hidden_size = 256
-num_layers = 6
+hidden_size = 128
+num_layers = 2
 expansion_factor = 1
 
 [train]
-total_timesteps = 300_000_000
+total_timesteps = 100_000_000
 gamma = 0.99
 gae_lambda = 0.95
 learning_rate = 0.005