From 2a4d55d3a3bde58f1b95e8975a125a2fe0cb4488 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Sun, 3 May 2026 21:53:53 -0400 Subject: [PATCH 1/8] Port Four Rooms from old fork Copy the Four Rooms native environment sources and config from the old pufferlib-v3 fork into the v4 layout. This is an intentionally unadapted checkpoint; follow-up commits will update it for the current v4 APIs and build system. --- config/four_rooms.ini | 26 +++ ocean/four_rooms/binding.c | 20 ++ ocean/four_rooms/four_rooms.c | 34 +++ ocean/four_rooms/four_rooms.h | 412 +++++++++++++++++++++++++++++++++ ocean/four_rooms/four_rooms.py | 63 +++++ 5 files changed, 555 insertions(+) create mode 100644 config/four_rooms.ini create mode 100644 ocean/four_rooms/binding.c create mode 100644 ocean/four_rooms/four_rooms.c create mode 100644 ocean/four_rooms/four_rooms.h create mode 100644 ocean/four_rooms/four_rooms.py diff --git a/config/four_rooms.ini b/config/four_rooms.ini new file mode 100644 index 0000000000..115b9a6b09 --- /dev/null +++ b/config/four_rooms.ini @@ -0,0 +1,26 @@ +[base] +package = ocean +env_name = puffer_four_rooms +policy_name = Policy +rnn_name = Recurrent +#policy_name = FourRooms +#rnn_name = FourRoomsLSTM + +[vec] +num_workers = 12 +num_envs = 12 + +[env] +size = 27 +num_envs = 256 + +[train] +device = mps +total_timesteps = 50_000_000 +gamma = 0.99 +learning_rate = 0.01 +minibatch_size = 32768 +clip_coef = 0.1 +ent_coef = 0.01 +vf_coef = 0.5 +max_grad_norm = 0.5 diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c new file mode 100644 index 0000000000..e4f68a7eba --- /dev/null +++ b/ocean/four_rooms/binding.c @@ -0,0 +1,20 @@ +#include "four_rooms.h" + +#define Env FourRooms +#include "../env_binding.h" + +static int my_init(Env* env, PyObject* args, PyObject* kwargs) { + env->size = unpack(kwargs, "size"); + env->see_through_walls = 0; + // Allocate grid memory for full state (stores OBJECT_IDX values) + 
env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char)); + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "perf", log->perf); + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + return 0; +} diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c new file mode 100644 index 0000000000..5681084950 --- /dev/null +++ b/ocean/four_rooms/four_rooms.c @@ -0,0 +1,34 @@ +#include "four_rooms.h" + +int main() { + FourRooms env = {}; + env.size = 19; + env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding + env.actions = (int*)calloc(1, sizeof(int)); + env.rewards = (float*)calloc(1, sizeof(float)); + env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char)); + env.grid = (unsigned char*)calloc(env.size * env.size, sizeof(unsigned char)); + + c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + if (IsKeyDown(KEY_LEFT_SHIFT)) { + env.actions[0] = 7; // Invalid action = no-op + if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)) env.actions[0] = FORWARD; + if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) env.actions[0] = LEFT; + if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) env.actions[0] = RIGHT; + } else { + env.actions[0] = rand() % 3; // Only use left, right, forward + } + c_step(&env); + c_render(&env); + } + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + free(env.grid); + c_close(&env); + return 0; +} + diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h new file mode 100644 index 0000000000..9ef334f0c1 --- /dev/null +++ b/ocean/four_rooms/four_rooms.h @@ -0,0 +1,412 @@ +#include +#include +#include "raylib.h" + +// Action space +const unsigned char LEFT = 0; +const unsigned char RIGHT = 1; +const unsigned char FORWARD = 2; +const unsigned char 
PICKUP = 3; // Unused +const unsigned char DROP = 4; // Unused +const unsigned char TOGGLE = 5; // Unused +const unsigned char DONE = 6; // Unused + +// Observation: Objects +const unsigned char UNSEEN = 0; +const unsigned char EMPTY = 1; +const unsigned char WALL = 2; +const unsigned char FLOOR = 3; // Unused +const unsigned char DOOR = 4; // Unused +const unsigned char KEY = 5; // Unused +const unsigned char BALL = 6; // Unused +const unsigned char BOX = 7; // Unused +const unsigned char GOAL = 8; +const unsigned char LAVA = 9; // Unused +const unsigned char AGENT = 10; + +// Observation: Colors +const unsigned char COLOR_BLACK = 0; +const unsigned char COLOR_GREEN = 1; +const unsigned char COLOR_BLUE = 2; +const unsigned char COLOR_PURPLE = 3; +const unsigned char COLOR_YELLOW = 4; +const unsigned char COLOR_GREY = 5; + +// PufferLib standard colors for rendering +const Color PUFF_RED = (Color){187, 0, 0, 255}; +const Color PUFF_CYAN = (Color){0, 187, 187, 255}; +const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; +const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255}; + +typedef struct { + float perf; + float score; + float episode_return; + float episode_length; + float n; +} Log; + +typedef struct { + Log log; + unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell + int* actions; + float* rewards; + unsigned char* terminals; + int size; // default 19 + int tick; + int agent_x, agent_y; + int agent_dir; // 0=East, 1=South, 2=West, 3=North + int goal_x, goal_y; + unsigned char* grid; // Stores OBJECT_IDX values + int see_through_walls; + Texture2D puffers; +} FourRooms; + +void add_log(FourRooms* env) { + env->log.perf += (env->rewards[0] > 0) ? 
1.0 : 0.0; + env->log.score += env->rewards[0]; + env->log.episode_length += env->tick; + env->log.episode_return += env->rewards[0]; + env->log.n++; +} + +int can_see_cell(FourRooms* env, int agent_x, int agent_y, int target_x, int target_y) { + if (env->see_through_walls) { + return 1; + } + + // Use Bresenham's line algorithm to check line of sight + int dx = abs(target_x - agent_x); + int dy = abs(target_y - agent_y); + int x = agent_x; + int y = agent_y; + int x_inc = (target_x > agent_x) ? 1 : -1; + int y_inc = (target_y > agent_y) ? 1 : -1; + int error = dx - dy; + + while (x != target_x || y != target_y) { + // If we've reached the target cell, stop (target cell should always be visible) + if (x == target_x && y == target_y) { + break; + } + + int error2 = 2 * error; + if (error2 > -dy) { + error -= dy; + x += x_inc; + } + if (error2 < dx) { + error += dx; + y += y_inc; + } + + // If the next cell (not the target) is a wall, block vision beyond but allow seeing the wall itself + if ((x != target_x || y != target_y) && + x >= 0 && x < env->size && y >= 0 && y < env->size && + env->grid[y * env->size + x] == WALL) { + return 0; // Wall blocks the view beyond, but wall itself is visible + } + } + return 1; // Target cell is visible +} + +void generate_observation(FourRooms* env) { + // Generate 7x7x3 observation centered on agent's view direction + int view_size = 7; + int half_view = view_size / 2; + + // Calculate the center of the view based on agent's direction + int center_x = env->agent_x; + int center_y = env->agent_y; + + // Shift center forward in the direction the agent is facing + if (env->agent_dir == 0) center_x += half_view; // East + else if (env->agent_dir == 1) center_y += half_view; // South + else if (env->agent_dir == 2) center_x -= half_view; // West + else if (env->agent_dir == 3) center_y -= half_view; // North + + for (int i = 0; i < view_size; i++) { + for (int j = 0; j < view_size; j++) { + int world_x = center_x - half_view + j; + 
int world_y = center_y - half_view + i; + + // Calculate flat index for this cell in the 7x7x3 observation + int base_idx = (i * view_size + j) * 3; + + unsigned char object_idx, color_idx, state; + + // Check bounds, out of bounds is treated as wall + if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) { + object_idx = WALL; + color_idx = COLOR_GREY; + state = 0; + } else if (!can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) { + object_idx = UNSEEN; // Cell is blocked by walls + color_idx = COLOR_BLACK; + state = 0; + } else { + int grid_idx = world_y * env->size + world_x; + unsigned char grid_cell = env->grid[grid_idx]; + + // Map grid cell to MiniGrid encoding + switch (grid_cell) { + case EMPTY: + object_idx = EMPTY; + color_idx = COLOR_BLACK; + state = 0; + break; + case WALL: + object_idx = WALL; + color_idx = COLOR_GREY; + state = 0; + break; + case AGENT: + object_idx = AGENT; + color_idx = COLOR_BLUE; + state = 0; + break; + case GOAL: + object_idx = GOAL; + color_idx = COLOR_GREEN; + state = 0; + break; + default: + object_idx = EMPTY; + color_idx = 0; + state = 0; + break; + } + } + + env->observations[base_idx] = object_idx; + env->observations[base_idx + 1] = color_idx; + env->observations[base_idx + 2] = state; + } + } +} + +void create_four_rooms_grid(FourRooms* env) { + int size = env->size; + + // Clear grid + memset(env->grid, EMPTY, size * size * sizeof(unsigned char)); + + // Create outer walls + for (int i = 0; i < size; i++) { + env->grid[0 * size + i] = WALL; // Top + env->grid[(size-1) * size + i] = WALL; // Bottom + env->grid[i * size + 0] = WALL; // Left + env->grid[i * size + (size-1)] = WALL; // Right + } + + int room_w = size / 2; + int room_h = size / 2; + + // Create vertical separating wall + for (int y = 0; y < size; y++) { + env->grid[y * size + room_w] = WALL; + } + + // Create horizontal separating wall + for (int x = 0; x < size; x++) { + env->grid[room_h * size + x] = WALL; + } + + 
// Create 4 gaps in the separating walls + // Gap in vertical wall (top half) + int gap_y1 = 1 + rand() % (room_h - 2); + env->grid[gap_y1 * size + room_w] = EMPTY; + + // Gap in vertical wall (bottom half) + int gap_y2 = room_h + 1 + rand() % (room_h - 2); + env->grid[gap_y2 * size + room_w] = EMPTY; + + // Gap in horizontal wall (left half) + int gap_x1 = 1 + rand() % (room_w - 2); + env->grid[room_h * size + gap_x1] = EMPTY; + + // Gap in horizontal wall (right half) + int gap_x2 = room_w + 1 + rand() % (room_w - 2); + env->grid[room_h * size + gap_x2] = EMPTY; +} + +void c_reset(FourRooms* env) { + + create_four_rooms_grid(env); + + // Place agent randomly in valid position + do { + env->agent_x = 1 + rand() % (env->size - 2); + env->agent_y = 1 + rand() % (env->size - 2); + } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY); + + // Place goal randomly in valid position (different from agent) + do { + env->goal_x = 1 + rand() % (env->size - 2); + env->goal_y = 1 + rand() % (env->size - 2); + } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY || + (env->goal_x == env->agent_x && env->goal_y == env->agent_y)); + + // Set agent and goal on grid + env->grid[env->agent_y * env->size + env->agent_x] = AGENT; + env->grid[env->goal_y * env->size + env->goal_x] = GOAL; + + // Random initial direction + env->agent_dir = rand() % 4; + env->tick = 0; + + generate_observation(env); +} + +void c_step(FourRooms* env) { + env->tick += 1; + + int action = env->actions[0]; + env->terminals[0] = 0; + env->rewards[0] = 0.0; + + // Clear agent from current position + env->grid[env->agent_y * env->size + env->agent_x] = EMPTY; + + int new_x = env->agent_x; + int new_y = env->agent_y; + int new_dir = env->agent_dir; + + if (action == LEFT) { + new_dir = (env->agent_dir + 3) % 4; + } else if (action == RIGHT) { + new_dir = (env->agent_dir + 1) % 4; + } else if (action == FORWARD) { + if (env->agent_dir == 0) new_x += 1; + else if (env->agent_dir == 
1) new_y += 1; + else if (env->agent_dir == 2) new_x -= 1; + else if (env->agent_dir == 3) new_y -= 1; + + // Check if move is valid + if (new_x >= 0 && new_x < env->size && new_y >= 0 && new_y < env->size && + env->grid[new_y * env->size + new_x] != WALL) { + env->agent_x = new_x; + env->agent_y = new_y; + } + } + + env->agent_dir = new_dir; + + // Check if agent reached goal + if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) { + env->terminals[0] = 1; + env->rewards[0] = 1.0; + add_log(env); + c_reset(env); + return; + } + + // Place agent back on grid + env->grid[env->agent_y * env->size + env->agent_x] = AGENT; + + // Check timeout + if (env->tick >= 4 * env->size) { + env->terminals[0] = 1; + env->rewards[0] = 0.0; + add_log(env); + c_reset(env); + return; + } + + generate_observation(env); +} + +void c_render(FourRooms* env) { + if (!IsWindowReady()) { + InitWindow(32*env->size, 32*env->size, "PufferLib FourRooms"); + SetTargetFPS(10); + env->puffers = LoadTexture("resources/shared/puffers_128.png"); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + BeginDrawing(); + ClearBackground(PUFF_BACKGROUND); + + int px = 32; + + // Draw the main grid + for (int y = 0; y < env->size; y++) { + for (int x = 0; x < env->size; x++) { + int cell = env->grid[y * env->size + x]; + Color color = PUFF_BACKGROUND; + + if (cell == WALL) color = PUFF_BACKGROUND2; + else if (cell == GOAL) color = PUFF_RED; + + if (cell != EMPTY && cell != AGENT) { + DrawRectangle(x*px, y*px, px, px, color); + } + } + } + + // Draw agent's 7x7 observation window + int view_size = 7; + int half_view = view_size / 2; + + // Calculate the center of the view based on agent's direction + int center_x = env->agent_x; + int center_y = env->agent_y; + + // Shift center forward in the direction the agent is facing + if (env->agent_dir == 0) center_x += half_view; // East + else if (env->agent_dir == 1) center_y += half_view; // South + else if (env->agent_dir == 2) center_x -= 
half_view; // West + else if (env->agent_dir == 3) center_y -= half_view; // North + + // Draw semi-transparent overlay for observation window + Color obs_overlay = (Color){180, 180, 180, 80}; + for (int i = 0; i < view_size; i++) { + for (int j = 0; j < view_size; j++) { + int world_x = center_x - half_view + j; + int world_y = center_y - half_view + i; + + // Only draw overlay for cells within grid bounds and visible to agent + if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size && + can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) { + DrawRectangle(world_x*px, world_y*px, px, px, obs_overlay); + } + } + } + + // Draw agent + int starting_sprite_x = 0; + int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°) + if (rotation == 180) { + starting_sprite_x = 128; // Use flipped sprite for 180° rotation + rotation = 0; + } + + DrawTexturePro( + env->puffers, + (Rectangle){starting_sprite_x, 0, 128, 128}, + (Rectangle){ + env->agent_x * px + px/2, + env->agent_y * px + px/2, + px, + px + }, + (Vector2){px/2, px/2}, + rotation, + WHITE + ); + + EndDrawing(); +} + +void c_close(FourRooms* env) { + if (IsWindowReady()) { + UnloadTexture(env->puffers); + CloseWindow(); + } + if (env->grid) { + free(env->grid); + } +} diff --git a/ocean/four_rooms/four_rooms.py b/ocean/four_rooms/four_rooms.py new file mode 100644 index 0000000000..b530fd8573 --- /dev/null +++ b/ocean/four_rooms/four_rooms.py @@ -0,0 +1,63 @@ + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.four_rooms import binding + +class FourRooms(pufferlib.PufferEnv): + def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=19, buf=None, seed=0): + self.single_observation_space = gymnasium.spaces.Box(low=0, high=10, + shape=(7*7*3,), dtype=np.uint8) + self.single_action_space = gymnasium.spaces.Discrete(7) + self.render_mode = render_mode + self.num_agents = num_envs + self.log_interval = 
log_interval + + super().__init__(buf) + self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, + self.terminals, self.truncations, num_envs, seed, size=size) + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + self.tick = 0 + return self.observations, [] + + def step(self, actions): + self.tick += 1 + + self.actions[:] = actions + binding.vec_step(self.c_envs) + + info = [] + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) + + return (self.observations, self.rewards, + self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + +if __name__ == '__main__': + N = 4096 + + env = FourRooms(num_envs=N) + env.reset() + steps = 0 + + CACHE = 1024 + actions = np.random.randint(0, 7, (CACHE, N)) # 7 actions: left, right, forward, pickup, drop, toggle, done + + i = 0 + import time + start = time.time() + while time.time() - start < 10: + env.step(actions[i % CACHE]) + steps += N + i += 1 + + print('FourRooms SPS:', int(steps / (time.time() - start))) From 0fe7aacef7fade6c28a2f0418af6d47bc7340bfa Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Sun, 3 May 2026 22:04:54 -0400 Subject: [PATCH 2/8] Adapt Four Rooms to Ocean v4 Update Four Rooms for the v4 static vecenv interface: binding metadata, float action/terminal buffers, per-env RNG, and v4 config keys. Remove the old v3 Python wrapper because v4 integrates native Ocean environments through build.sh and config files. 
--- config/four_rooms.ini | 30 ++++----- ocean/four_rooms/binding.c | 25 +++---- ocean/four_rooms/four_rooms.c | 10 +-- ocean/four_rooms/four_rooms.h | 117 ++++++++++++++++++--------------- ocean/four_rooms/four_rooms.py | 63 ------------------ 5 files changed, 96 insertions(+), 149 deletions(-) delete mode 100644 ocean/four_rooms/four_rooms.py diff --git a/config/four_rooms.ini b/config/four_rooms.ini index 115b9a6b09..816dbb7640 100644 --- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -1,26 +1,24 @@ [base] -package = ocean -env_name = puffer_four_rooms -policy_name = Policy -rnn_name = Recurrent -#policy_name = FourRooms -#rnn_name = FourRoomsLSTM +env_name = four_rooms [vec] -num_workers = 12 -num_envs = 12 +total_agents = 4096 +num_buffers = 2 +num_threads = 8 [env] -size = 27 -num_envs = 256 +size = 19 + +[policy] +hidden_size = 256 +num_layers = 2 +expansion_factor = 1 [train] -device = mps -total_timesteps = 50_000_000 +total_timesteps = 100_000_000 gamma = 0.99 -learning_rate = 0.01 +gae_lambda = 0.95 +learning_rate = 0.005 minibatch_size = 32768 -clip_coef = 0.1 +horizon = 64 ent_coef = 0.01 -vf_coef = 0.5 -max_grad_norm = 0.5 diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c index e4f68a7eba..98d52b67fc 100644 --- a/ocean/four_rooms/binding.c +++ b/ocean/four_rooms/binding.c @@ -1,20 +1,23 @@ #include "four_rooms.h" +#define OBS_SIZE (7 * 7 * 3) +#define NUM_ATNS 1 +#define ACT_SIZES {7} +#define OBS_TENSOR_T ByteTensor + #define Env FourRooms -#include "../env_binding.h" +#include "vecenv.h" -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->size = unpack(kwargs, "size"); +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->size = (int)dict_get(kwargs, "size")->value; env->see_through_walls = 0; - // Allocate grid memory for full state (stores OBJECT_IDX values) env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char)); - return 0; } -static int my_log(PyObject* 
dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); } diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c index 5681084950..7425e3229b 100644 --- a/ocean/four_rooms/four_rooms.c +++ b/ocean/four_rooms/four_rooms.c @@ -3,10 +3,12 @@ int main() { FourRooms env = {}; env.size = 19; + env.num_agents = 1; + env.rng = 0; env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding - env.actions = (int*)calloc(1, sizeof(int)); + env.actions = (float*)calloc(1, sizeof(float)); env.rewards = (float*)calloc(1, sizeof(float)); - env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char)); + env.terminals = (float*)calloc(1, sizeof(float)); env.grid = (unsigned char*)calloc(env.size * env.size, sizeof(unsigned char)); c_reset(&env); @@ -18,7 +20,7 @@ int main() { if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) env.actions[0] = LEFT; if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) env.actions[0] = RIGHT; } else { - env.actions[0] = rand() % 3; // Only use left, right, forward + env.actions[0] = four_rooms_rand(&env, 3); // Only use left, right, forward } c_step(&env); c_render(&env); @@ -27,8 +29,6 @@ int main() { free(env.actions); free(env.rewards); free(env.terminals); - free(env.grid); c_close(&env); return 0; } - diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h index 9ef334f0c1..148d302b95 100644 --- a/ocean/four_rooms/four_rooms.h +++ b/ocean/four_rooms/four_rooms.h @@ -4,7 +4,7 @@ // Action space const unsigned char LEFT = 0; -const unsigned char RIGHT = 1; +const unsigned 
char RIGHT = 1; const unsigned char FORWARD = 2; const unsigned char PICKUP = 3; // Unused const unsigned char DROP = 4; // Unused @@ -49,9 +49,10 @@ typedef struct { typedef struct { Log log; unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell - int* actions; + float* actions; float* rewards; - unsigned char* terminals; + float* terminals; + int num_agents; int size; // default 19 int tick; int agent_x, agent_y; @@ -59,9 +60,15 @@ typedef struct { int goal_x, goal_y; unsigned char* grid; // Stores OBJECT_IDX values int see_through_walls; + unsigned int rng; + int texture_loaded; Texture2D puffers; } FourRooms; +static inline int four_rooms_rand(FourRooms* env, int n) { + return rand_r(&env->rng) % n; +} + void add_log(FourRooms* env) { env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0; env->log.score += env->rewards[0]; @@ -114,27 +121,27 @@ void generate_observation(FourRooms* env) { // Generate 7x7x3 observation centered on agent's view direction int view_size = 7; int half_view = view_size / 2; - + // Calculate the center of the view based on agent's direction int center_x = env->agent_x; int center_y = env->agent_y; - + // Shift center forward in the direction the agent is facing if (env->agent_dir == 0) center_x += half_view; // East else if (env->agent_dir == 1) center_y += half_view; // South else if (env->agent_dir == 2) center_x -= half_view; // West else if (env->agent_dir == 3) center_y -= half_view; // North - + for (int i = 0; i < view_size; i++) { for (int j = 0; j < view_size; j++) { int world_x = center_x - half_view + j; int world_y = center_y - half_view + i; - + // Calculate flat index for this cell in the 7x7x3 observation int base_idx = (i * view_size + j) * 3; - + unsigned char object_idx, color_idx, state; - + // Check bounds, out of bounds is treated as wall if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) { object_idx = WALL; @@ -147,7 +154,7 @@ void 
generate_observation(FourRooms* env) { } else { int grid_idx = world_y * env->size + world_x; unsigned char grid_cell = env->grid[grid_idx]; - + // Map grid cell to MiniGrid encoding switch (grid_cell) { case EMPTY: @@ -177,7 +184,7 @@ void generate_observation(FourRooms* env) { break; } } - + env->observations[base_idx] = object_idx; env->observations[base_idx + 1] = color_idx; env->observations[base_idx + 2] = state; @@ -187,10 +194,10 @@ void generate_observation(FourRooms* env) { void create_four_rooms_grid(FourRooms* env) { int size = env->size; - + // Clear grid memset(env->grid, EMPTY, size * size * sizeof(unsigned char)); - + // Create outer walls for (int i = 0; i < size; i++) { env->grid[0 * size + i] = WALL; // Top @@ -198,80 +205,80 @@ void create_four_rooms_grid(FourRooms* env) { env->grid[i * size + 0] = WALL; // Left env->grid[i * size + (size-1)] = WALL; // Right } - + int room_w = size / 2; int room_h = size / 2; - + // Create vertical separating wall for (int y = 0; y < size; y++) { env->grid[y * size + room_w] = WALL; } - + // Create horizontal separating wall for (int x = 0; x < size; x++) { env->grid[room_h * size + x] = WALL; } - + // Create 4 gaps in the separating walls // Gap in vertical wall (top half) - int gap_y1 = 1 + rand() % (room_h - 2); + int gap_y1 = 1 + four_rooms_rand(env, room_h - 2); env->grid[gap_y1 * size + room_w] = EMPTY; - + // Gap in vertical wall (bottom half) - int gap_y2 = room_h + 1 + rand() % (room_h - 2); + int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 2); env->grid[gap_y2 * size + room_w] = EMPTY; - + // Gap in horizontal wall (left half) - int gap_x1 = 1 + rand() % (room_w - 2); + int gap_x1 = 1 + four_rooms_rand(env, room_w - 2); env->grid[room_h * size + gap_x1] = EMPTY; - + // Gap in horizontal wall (right half) - int gap_x2 = room_w + 1 + rand() % (room_w - 2); + int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 2); env->grid[room_h * size + gap_x2] = EMPTY; } void c_reset(FourRooms* env) { 
create_four_rooms_grid(env); - + // Place agent randomly in valid position do { - env->agent_x = 1 + rand() % (env->size - 2); - env->agent_y = 1 + rand() % (env->size - 2); + env->agent_x = 1 + four_rooms_rand(env, env->size - 2); + env->agent_y = 1 + four_rooms_rand(env, env->size - 2); } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY); - + // Place goal randomly in valid position (different from agent) do { - env->goal_x = 1 + rand() % (env->size - 2); - env->goal_y = 1 + rand() % (env->size - 2); + env->goal_x = 1 + four_rooms_rand(env, env->size - 2); + env->goal_y = 1 + four_rooms_rand(env, env->size - 2); } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY || (env->goal_x == env->agent_x && env->goal_y == env->agent_y)); - + // Set agent and goal on grid env->grid[env->agent_y * env->size + env->agent_x] = AGENT; env->grid[env->goal_y * env->size + env->goal_x] = GOAL; - + // Random initial direction - env->agent_dir = rand() % 4; + env->agent_dir = four_rooms_rand(env, 4); env->tick = 0; - + generate_observation(env); } void c_step(FourRooms* env) { env->tick += 1; - - int action = env->actions[0]; + + int action = (int)env->actions[0]; env->terminals[0] = 0; env->rewards[0] = 0.0; - + // Clear agent from current position env->grid[env->agent_y * env->size + env->agent_x] = EMPTY; - + int new_x = env->agent_x; int new_y = env->agent_y; int new_dir = env->agent_dir; - + if (action == LEFT) { new_dir = (env->agent_dir + 3) % 4; } else if (action == RIGHT) { @@ -289,9 +296,9 @@ void c_step(FourRooms* env) { env->agent_y = new_y; } } - + env->agent_dir = new_dir; - + // Check if agent reached goal if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) { env->terminals[0] = 1; @@ -300,10 +307,10 @@ void c_step(FourRooms* env) { c_reset(env); return; } - + // Place agent back on grid env->grid[env->agent_y * env->size + env->agent_x] = AGENT; - + // Check timeout if (env->tick >= 4 * env->size) { env->terminals[0] = 1; 
@@ -312,7 +319,7 @@ void c_step(FourRooms* env) { c_reset(env); return; } - + generate_observation(env); } @@ -321,6 +328,7 @@ void c_render(FourRooms* env) { InitWindow(32*env->size, 32*env->size, "PufferLib FourRooms"); SetTargetFPS(10); env->puffers = LoadTexture("resources/shared/puffers_128.png"); + env->texture_loaded = 1; } if (IsKeyDown(KEY_ESCAPE)) { @@ -331,13 +339,13 @@ void c_render(FourRooms* env) { ClearBackground(PUFF_BACKGROUND); int px = 32; - + // Draw the main grid for (int y = 0; y < env->size; y++) { for (int x = 0; x < env->size; x++) { int cell = env->grid[y * env->size + x]; Color color = PUFF_BACKGROUND; - + if (cell == WALL) color = PUFF_BACKGROUND2; else if (cell == GOAL) color = PUFF_RED; @@ -346,28 +354,28 @@ void c_render(FourRooms* env) { } } } - + // Draw agent's 7x7 observation window int view_size = 7; int half_view = view_size / 2; - + // Calculate the center of the view based on agent's direction int center_x = env->agent_x; int center_y = env->agent_y; - + // Shift center forward in the direction the agent is facing if (env->agent_dir == 0) center_x += half_view; // East else if (env->agent_dir == 1) center_y += half_view; // South else if (env->agent_dir == 2) center_x -= half_view; // West else if (env->agent_dir == 3) center_y -= half_view; // North - + // Draw semi-transparent overlay for observation window Color obs_overlay = (Color){180, 180, 180, 80}; for (int i = 0; i < view_size; i++) { for (int j = 0; j < view_size; j++) { int world_x = center_x - half_view + j; int world_y = center_y - half_view + i; - + // Only draw overlay for cells within grid bounds and visible to agent if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size && can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) { @@ -375,7 +383,7 @@ void c_render(FourRooms* env) { } } } - + // Draw agent int starting_sprite_x = 0; int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°) @@ 
-383,7 +391,7 @@ void c_render(FourRooms* env) { starting_sprite_x = 128; // Use flipped sprite for 180° rotation rotation = 0; } - + DrawTexturePro( env->puffers, (Rectangle){starting_sprite_x, 0, 128, 128}, @@ -402,9 +410,10 @@ void c_render(FourRooms* env) { } void c_close(FourRooms* env) { - if (IsWindowReady()) { + if (env->texture_loaded) { UnloadTexture(env->puffers); CloseWindow(); + env->texture_loaded = 0; } if (env->grid) { free(env->grid); diff --git a/ocean/four_rooms/four_rooms.py b/ocean/four_rooms/four_rooms.py deleted file mode 100644 index b530fd8573..0000000000 --- a/ocean/four_rooms/four_rooms.py +++ /dev/null @@ -1,63 +0,0 @@ - -import gymnasium -import numpy as np - -import pufferlib -from pufferlib.ocean.four_rooms import binding - -class FourRooms(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=19, buf=None, seed=0): - self.single_observation_space = gymnasium.spaces.Box(low=0, high=10, - shape=(7*7*3,), dtype=np.uint8) - self.single_action_space = gymnasium.spaces.Discrete(7) - self.render_mode = render_mode - self.num_agents = num_envs - self.log_interval = log_interval - - super().__init__(buf) - self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, - self.terminals, self.truncations, num_envs, seed, size=size) - - def reset(self, seed=0): - binding.vec_reset(self.c_envs, seed) - self.tick = 0 - return self.observations, [] - - def step(self, actions): - self.tick += 1 - - self.actions[:] = actions - binding.vec_step(self.c_envs) - - info = [] - if self.tick % self.log_interval == 0: - info.append(binding.vec_log(self.c_envs)) - - return (self.observations, self.rewards, - self.terminals, self.truncations, info) - - def render(self): - binding.vec_render(self.c_envs, 0) - - def close(self): - binding.vec_close(self.c_envs) - -if __name__ == '__main__': - N = 4096 - - env = FourRooms(num_envs=N) - env.reset() - steps = 0 - - CACHE = 1024 - actions = 
np.random.randint(0, 7, (CACHE, N)) # 7 actions: left, right, forward, pickup, drop, toggle, done - - i = 0 - import time - start = time.time() - while time.time() - start < 10: - env.step(actions[i % CACHE]) - steps += N - i += 1 - - print('FourRooms SPS:', int(steps / (time.time() - start))) From 3b77f2c7c7a247279a1c4e30be14c6f907b2d703 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Tue, 5 May 2026 18:30:47 -0400 Subject: [PATCH 3/8] Refine Four Rooms metrics and CPU stepping --- ocean/four_rooms/binding.c | 19 +++++++++++++++++++ ocean/four_rooms/four_rooms.h | 11 ++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c index 98d52b67fc..4406434146 100644 --- a/ocean/four_rooms/binding.c +++ b/ocean/four_rooms/binding.c @@ -5,9 +5,28 @@ #define ACT_SIZES {7} #define OBS_TENSOR_T ByteTensor +#define MY_VEC_STEP four_rooms_vec_step +#define MY_VEC_STEP_RANGE four_rooms_vec_step_range #define Env FourRooms #include "vecenv.h" +void four_rooms_vec_step(StaticVec* vec) { + memset(vec->rewards, 0, vec->total_agents * sizeof(float)); + memset(vec->terminals, 0, vec->total_agents * sizeof(float)); + FourRooms* envs = (FourRooms*)vec->envs; + for (int i = 0; i < vec->size; i++) { + c_step(&envs[i]); + } +} + +void four_rooms_vec_step_range(StaticVec* vec, int env_start, int env_count, int num_workers) { + (void)num_workers; + FourRooms* envs = (FourRooms*)vec->envs; + for (int i = env_start; i < env_start + env_count; i++) { + c_step(&envs[i]); + } +} + void my_init(Env* env, Dict* kwargs) { env->num_agents = 1; env->size = (int)dict_get(kwargs, "size")->value; diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h index 148d302b95..b10c5214e9 100644 --- a/ocean/four_rooms/four_rooms.h +++ b/ocean/four_rooms/four_rooms.h @@ -55,6 +55,7 @@ typedef struct { int num_agents; int size; // default 19 int tick; + float episode_return; 
int agent_x, agent_y; int agent_dir; // 0=East, 1=South, 2=West, 3=North int goal_x, goal_y; @@ -70,10 +71,10 @@ static inline int four_rooms_rand(FourRooms* env, int n) { } void add_log(FourRooms* env) { - env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0; + env->log.perf += (env->rewards[0] > 0) ? 1.0f : 0.0f; env->log.score += env->rewards[0]; env->log.episode_length += env->tick; - env->log.episode_return += env->rewards[0]; + env->log.episode_return += env->episode_return; env->log.n++; } @@ -261,6 +262,7 @@ void c_reset(FourRooms* env) { // Random initial direction env->agent_dir = four_rooms_rand(env, 4); env->tick = 0; + env->episode_return = 0.0f; generate_observation(env); } @@ -302,7 +304,8 @@ void c_step(FourRooms* env) { // Check if agent reached goal if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) { env->terminals[0] = 1; - env->rewards[0] = 1.0; + env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (4.0f * (float)env->size); + env->episode_return += env->rewards[0]; add_log(env); c_reset(env); return; @@ -315,11 +318,13 @@ void c_step(FourRooms* env) { if (env->tick >= 4 * env->size) { env->terminals[0] = 1; env->rewards[0] = 0.0; + env->episode_return += env->rewards[0]; add_log(env); c_reset(env); return; } + env->episode_return += env->rewards[0]; generate_observation(env); } From e3c6a11a59dd653be74898d94d5feeb0e1a582dd Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Tue, 5 May 2026 18:31:19 -0400 Subject: [PATCH 4/8] Add Four Rooms sweep config --- config/four_rooms.ini | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/config/four_rooms.ini b/config/four_rooms.ini index 816dbb7640..b72a8f8b34 100644 --- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -22,3 +22,36 @@ learning_rate = 0.005 minibatch_size = 32768 horizon = 64 ent_coef = 0.01 + +[sweep] +metric = score +metric_distribution = linear +goal = maximize +max_runs = 40 +gpus = 1 
+downsample = 5 +sweep_only = hidden_size,num_layers,total_timesteps,learning_rate + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 128 +max = 512 +scale = auto + +[sweep.policy.num_layers] +distribution = int_uniform +min = 1 +max = 4 +scale = auto + +[sweep.train.total_timesteps] +distribution = log_normal +min = 30_000_000 +max = 300_000_000 +scale = time + +[sweep.train.learning_rate] +distribution = log_normal +min = 0.0005 +max = 0.01 +scale = auto From 6c17cb750d6075c06ce9617e03eb1ef7d7c5ef12 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Tue, 5 May 2026 21:03:44 -0400 Subject: [PATCH 5/8] Tune Four Rooms sweep ranges --- config/four_rooms.ini | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/config/four_rooms.ini b/config/four_rooms.ini index b72a8f8b34..9f61a8cb02 100644 --- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -27,28 +27,31 @@ ent_coef = 0.01 metric = score metric_distribution = linear goal = maximize -max_runs = 40 +max_runs = 100 gpus = 1 downsample = 5 sweep_only = hidden_size,num_layers,total_timesteps,learning_rate [sweep.policy.hidden_size] distribution = uniform_pow2 -min = 128 -max = 512 +min = 64 +max = 1024 +mean = 256 scale = auto [sweep.policy.num_layers] distribution = int_uniform min = 1 max = 4 +mean = 2 scale = auto [sweep.train.total_timesteps] distribution = log_normal -min = 30_000_000 -max = 300_000_000 -scale = time +min = 20_000_000 +max = 500_000_000 +mean = 100_000_000 +scale = auto [sweep.train.learning_rate] distribution = log_normal From fe9ae73f45cde3fa98f1b88f656a038e059041e1 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Wed, 6 May 2026 21:54:48 -0400 Subject: [PATCH 6/8] more sweeps --- config/four_rooms.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/four_rooms.ini b/config/four_rooms.ini index 9f61a8cb02..29d9a74803 100644 
--- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -11,11 +11,11 @@ size = 19 [policy] hidden_size = 256 -num_layers = 2 +num_layers = 6 expansion_factor = 1 [train] -total_timesteps = 100_000_000 +total_timesteps = 300_000_000 gamma = 0.99 gae_lambda = 0.95 learning_rate = 0.005 From 607448260f916721174f5c3b256a0728adc20820 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Thu, 7 May 2026 16:43:24 -0400 Subject: [PATCH 7/8] Harden Four Rooms environment --- config/four_rooms.ini | 2 + ocean/four_rooms/binding.c | 8 +- ocean/four_rooms/four_rooms.c | 6 +- ocean/four_rooms/four_rooms.h | 315 ++++++++++++++++------------------ 4 files changed, 161 insertions(+), 170 deletions(-) diff --git a/config/four_rooms.ini b/config/four_rooms.ini index 29d9a74803..32dc91f09e 100644 --- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -8,6 +8,8 @@ num_threads = 8 [env] size = 19 +# 0 derives the timeout from the map size: 4 * size. Positive values override it. 
+max_steps = 0 [policy] hidden_size = 256 diff --git a/ocean/four_rooms/binding.c b/ocean/four_rooms/binding.c index 4406434146..3c9c95c1c7 100644 --- a/ocean/four_rooms/binding.c +++ b/ocean/four_rooms/binding.c @@ -1,8 +1,8 @@ #include "four_rooms.h" -#define OBS_SIZE (7 * 7 * 3) +#define OBS_SIZE (FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS) #define NUM_ATNS 1 -#define ACT_SIZES {7} +#define ACT_SIZES {FOUR_ROOMS_NUM_ACTIONS} #define OBS_TENSOR_T ByteTensor #define MY_VEC_STEP four_rooms_vec_step @@ -30,6 +30,10 @@ void four_rooms_vec_step_range(StaticVec* vec, int env_start, int env_count, int void my_init(Env* env, Dict* kwargs) { env->num_agents = 1; env->size = (int)dict_get(kwargs, "size")->value; + env->max_steps = (int)dict_get(kwargs, "max_steps")->value; + if (env->max_steps <= 0) { + env->max_steps = 4 * env->size; + } env->see_through_walls = 0; env->grid = (unsigned char*)calloc(env->size * env->size, sizeof(unsigned char)); } diff --git a/ocean/four_rooms/four_rooms.c b/ocean/four_rooms/four_rooms.c index 7425e3229b..b73ad5989f 100644 --- a/ocean/four_rooms/four_rooms.c +++ b/ocean/four_rooms/four_rooms.c @@ -3,9 +3,13 @@ int main() { FourRooms env = {}; env.size = 19; + env.max_steps = 0; env.num_agents = 1; env.rng = 0; - env.observations = (unsigned char*)calloc(7*7*3, sizeof(unsigned char)); // 7x7x3 for MinGrid encoding + env.observations = (unsigned char*)calloc( + FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_OBS_CHANNELS, + sizeof(unsigned char) + ); env.actions = (float*)calloc(1, sizeof(float)); env.rewards = (float*)calloc(1, sizeof(float)); env.terminals = (float*)calloc(1, sizeof(float)); diff --git a/ocean/four_rooms/four_rooms.h b/ocean/four_rooms/four_rooms.h index b10c5214e9..98c9e0469e 100644 --- a/ocean/four_rooms/four_rooms.h +++ b/ocean/four_rooms/four_rooms.h @@ -2,41 +2,41 @@ #include #include "raylib.h" +#define FOUR_ROOMS_VIEW_SIZE 7 +#define FOUR_ROOMS_OBS_CHANNELS 3 +#define 
FOUR_ROOMS_NUM_ACTIONS 7 + // Action space -const unsigned char LEFT = 0; -const unsigned char RIGHT = 1; -const unsigned char FORWARD = 2; -const unsigned char PICKUP = 3; // Unused -const unsigned char DROP = 4; // Unused -const unsigned char TOGGLE = 5; // Unused -const unsigned char DONE = 6; // Unused +enum { + LEFT = 0, + RIGHT = 1, + FORWARD = 2, + PICKUP = 3, + DROP = 4, + TOGGLE = 5, + DONE = 6, +}; // Observation: Objects -const unsigned char UNSEEN = 0; -const unsigned char EMPTY = 1; -const unsigned char WALL = 2; -const unsigned char FLOOR = 3; // Unused -const unsigned char DOOR = 4; // Unused -const unsigned char KEY = 5; // Unused -const unsigned char BALL = 6; // Unused -const unsigned char BOX = 7; // Unused -const unsigned char GOAL = 8; -const unsigned char LAVA = 9; // Unused -const unsigned char AGENT = 10; +enum { + UNSEEN = 0, + EMPTY = 1, + WALL = 2, + GOAL = 8, + AGENT = 10, +}; // Observation: Colors -const unsigned char COLOR_BLACK = 0; -const unsigned char COLOR_GREEN = 1; -const unsigned char COLOR_BLUE = 2; -const unsigned char COLOR_PURPLE = 3; -const unsigned char COLOR_YELLOW = 4; -const unsigned char COLOR_GREY = 5; +enum { + COLOR_BLACK = 0, + COLOR_GREEN = 1, + COLOR_GREY = 5, +}; // PufferLib standard colors for rendering -const Color PUFF_RED = (Color){187, 0, 0, 255}; -const Color PUFF_CYAN = (Color){0, 187, 187, 255}; -const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; -const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255}; +static const Color PUFF_RED = (Color){187, 0, 0, 255}; +static const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; +static const Color PUFF_BACKGROUND2 = (Color){18, 72, 72, 255}; typedef struct { float perf; @@ -48,18 +48,19 @@ typedef struct { typedef struct { Log log; - unsigned char* observations; // 7x7x3 observation: (OBJECT_IDX, COLOR_IDX, STATE) per cell + unsigned char* observations; float* actions; float* rewards; float* terminals; int num_agents; - int size; // default 19 + int size; 
+ int max_steps; int tick; float episode_return; int agent_x, agent_y; - int agent_dir; // 0=East, 1=South, 2=West, 3=North + int agent_dir; int goal_x, goal_y; - unsigned char* grid; // Stores OBJECT_IDX values + unsigned char* grid; int see_through_walls; unsigned int rng; int texture_loaded; @@ -70,6 +71,10 @@ static inline int four_rooms_rand(FourRooms* env, int n) { return rand_r(&env->rng) % n; } +static inline int four_rooms_grid_idx(FourRooms* env, int x, int y) { + return y * env->size + x; +} + void add_log(FourRooms* env) { env->log.perf += (env->rewards[0] > 0) ? 1.0f : 0.0f; env->log.score += env->rewards[0]; @@ -78,117 +83,108 @@ void add_log(FourRooms* env) { env->log.n++; } -int can_see_cell(FourRooms* env, int agent_x, int agent_y, int target_x, int target_y) { - if (env->see_through_walls) { - return 1; +void encode_cell(unsigned char object, unsigned char* object_idx, unsigned char* color_idx, unsigned char* state) { + *state = 0; + if (object == WALL) { + *object_idx = WALL; + *color_idx = COLOR_GREY; + } else if (object == GOAL) { + *object_idx = GOAL; + *color_idx = COLOR_GREEN; + } else { + *object_idx = EMPTY; + *color_idx = COLOR_BLACK; } +} - // Use Bresenham's line algorithm to check line of sight - int dx = abs(target_x - agent_x); - int dy = abs(target_y - agent_y); - int x = agent_x; - int y = agent_y; - int x_inc = (target_x > agent_x) ? 1 : -1; - int y_inc = (target_y > agent_y) ? 
1 : -1; - int error = dx - dy; - - while (x != target_x || y != target_y) { - // If we've reached the target cell, stop (target cell should always be visible) - if (x == target_x && y == target_y) { - break; - } +void observation_to_world(FourRooms* env, int obs_x, int obs_y, int* world_x, int* world_y) { + int forward_x = 0; + int forward_y = 0; + if (env->agent_dir == 0) forward_x = 1; + else if (env->agent_dir == 1) forward_y = 1; + else if (env->agent_dir == 2) forward_x = -1; + else forward_y = -1; + + int right_x = -forward_y; + int right_y = forward_x; + int right_offset = obs_x - FOUR_ROOMS_VIEW_SIZE / 2; + int forward_offset = FOUR_ROOMS_VIEW_SIZE - 1 - obs_y; + + *world_x = env->agent_x + forward_x * forward_offset + right_x * right_offset; + *world_y = env->agent_y + forward_y * forward_offset + right_y * right_offset; +} - int error2 = 2 * error; - if (error2 > -dy) { - error -= dy; - x += x_inc; - } - if (error2 < dx) { - error += dx; - y += y_inc; +void compute_visibility(unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE], + unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE]) { + memset(visible, 0, FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * sizeof(unsigned char)); + visible[FOUR_ROOMS_VIEW_SIZE - 1][FOUR_ROOMS_VIEW_SIZE / 2] = 1; + + // MiniGrid propagates visibility from the agent at bottom-center after rotating the view. 
+ for (int y = FOUR_ROOMS_VIEW_SIZE - 1; y >= 0; y--) { + for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE - 1; x++) { + if (!visible[y][x] || view[y][x] == WALL) { + continue; + } + visible[y][x + 1] = 1; + if (y > 0) { + visible[y - 1][x] = 1; + visible[y - 1][x + 1] = 1; + } } - // If the next cell (not the target) is a wall, block vision beyond but allow seeing the wall itself - if ((x != target_x || y != target_y) && - x >= 0 && x < env->size && y >= 0 && y < env->size && - env->grid[y * env->size + x] == WALL) { - return 0; // Wall blocks the view beyond, but wall itself is visible + for (int x = FOUR_ROOMS_VIEW_SIZE - 1; x > 0; x--) { + if (!visible[y][x] || view[y][x] == WALL) { + continue; + } + visible[y][x - 1] = 1; + if (y > 0) { + visible[y - 1][x] = 1; + visible[y - 1][x - 1] = 1; + } } } - return 1; // Target cell is visible } void generate_observation(FourRooms* env) { - // Generate 7x7x3 observation centered on agent's view direction - int view_size = 7; - int half_view = view_size / 2; - - // Calculate the center of the view based on agent's direction - int center_x = env->agent_x; - int center_y = env->agent_y; - - // Shift center forward in the direction the agent is facing - if (env->agent_dir == 0) center_x += half_view; // East - else if (env->agent_dir == 1) center_y += half_view; // South - else if (env->agent_dir == 2) center_x -= half_view; // West - else if (env->agent_dir == 3) center_y -= half_view; // North + unsigned char view[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE]; + unsigned char visible[FOUR_ROOMS_VIEW_SIZE][FOUR_ROOMS_VIEW_SIZE]; - for (int i = 0; i < view_size; i++) { - for (int j = 0; j < view_size; j++) { - int world_x = center_x - half_view + j; - int world_y = center_y - half_view + i; - - // Calculate flat index for this cell in the 7x7x3 observation - int base_idx = (i * view_size + j) * 3; - - unsigned char object_idx, color_idx, state; - - // Check bounds, out of bounds is treated as wall + for (int y = 0; y < 
FOUR_ROOMS_VIEW_SIZE; y++) { + for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) { + int world_x, world_y; + observation_to_world(env, x, y, &world_x, &world_y); if (world_x < 0 || world_x >= env->size || world_y < 0 || world_y >= env->size) { - object_idx = WALL; - color_idx = COLOR_GREY; - state = 0; - } else if (!can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) { - object_idx = UNSEEN; // Cell is blocked by walls - color_idx = COLOR_BLACK; - state = 0; + view[y][x] = WALL; + } else if (world_x == env->agent_x && world_y == env->agent_y) { + view[y][x] = EMPTY; } else { - int grid_idx = world_y * env->size + world_x; - unsigned char grid_cell = env->grid[grid_idx]; - - // Map grid cell to MiniGrid encoding - switch (grid_cell) { - case EMPTY: - object_idx = EMPTY; - color_idx = COLOR_BLACK; - state = 0; - break; - case WALL: - object_idx = WALL; - color_idx = COLOR_GREY; - state = 0; - break; - case AGENT: - object_idx = AGENT; - color_idx = COLOR_BLUE; - state = 0; - break; - case GOAL: - object_idx = GOAL; - color_idx = COLOR_GREEN; - state = 0; - break; - default: - object_idx = EMPTY; - color_idx = 0; - state = 0; - break; - } + view[y][x] = env->grid[four_rooms_grid_idx(env, world_x, world_y)]; + } + } + } + + if (env->see_through_walls) { + memset(visible, 1, FOUR_ROOMS_VIEW_SIZE * FOUR_ROOMS_VIEW_SIZE * sizeof(unsigned char)); + } else { + compute_visibility(view, visible); + } + + for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) { + for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) { + int base_idx = (y * FOUR_ROOMS_VIEW_SIZE + x) * FOUR_ROOMS_OBS_CHANNELS; + if (!visible[y][x]) { + env->observations[base_idx] = UNSEEN; + env->observations[base_idx + 1] = COLOR_BLACK; + env->observations[base_idx + 2] = 0; + continue; } - env->observations[base_idx] = object_idx; - env->observations[base_idx + 1] = color_idx; - env->observations[base_idx + 2] = state; + encode_cell( + view[y][x], + &env->observations[base_idx], + &env->observations[base_idx + 1], 
+ &env->observations[base_idx + 2] + ); } } } @@ -222,23 +218,26 @@ void create_four_rooms_grid(FourRooms* env) { // Create 4 gaps in the separating walls // Gap in vertical wall (top half) - int gap_y1 = 1 + four_rooms_rand(env, room_h - 2); + int gap_y1 = 1 + four_rooms_rand(env, room_h - 1); env->grid[gap_y1 * size + room_w] = EMPTY; // Gap in vertical wall (bottom half) - int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 2); + int gap_y2 = room_h + 1 + four_rooms_rand(env, room_h - 1); env->grid[gap_y2 * size + room_w] = EMPTY; // Gap in horizontal wall (left half) - int gap_x1 = 1 + four_rooms_rand(env, room_w - 2); + int gap_x1 = 1 + four_rooms_rand(env, room_w - 1); env->grid[room_h * size + gap_x1] = EMPTY; // Gap in horizontal wall (right half) - int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 2); + int gap_x2 = room_w + 1 + four_rooms_rand(env, room_w - 1); env->grid[room_h * size + gap_x2] = EMPTY; } void c_reset(FourRooms* env) { + if (env->max_steps <= 0) { + env->max_steps = 4 * env->size; + } create_four_rooms_grid(env); @@ -246,18 +245,18 @@ void c_reset(FourRooms* env) { do { env->agent_x = 1 + four_rooms_rand(env, env->size - 2); env->agent_y = 1 + four_rooms_rand(env, env->size - 2); - } while (env->grid[env->agent_y * env->size + env->agent_x] != EMPTY); + } while (env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] != EMPTY); // Place goal randomly in valid position (different from agent) do { env->goal_x = 1 + four_rooms_rand(env, env->size - 2); env->goal_y = 1 + four_rooms_rand(env, env->size - 2); - } while (env->grid[env->goal_y * env->size + env->goal_x] != EMPTY || + } while (env->grid[four_rooms_grid_idx(env, env->goal_x, env->goal_y)] != EMPTY || (env->goal_x == env->agent_x && env->goal_y == env->agent_y)); // Set agent and goal on grid - env->grid[env->agent_y * env->size + env->agent_x] = AGENT; - env->grid[env->goal_y * env->size + env->goal_x] = GOAL; + env->grid[four_rooms_grid_idx(env, env->agent_x, 
env->agent_y)] = AGENT; + env->grid[four_rooms_grid_idx(env, env->goal_x, env->goal_y)] = GOAL; // Random initial direction env->agent_dir = four_rooms_rand(env, 4); @@ -275,7 +274,7 @@ void c_step(FourRooms* env) { env->rewards[0] = 0.0; // Clear agent from current position - env->grid[env->agent_y * env->size + env->agent_x] = EMPTY; + env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] = EMPTY; int new_x = env->agent_x; int new_y = env->agent_y; @@ -293,7 +292,7 @@ void c_step(FourRooms* env) { // Check if move is valid if (new_x >= 0 && new_x < env->size && new_y >= 0 && new_y < env->size && - env->grid[new_y * env->size + new_x] != WALL) { + env->grid[four_rooms_grid_idx(env, new_x, new_y)] != WALL) { env->agent_x = new_x; env->agent_y = new_y; } @@ -304,7 +303,7 @@ void c_step(FourRooms* env) { // Check if agent reached goal if (env->agent_x == env->goal_x && env->agent_y == env->goal_y) { env->terminals[0] = 1; - env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (4.0f * (float)env->size); + env->rewards[0] = 1.0f - 0.9f * (float)env->tick / (float)env->max_steps; env->episode_return += env->rewards[0]; add_log(env); c_reset(env); @@ -312,10 +311,10 @@ void c_step(FourRooms* env) { } // Place agent back on grid - env->grid[env->agent_y * env->size + env->agent_x] = AGENT; + env->grid[four_rooms_grid_idx(env, env->agent_x, env->agent_y)] = AGENT; // Check timeout - if (env->tick >= 4 * env->size) { + if (env->tick >= env->max_steps) { env->terminals[0] = 1; env->rewards[0] = 0.0; env->episode_return += env->rewards[0]; @@ -360,40 +359,21 @@ void c_render(FourRooms* env) { } } - // Draw agent's 7x7 observation window - int view_size = 7; - int half_view = view_size / 2; - - // Calculate the center of the view based on agent's direction - int center_x = env->agent_x; - int center_y = env->agent_y; - - // Shift center forward in the direction the agent is facing - if (env->agent_dir == 0) center_x += half_view; // East - else if (env->agent_dir == 1) 
center_y += half_view; // South - else if (env->agent_dir == 2) center_x -= half_view; // West - else if (env->agent_dir == 3) center_y -= half_view; // North - - // Draw semi-transparent overlay for observation window Color obs_overlay = (Color){180, 180, 180, 80}; - for (int i = 0; i < view_size; i++) { - for (int j = 0; j < view_size; j++) { - int world_x = center_x - half_view + j; - int world_y = center_y - half_view + i; - - // Only draw overlay for cells within grid bounds and visible to agent - if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size && - can_see_cell(env, env->agent_x, env->agent_y, world_x, world_y)) { + for (int y = 0; y < FOUR_ROOMS_VIEW_SIZE; y++) { + for (int x = 0; x < FOUR_ROOMS_VIEW_SIZE; x++) { + int world_x, world_y; + observation_to_world(env, x, y, &world_x, &world_y); + if (world_x >= 0 && world_x < env->size && world_y >= 0 && world_y < env->size) { DrawRectangle(world_x*px, world_y*px, px, px, obs_overlay); } } } - // Draw agent int starting_sprite_x = 0; - int rotation = 90 * env->agent_dir; // 0=East(0°), 1=South(90°), 2=West(180°), 3=North(270°) + int rotation = 90 * env->agent_dir; if (rotation == 180) { - starting_sprite_x = 128; // Use flipped sprite for 180° rotation + starting_sprite_x = 128; rotation = 0; } @@ -422,5 +402,6 @@ void c_close(FourRooms* env) { } if (env->grid) { free(env->grid); + env->grid = NULL; } } From de05e0b472c02c8f340f3a363016e0e4f225f2c3 Mon Sep 17 00:00:00 2001 From: Paul Merceur <70440072+paulmerceur@users.noreply.github.com> Date: Thu, 7 May 2026 20:41:20 -0400 Subject: [PATCH 8/8] lighter config --- config/four_rooms.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/four_rooms.ini b/config/four_rooms.ini index 32dc91f09e..0a931a2ff7 100644 --- a/config/four_rooms.ini +++ b/config/four_rooms.ini @@ -8,16 +8,16 @@ num_threads = 8 [env] size = 19 -# 0 derives the timeout from the map size: 4 * size. Positive values override it. 
+# If max_steps is 0 or negative, the episode timeout defaults to 4 * size. Positive values override it. max_steps = 0 [policy] -hidden_size = 256 -num_layers = 6 +hidden_size = 128 +num_layers = 2 expansion_factor = 1 [train] -total_timesteps = 300_000_000 +total_timesteps = 100_000_000 gamma = 0.99 gae_lambda = 0.95 learning_rate = 0.005