PufferAI · jsuarez5341 · Apr 12, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/pufferlib/config/ocean/boxoban.ini b/pufferlib/config/ocean/boxoban.ini
@@ -0,0 +1,58 @@
+[base]
+package = ocean
+env_name = puffer_boxoban
+policy_name = MinGRU
+rnn_name = Recurrent
+
+[vec]
+total_agents = 16384
+num_buffers = 8
+num_threads = 8 
+
+[policy]
+num_layers = 1 
+hidden_size = 256
+
+[env]
+num_agents = 1
+# difficulty id: 0 = basic, 1 = easy, 2 = medium, 3 = hard, 4 = unfiltered
+difficulty = 1
+# reward coefficient per intermediate target (granted once per episode)
+int_r_coeff = 0.25
+# penalty coefficient for moving a box off a target
+target_loss_pen_coeff = 0.0
+max_steps = 300
+
+
+[train]
+anneal_lr = 1
+beta1 = 0.9774372816193448
+beta2 = 0.9659403664380584
+clip_coef = 0.6046560670053024
+ent_coef = 0.00002079831529141607
+eps = 0.00000000000001
+gae_lambda = 0.9258914518467392
+gamma = 0.9772998708784648
+gpus = 1
+horizon = 64
+learning_rate = 0.004480255741933225
+max_grad_norm = 1.221684008665154
+min_lr_ratio = 0.37872027027338984
+minibatch_size = 8192
+prio_alpha = 1
+prio_beta0 = 0.8789921736378042
+replay_ratio = 3.210300031048168
+seed = 42
+total_timesteps = 55504884
+use_rnn = true
+vf_clip_coef = 4.339748010438874
+vf_coef = 4.240274862679744
+vtrace_c_clip = 1.3625779006162615
+vtrace_rho_clip = 3.17260199042977
+
+
+#EASY
+
+[sweep]
+metric = perf
+
diff --git a/pufferlib/ocean/boxoban/binding.c b/pufferlib/ocean/boxoban/binding.c
@@ -0,0 +1,29 @@
+// Build-time configuration for the Boxoban binding. All macros are defined
+// before the headers that follow them; presumably boxoban.h and vecenv.h read
+// them at include time, so keep this ordering -- TODO confirm against headers.
+
+// Original note: defining this before including boxoban.h enables mmap.
+#define BOXOBAN_MAPS_IMPLEMENTATION
+#include "boxoban.h"
+// 400 == 4 * 10 * 10; my_init() below fixes env->size at 10.
+// NOTE(review): presumably 4 values per grid cell -- confirm in boxoban.h.
+#define OBS_SIZE 400
+// NOTE(review): presumably one discrete action with 5 choices -- confirm
+// against vecenv.h.
+#define NUM_ATNS 1
+#define ACT_SIZES {5}
+// NOTE(review): presumably selects byte-typed observation tensors -- confirm.
+#define OBS_TENSOR_T ByteTensor
+
+
+// Env is the name my_init() uses for the Boxoban struct.
+#define Env Boxoban
+#include "vecenv.h"
+
+
+/* Configure a Boxoban env from kwargs and then call init() on it.
+ *
+ * Reads "difficulty", "max_steps", "int_r_coeff" and "target_loss_pen_coeff"
+ * from kwargs. The grid size (10) and agent count (1) are fixed here and are
+ * not taken from kwargs.
+ */
+void my_init(Env* env, Dict* kwargs) {
+    // Pull the configurable values out of the dictionary first.
+    int difficulty = (int)dict_get(kwargs, "difficulty")->value;
+    int step_limit = (int)dict_get(kwargs, "max_steps")->value;
+    float intermediate_coeff = (float)dict_get(kwargs, "int_r_coeff")->value;
+    float loss_penalty_coeff = (float)dict_get(kwargs, "target_loss_pen_coeff")->value;
+
+    // Hard-coded settings.
+    env->size = 10;
+    env->num_agents = 1;
+
+    // Settings taken from kwargs.
+    env->difficulty_id = difficulty;
+    env->max_steps = step_limit;
+    env->int_r_coeff = intermediate_coeff;
+    env->target_loss_pen_coeff = loss_penalty_coeff;
+
+    init(env);
+}
+
+/* Copy the fields of log into the out dictionary under their reporting keys.
+ *
+ * Every key matches its Log field name except "targets_hit", which is filled
+ * from log->on_targets.
+ */
+void my_log(Log* log, Dict* out) {
+    // "perf" is the name used as the sweep metric in boxoban.ini.
+    dict_set(out, "perf", log->perf);
+    dict_set(out, "score", log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
+    // Reported under a different name than the struct field.
+    dict_set(out, "targets_hit", log->on_targets);
+}
diff --git a/pufferlib/ocean/boxoban/boxoban.c b/pufferlib/ocean/boxoban/boxoban.c
@@ -0,0 +1,194 @@
+/* Pure C demo file for Boxoban. Usage:
+ *   bash scripts/build_ocean.sh boxoban
+ *   ./boxoban [difficulty|path_to_bin]
+ *
+ * With no argument, the demo uses the "easy" difficulty. If you pass one of
+ * the known difficulty names (basic, easy, medium, hard, unfiltered), the demo
+ * looks for pufferlib/ocean/boxoban/boxoban_maps_<difficulty>.bin.
+ * An argument containing a '/' is treated as an explicit path to a bin file.
+ * Any other argument <name> is looked up as
+ * pufferlib/ocean/boxoban/boxoban_maps_<name>.bin.
+ */
+
+#define BOXOBAN_MAPS_IMPLEMENTATION
+#include "boxoban.h"
+
+/* Return 1 when arg exactly equals one of the built-in difficulty names
+ * (basic, easy, medium, hard, unfiltered), otherwise 0. Comparison is
+ * case-sensitive. arg must not be NULL.
+ */
+static int is_named_difficulty(const char* arg) {
+    static const char* const known_names[] = {
+        "basic", "easy", "medium", "hard", "unfiltered",
+    };
+    const size_t name_count = sizeof(known_names) / sizeof(known_names[0]);
+
+    for (size_t i = 0; i < name_count; i++) {
+        if (strcmp(arg, known_names[i]) == 0) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* Pick the map file to load based on the first command-line argument.
+ *
+ *   - no argument:            prepare the "easy" maps
+ *   - argument contains '/':  return the argument itself as the path
+ *   - known difficulty name:  prepare the maps for that difficulty
+ *   - anything else:          format the default file name pattern with it
+ *
+ * buffer/buf_sz is caller-provided scratch space. In the "prepare" cases the
+ * path is expected to be written into buffer by
+ * boxoban_prepare_maps_for_difficulty() (defined elsewhere).
+ *
+ * Returns a pointer to either argv[1] or buffer, or NULL if preparing the
+ * maps failed or the formatted path did not fit in buffer.
+ */
+static const char* resolve_map_path(int argc, char** argv, char* buffer, size_t buf_sz) {
+    const char* arg = argc > 1 ? argv[1] : NULL;
+
+    // Anything that looks like a path is used verbatim.
+    if (arg != NULL && strchr(arg, '/')) {
+        return arg;
+    }
+
+    // No argument falls back to "easy"; known names go through map preparation.
+    if (arg == NULL || is_named_difficulty(arg)) {
+        const char* difficulty = arg != NULL ? arg : "easy";
+        if (boxoban_prepare_maps_for_difficulty(difficulty, buffer, buf_sz) != 0) {
+            return NULL;
+        }
+        return buffer;
+    }
+
+    // Unknown bare name: substitute it into the default file name. Reject a
+    // truncated result rather than handing back a path to the wrong file.
+    int written = snprintf(buffer, buf_sz, "pufferlib/ocean/boxoban/boxoban_maps_%s.bin", arg);
+    if (written < 0 || (size_t)written >= buf_sz) {
+        return NULL;
+    }
+    return buffer;
+}
+
+
+/* Interactive demo: load a map set, then render the environment in a window
+ * until it is closed.
+ *
+ * While either Shift key is held the demo is in manual mode: arrow keys or
+ * WASD step the environment, and with no direction held it stays paused.
+ * Otherwise a random action in [0, 5) is taken every frame.
+ *
+ * Returns 0 on normal exit, or 1 if the map path could not be resolved or
+ * set, or if the environment buffers could not be allocated.
+ */
+int demo(int argc, char** argv) {
+    char path_buffer[512];
+    const char* chosen_path = resolve_map_path(argc, argv, path_buffer, sizeof(path_buffer));
+    if (chosen_path == NULL) {
+        fprintf(stderr, "Failed to prepare map path\n");
+        return 1;
+    }
+    if (boxoban_set_map_path(chosen_path) != 0) {
+        fprintf(stderr, "Failed to set map path: %s\n", chosen_path);
+        return 1;
+    }
+
+    Boxoban env = {
+        .size = 10,
+        .observations = NULL,
+        .actions = NULL,
+        .rewards = NULL,
+        .terminals = NULL,
+        .max_steps = 500,
+        .int_r_coeff = 0.1f,
+        .target_loss_pen_coeff = 0.5f,
+        .tick = 0,
+        .agent_x = 0,
+        .agent_y = 0,
+        .intermediate_rewards = NULL,
+        .on_target = 0,
+        .n_boxes = 0,
+        .win = 0,
+        .difficulty_id = -1,
+        .client = NULL,
+        .n_targets = 0,
+    };
+
+    // Observation buffer holds 4 bytes per grid cell.
+    size_t obs_count = 4u * (size_t)env.size * (size_t)env.size;
+    env.observations = calloc(obs_count, sizeof(unsigned char));
+    env.actions = calloc(1, sizeof(int));
+    env.rewards = calloc(1, sizeof(float));
+    env.terminals = calloc(1, sizeof(unsigned char));
+    if (env.observations == NULL || env.actions == NULL ||
+        env.rewards == NULL || env.terminals == NULL) {
+        fprintf(stderr, "Failed to allocate environment buffers\n");
+        // free(NULL) is a no-op, so release whichever allocations succeeded.
+        free(env.observations);
+        free(env.actions);
+        free(env.rewards);
+        free(env.terminals);
+        return 1;
+    }
+
+    init(&env);
+    c_reset(&env);
+    c_render(&env);
+    while (!WindowShouldClose()) {
+        if (IsKeyPressed(KEY_LEFT_SHIFT) || IsKeyPressed(KEY_RIGHT_SHIFT)) {
+            TraceLog(LOG_INFO, "Shift key pressed");
+        }
+        bool manual = IsKeyDown(KEY_LEFT_SHIFT) || IsKeyDown(KEY_RIGHT_SHIFT);
+        if (manual) {
+            // Later checks win when several direction keys are held at once.
+            int new_action = -1;
+            if (IsKeyDown(KEY_UP)    || IsKeyDown(KEY_W)) new_action = UP;
+            if (IsKeyDown(KEY_DOWN)  || IsKeyDown(KEY_S)) new_action = DOWN;
+            if (IsKeyDown(KEY_LEFT)  || IsKeyDown(KEY_A)) new_action = LEFT;
+            if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) new_action = RIGHT;
+
+            // Manual mode with no direction: stay paused and only redraw.
+            if (new_action >= 0) {
+                env.actions[0] = new_action;
+                c_step(&env);
+            }
+        } else {
+            env.actions[0] = rand() % 5;
+            c_step(&env);
+        }
+        c_render(&env);
+    }
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    c_close(&env);
+    return 0;
+}
+
+/* Headless throughput benchmark: step the environment with random actions
+ * for roughly `timeout` seconds of wall-clock time and print steps per second.
+ *
+ * argc/argv select the map file the same way as in demo(). Returns early
+ * (after printing to stderr) if the map path cannot be resolved or set, or if
+ * the environment buffers cannot be allocated. If no measurable time elapsed
+ * (for example timeout <= 0), the reported SPS is 0.
+ */
+void test_performance(int argc, char** argv, int timeout) {
+    char path_buffer[512];
+    const char* chosen_path = resolve_map_path(argc, argv, path_buffer, sizeof(path_buffer));
+    if (chosen_path == NULL) {
+        fprintf(stderr, "Failed to prepare map path\n");
+        return;
+    }
+    if (boxoban_set_map_path(chosen_path) != 0) {
+        fprintf(stderr, "Failed to set map path: %s\n", chosen_path);
+        return;
+    }
+    printf("Loaded map: %s\n", chosen_path);
+
+    Boxoban env = {
+        .size = 10,
+        .observations = NULL,
+        .actions = NULL,
+        .rewards = NULL,
+        .terminals = NULL,
+        .max_steps = 500,
+        .int_r_coeff = 0.1f,
+        .target_loss_pen_coeff = 0.5f,
+        .tick = 0,
+        .agent_x = 0,
+        .agent_y = 0,
+        .intermediate_rewards = NULL,
+        .on_target = 0,
+        .n_boxes = 0,
+        .win = 0,
+        .difficulty_id = -1,
+        .client = NULL,
+        .n_targets = 0,
+    };
+
+    // Observation buffer holds 4 bytes per grid cell.
+    size_t obs_count = 4u * (size_t)env.size * (size_t)env.size;
+    env.observations = calloc(obs_count, sizeof(unsigned char));
+    env.actions = calloc(1, sizeof(int));
+    env.rewards = calloc(1, sizeof(float));
+    env.terminals = calloc(1, sizeof(unsigned char));
+    if (env.observations == NULL || env.actions == NULL ||
+        env.rewards == NULL || env.terminals == NULL) {
+        fprintf(stderr, "Failed to allocate environment buffers\n");
+        // free(NULL) is a no-op, so release whichever allocations succeeded.
+        free(env.observations);
+        free(env.actions);
+        free(env.rewards);
+        free(env.terminals);
+        return;
+    }
+
+    printf("Initializing...\n");
+    init(&env);
+    printf("Resetting...\n");
+    c_reset(&env);
+    printf("Starting test...\n");
+
+    // Keep timestamps as time_t; narrowing them to int loses range.
+    time_t start = time(NULL);
+    long long num_steps = 0;
+    while (time(NULL) - start < timeout) {
+        env.actions[0] = rand() % 5;
+        c_step(&env);
+        num_steps++;
+    }
+
+    time_t end = time(NULL);
+    // Divide in floating point, and avoid dividing by zero when the loop
+    // never ran or finished within the same clock second.
+    double elapsed = (double)(end - start);
+    double sps = elapsed > 0.0 ? (double)num_steps / elapsed : 0.0;
+    printf("Test Environment SPS: %f\n", sps);
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    c_close(&env);
+}
+
+/* Program entry point: runs the interactive demo and exits with its status
+ * (0 on success, non-zero if the demo failed to start).
+ */
+int main(int argc, char** argv) {
+    // Unbuffer stdout and announce startup before any work is done, so the
+    // output is visible even if the demo aborts.
+    setbuf(stdout, NULL);
+    fprintf(stderr, "Entered main\n");
+    fflush(stderr);
+    // To benchmark instead of running the demo, call:
+    //   test_performance(argc, argv, 10);
+    return demo(argc, argv);
+}