From 1cf8cd04a4d74adfaa96f3ad895ba15e6f1da9de Mon Sep 17 00:00:00 2001
From: Noridom1 <thinhphuc2005@gmail.com>
Date: Fri, 3 Jul 2026 22:28:01 +0700
Subject: [PATCH 1/4] feat: run remote agentic-replay from pre-built AIPerf
 image

Point the remote multi-endpoint smoke config's image: at the pre-built
aiperf:0.8.0 image instead of the vLLM serving image, so the client
skips the per-job pip install. install_agentic_deps now short-circuits
when aiperf is already on PATH, and the endpoint pre-check falls back
to busybox wget when curl is unavailable (the distroless image ships
wget but not curl). Also narrow the smoke config's conc-list to [2, 4].

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml |   5 +-
 benchmarks/benchmark_lib.sh        |  18 ++++-
 docs/REMOTE_AIPERF_DOCKER.md       | 101 ++++++++++++++---------------
 docs/REMOTE_JOB_SUBMISSION_VI.md   |  35 ++++++----
 4 files changed, 87 insertions(+), 72 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index fa1518410..dee12294e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9546,7 +9546,7 @@ qwen3-4b-weka-bf16-bench-client-sglang-remote-smoke:
 # are flattened to aiperf's comma-separated multi-URL syntax; aiperf
 # round-robins requests across the model endpoints by default.
 deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke:
-  image: vllm/vllm-openai:v0.21.0
+  image: aiperf:0.8.0
   model: deepseek-coder-v2-lite-fp8
   model-prefix: deepseek-coder-v2-lite-weka
   runner: benchmark-client
@@ -9561,7 +9561,6 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke:
     - http://192.168.4.13:8000/metrics
     - http://192.168.4.13:8001/metrics
     gpu-telemetry-url: http://192.168.4.13:9400/metrics
-    aiperf-docker-image: aiperf:0.8.0
   scenarios:
     agentic-replay:
     - custom-dataset-type: weka_trace
@@ -9572,4 +9571,4 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke:
       num-dataset-entries: 949
       duration: 900
       search-space:
-      - { tp: 1, ep: 1, conc-list: [2, 4, 8, 16] }
+      - { tp: 1, ep: 1, conc-list: [2, 4] }
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 3a0729299..b11b6a390 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1329,6 +1329,16 @@ resolve_trace_source() {
 install_agentic_deps() {
     AIPERF_USE_DOCKER=false
 
+    # Full-image bypass: when the remote client runs from a pre-built full
+    # AIPerf image (the remote config points image: at it — see
+    # docs/REMOTE_AIPERF_DOCKER.md), aiperf is already installed. Skip the slow
+    # editable install (same results, provided the image matches the submodule).
+    # Set AIPERF_FORCE_PIP_INSTALL=true to force the source install anyway.
+    if [[ "${AIPERF_FORCE_PIP_INSTALL:-}" != "true" ]] && command -v aiperf >/dev/null 2>&1; then
+        echo "[aiperf] aiperf already installed ($(command -v aiperf)); skipping pip install."
+        return 0
+    fi
+
     # Opt-in bypass: if the runner already has a pre-built aiperf image
     # (see utils/aiperf-mooncake's `make docker`), skip the pip install
     # entirely instead of re-running the (slow, transformers-from-git)
@@ -1388,7 +1398,13 @@ _probe_endpoint() {
     local url="$1" max_time="$2" retries="$3" attempt
 
     for (( attempt=1; attempt<=retries; attempt++ )); do
-        if curl --output /dev/null --silent --fail --max-time "$max_time" "$url"; then
+        if command -v curl >/dev/null 2>&1; then
+            if curl --output /dev/null --silent --fail --max-time "$max_time" "$url"; then
+                return 0
+            fi
+        # The pre-built AIPerf image is distroless: it ships busybox wget but
+        # not curl. wget's exit status is a good-enough reachability signal.
+        elif wget -q -T "$max_time" -O /dev/null "$url"; then
             return 0
         fi
         sleep 1
diff --git a/docs/REMOTE_AIPERF_DOCKER.md b/docs/REMOTE_AIPERF_DOCKER.md
index 08ab2a2e1..7962d79f3 100644
--- a/docs/REMOTE_AIPERF_DOCKER.md
+++ b/docs/REMOTE_AIPERF_DOCKER.md
@@ -1,8 +1,7 @@
 # Remote agentic-replay: running AIPerf from a pre-built image
 
-How the remote-replay benchmark client runs, why it re-installs AIPerf on every
-job today, and the future-work plan for using a pre-built AIPerf image on the
-`benchmark-client` runner to skip that install.
+How the remote-replay benchmark client runs, and how to use a pre-built full
+AIPerf image on the `benchmark-client` runner to skip the per-job install.
 
 ## How the remote path runs the client
 
@@ -31,10 +30,10 @@ which pip-installs AIPerf from the `utils/aiperf-mooncake` submodule on **every
 job**. The slow part is the editable install of AIPerf plus its
 transformers-from-git dependency.
 
-## The `aiperf-docker-image` config option
+## The `aiperf-docker-image` config option (deprecated, inert)
 
-The config schema and CI plumbing already carry an optional pre-built-image name
-end to end:
+The config schema and CI plumbing carry an optional pre-built-image name end to
+end:
 
 - `.github/configs/nvidia-master.yaml` — `remote.aiperf-docker-image: <name:tag>`
 - `utils/matrix_logic/validation.py` — `RemoteConfig.aiperf_docker_image`
@@ -44,51 +43,45 @@ end to end:
   the image exists locally, it skips the pip install and marks the run to invoke
   AIPerf via `docker run <image>` instead.
 
-**Known limitation (not yet wired end to end).** With the current
-`launch_remote.sh`, the whole orchestration already runs *inside* the top-level
-`image:` container, so that `docker run <aiperf-image>` would be a **nested**
-docker call (Docker-in-Docker). That needs a `docker` CLI inside the serving
-image and the host's `/var/run/docker.sock` mounted into the container — neither
-of which `launch_remote.sh` sets up. Until that is addressed, leave
-`aiperf-docker-image` unset so the runner keeps the pip-install path.
-
-## Why not just point `image:` at the AIPerf image
-
-This is the clean idea: since `image:` is the client runtime in the remote path,
-set it to an AIPerf image that already has the client installed, and drop the
-per-job install entirely. It would also avoid pulling the heavy vLLM image onto
-the `benchmark-client` runner, which never serves a model in this path.
-
-The blocker is *which* AIPerf image. `make docker` in `utils/aiperf-mooncake`
-builds the default `runtime` target, which is **distroless**
-([Dockerfile](../utils/aiperf-mooncake/Dockerfile) `runtime` stage): it ships only
-`/bin/bash`, the AIPerf venv, and ffmpeg. It has no `mkdir`, `timeout`, `tee`,
-`id`, `git`, `curl`, or `sleep`. Its `ENTRYPOINT ["/bin/bash", "-c"]` is built to
-run a single `aiperf …` command string.
-
-But `image:` has to host the **whole** orchestration, not just AIPerf:
-`_remote_replay.sh` needs `mkdir`/`timeout`/`tee` and `python3` for result
-aggregation and `analyze_benchmark_distributions.py`; the pre-check and pip paths
-in `benchmark_lib.sh` need `curl`/`sleep`/`git`. The distroless image would fail
-on the first line. It is perfect for running a single AIPerf command, and
-unusable as the orchestration host.
-
-## Future work (decided: defer)
-
-Preferred direction, to avoid pulling the unused vLLM image on the remote client
-runner:
-
-1. Build a **full** AIPerf image instead of the distroless `runtime` target —
-   e.g. base it on the Dockerfile's `test`/`local-dev` (Debian) stage, or add
-   `coreutils`, `git`, and `curl` to a runtime variant. It must have AIPerf
-   pre-installed plus the shell utilities and `python3` the orchestration uses.
-2. Point the remote config's top-level `image:` at that full AIPerf image.
-3. Add a one-line "AIPerf already installed → skip the slow editable install"
-   bypass to `install_agentic_deps` (mirroring the reuse check `ensure_aiperf`
-   already has). AIPerf then runs directly in the container — no nested docker,
-   no Docker-in-Docker, results identical to the pip path since it is the same
-   AIPerf build.
-
-This is deferred for now. Until it lands, remote-replay configs continue to use
-the serving `image:` and pip-install AIPerf per job; leave `aiperf-docker-image`
-unset.
+**This field is inert and should not be used.** `runners/launch_remote.sh` passes
+an explicit allowlist of env vars into the container (`RUN_ENV`), which does not
+include `AIPERF_DOCKER_IMAGE` — so the value never reaches the job. Even if it
+did, the whole orchestration already runs *inside* the top-level `image:`
+container, so `docker run <aiperf-image>` would be a **nested** docker call
+(Docker-in-Docker), which would need a `docker` CLI inside the container and the
+host's `/var/run/docker.sock` mounted in — neither of which `launch_remote.sh`
+sets up. Use the full-image approach below instead.
+
+## The full-image approach (implemented)
+
+Since `image:` is already the client runtime in the remote path, point it
+directly at a pre-built AIPerf image and drop the per-job install entirely. This
+also avoids pulling the heavy vLLM image onto the `benchmark-client` runner,
+which never serves a model in this path.
+
+`make docker` in `utils/aiperf-mooncake` builds the default `runtime` target,
+which is **distroless** ([Dockerfile](../utils/aiperf-mooncake/Dockerfile)
+`runtime` stage). It's built around `ENTRYPOINT ["/bin/bash", "-c"]` to run a
+single `aiperf …` command, but it turns out to be enough to host the whole
+orchestration too: on top of `/bin/bash`, the AIPerf venv, and `python3`, the
+base distroless image ships busybox, which provides `mkdir`, `timeout`, `tee`,
+`sleep`, `id`, and `wget`. The only orchestration dependencies it's missing are
+`curl` (used only for the endpoint reachability pre-check) and `git` (used only
+by the pip-install path, which the full image skips). No rebuild needed:
+
+1. **`_probe_endpoint`** ([benchmark_lib.sh](../benchmarks/benchmark_lib.sh))
+   prefers `curl` and falls back to busybox `wget` when `curl` is absent.
+2. **`install_agentic_deps`** short-circuits with `command -v aiperf` (mirroring
+   the reuse check `ensure_aiperf` already has) — when the image already has
+   `aiperf` on `PATH`, the pip install is skipped entirely. Set
+   `AIPERF_FORCE_PIP_INSTALL=true` to force the source install anyway (e.g. to
+   pick up submodule changes not yet baked into the image).
+3. The remote config's top-level `image:` points at the pre-built AIPerf image
+   (e.g. `aiperf:0.8.0`) instead of the serving image, and
+   `remote.aiperf-docker-image` is removed.
+
+AIPerf then runs directly in the container — no nested docker, no
+Docker-in-Docker, results identical to the pip path since it's the same AIPerf
+build. Rebuilding/re-tagging the image is only needed to pick up
+`utils/aiperf-mooncake` submodule changes, since the full image pins whatever
+AIPerf version was baked in at build time.
diff --git a/docs/REMOTE_JOB_SUBMISSION_VI.md b/docs/REMOTE_JOB_SUBMISSION_VI.md
index 79ef6f6c6..693052513 100644
--- a/docs/REMOTE_JOB_SUBMISSION_VI.md
+++ b/docs/REMOTE_JOB_SUBMISSION_VI.md
@@ -196,20 +196,27 @@ set từ secret `REMOTE_ENDPOINT_API_KEY` khi `remote-url` khác rỗng (xem
 `.github/workflows/benchmark-tmpl.yml`). Nếu endpoint remote không cần API
 key, có thể để trống secret này — hệ thống sẽ fallback về giá trị `EMPTY`.
 
-## 7. Giới hạn hiện tại (aiperf-docker-image)
-
-Field `aiperf-docker-image` đã được plumbing đầy đủ từ config → matrix →
-workflow inputs → env var `AIPERF_DOCKER_IMAGE` → `install_agentic_deps` trong
-`benchmark_lib.sh`, nhưng **hiện tại không có tác dụng thực tế**:
-`runners/launch_remote.sh` (script khởi chạy container client) chưa forward
-biến `AIPERF_DOCKER_IMAGE` vào bên trong container, nên `install_agentic_deps`
-luôn thấy biến này unset và đi theo nhánh pip-install như cũ. Vì vậy:
-
-- Có thể khai báo `aiperf-docker-image` trong config mà **không gây lỗi hay
-  ảnh hưởng gì** đến job hiện tại (an toàn, nhưng vô tác dụng).
-- Cho tới khi tính năng này được nối dây đầy đủ (xem phần "Future work" trong
-  `docs/REMOTE_AIPERF_DOCKER.md`), mọi job remote đều sẽ tự cài AIPerf qua pip
-  trên mỗi lần chạy như bình thường.
+## 7. `aiperf-docker-image` đã deprecated — dùng full-image approach
+
+Field `remote.aiperf-docker-image` **vô tác dụng** (inert): `runners/launch_remote.sh`
+không forward biến `AIPERF_DOCKER_IMAGE` vào container, nên `install_agentic_deps`
+luôn thấy biến này unset. Kể cả khi forward được, cách này cần `docker run` lồng
+trong container đang chạy (Docker-in-Docker) — điều `launch_remote.sh` chưa hỗ trợ.
+Đừng dùng field này.
+
+Thay vào đó, dùng **full-image approach** (xem chi tiết trong
+`docs/REMOTE_AIPERF_DOCKER.md`): trỏ thẳng `image:` của config vào một image
+AIPerf đã build sẵn (`make docker` trong `utils/aiperf-mooncake`) thay vì image
+serving, và bỏ hẳn field `aiperf-docker-image`:
+
+- `install_agentic_deps` tự phát hiện `aiperf` đã có sẵn trên `PATH` (qua
+  `command -v aiperf`) và bỏ qua bước pip-install chậm.
+- `_probe_endpoint` (bước kiểm tra endpoint trước khi chạy) tự fallback sang
+  `wget` (busybox) nếu container không có `curl` — image distroless mặc định
+  không có `curl`.
+- Không cần Docker-in-Docker, không cần rebuild thêm gì khác.
+- Muốn ép chạy lại pip-install (ví dụ khi submodule `utils/aiperf-mooncake` có
+  thay đổi chưa được bake vào image), set `AIPERF_FORCE_PIP_INSTALL=true`.
 
 ## 8. Checklist nhanh
 

From 02b5617854d49200c5bf10cdcfaa9041b1803720 Mon Sep 17 00:00:00 2001
From: Noridom1 <thinhphuc2005@gmail.com>
Date: Fri, 3 Jul 2026 22:44:29 +0700
Subject: [PATCH 2/4] fix: run remote replay container as host runner user

The pre-built full AIPerf image (aiperf:0.8.0) is distroless and runs as
non-root UID 1000, so it could not create /workspace/results in the
bind-mounted workspace owned by the runner user (mkdir: Permission denied).
The old vllm serving image masked this by running as root.

Map the container to the host runner's uid/gid and point HOME at the
writable workspace so mkdir/results writes succeed and result files stay
runner-owned for the upload/cleanup steps.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 runners/launch_remote.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runners/launch_remote.sh b/runners/launch_remote.sh
index cb560d16e..d98cf14be 100755
--- a/runners/launch_remote.sh
+++ b/runners/launch_remote.sh
@@ -23,11 +23,19 @@ for v in "${RUN_ENV[@]}"; do
   ENV_ARGS+=(-e "$v")
 done
 
+# The pre-built full AIPerf image is distroless and runs as non-root UID 1000,
+# so it can't write into the bind-mounted workspace (owned by the runner user).
+# Map the container to the host runner's uid/gid so mkdir/results writes succeed
+# and result files stay runner-owned for the upload/cleanup steps. HOME=/app is
+# owned by 1000 and unwritable under the remapped uid, so point HOME at the
+# writable workspace (matplotlib is the only remaining HOME writer).
 docker run --rm \
   --init \
   --ipc=host \
   --network host \
   --shm-size=32g \
+  --user "$(id -u):$(id -g)" \
+  -e HOME=/workspace \
   -v "$GITHUB_WORKSPACE:/workspace" \
   -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
   -w /workspace \

From bf570d36359e5ab6adf22aa30f436ab05b410882 Mon Sep 17 00:00:00 2001
From: Noridom1 <thinhphuc2005@gmail.com>
Date: Fri, 3 Jul 2026 22:49:05 +0700
Subject: [PATCH 3/4] fix: use busybox-compatible timeout flags in remote
 replay

The pre-built full AIPerf image is distroless and ships busybox timeout,
which rejects GNU long options: `timeout: unrecognized option
'--signal=TERM'`. As a result timeout exited immediately, aiperf never
ran, and process_agentic_result.py failed with "profile_export.jsonl not
found".

Switch to the short flags `-s TERM -k 60`, accepted by both busybox and
GNU timeout, so the remote-replay path works on the full image and the
pip-install path alike.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/_remote_replay.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/agentic/_remote_replay.sh b/benchmarks/single_node/agentic/_remote_replay.sh
index 1890e8ea9..4f64ffb28 100755
--- a/benchmarks/single_node/agentic/_remote_replay.sh
+++ b/benchmarks/single_node/agentic/_remote_replay.sh
@@ -46,10 +46,14 @@ set +x
 # AIPERF_DATASET_CONFIGURATION_TIMEOUT (1800s, see build_replay_cmd) plus the
 # benchmark duration itself, so the default here leaves headroom above that.
 AIPERF_MAX_RUNTIME="${AIPERF_MAX_RUNTIME:-2400}"
+# Use the short flags -s/-k rather than --signal/--kill-after: the pre-built
+# full AIPerf image is distroless and ships busybox timeout, which only accepts
+# `timeout [-s SIG] [-k KILL_SECS] SECS PROG`. GNU timeout accepts these too, so
+# this is portable across both the full-image and pip-install paths.
 if [[ "$AIPERF_USE_DOCKER" == "true" ]]; then
-    timeout --signal=TERM --kill-after=60 "$AIPERF_MAX_RUNTIME" "${DOCKER_REPLAY_ARGS[@]}" 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
+    timeout -s TERM -k 60 "$AIPERF_MAX_RUNTIME" "${DOCKER_REPLAY_ARGS[@]}" 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
 else
-    timeout --signal=TERM --kill-after=60 "$AIPERF_MAX_RUNTIME" $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
+    timeout -s TERM -k 60 "$AIPERF_MAX_RUNTIME" $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
 fi
 replay_exit="${PIPESTATUS[0]}"
 set -x

From 9e249e40ab80e5382e64d104f54287dc1898e916 Mon Sep 17 00:00:00 2001
From: Noridom1 <thinhphuc2005@gmail.com>
Date: Fri, 3 Jul 2026 23:21:53 +0700
Subject: [PATCH 4/4] test: sweep remote Weka replay at conc 2/4/8

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index dee12294e..10eb1425b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9571,4 +9571,4 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke:
       num-dataset-entries: 949
       duration: 900
       search-space:
-      - { tp: 1, ep: 1, conc-list: [2, 4] }
+      - { tp: 1, ep: 1, conc-list: [2, 4, 8] }