From 1cf8cd04a4d74adfaa96f3ad895ba15e6f1da9de Mon Sep 17 00:00:00 2001 From: Noridom1 Date: Fri, 3 Jul 2026 22:28:01 +0700 Subject: [PATCH 1/4] feat: run remote agentic-replay from pre-built AIPerf image Point the remote multi-endpoint smoke config's image: at the pre-built aiperf:0.8.0 image instead of the vLLM serving image, so the client skips the per-job pip install. install_agentic_deps now short-circuits when aiperf is already on PATH, and the endpoint pre-check falls back to busybox wget when curl is unavailable (the distroless image ships wget but not curl). Also narrow the smoke config's conc-list to [2, 4]. Co-Authored-By: Claude Sonnet 5 --- .github/configs/nvidia-master.yaml | 5 +- benchmarks/benchmark_lib.sh | 18 ++++- docs/REMOTE_AIPERF_DOCKER.md | 101 ++++++++++++++--------------- docs/REMOTE_JOB_SUBMISSION_VI.md | 35 ++++++---- 4 files changed, 87 insertions(+), 72 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fa1518410..dee12294e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9546,7 +9546,7 @@ qwen3-4b-weka-bf16-bench-client-sglang-remote-smoke: # are flattened to aiperf's comma-separated multi-URL syntax; aiperf # round-robins requests across the model endpoints by default. 
deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke: - image: vllm/vllm-openai:v0.21.0 + image: aiperf:0.8.0 model: deepseek-coder-v2-lite-fp8 model-prefix: deepseek-coder-v2-lite-weka runner: benchmark-client @@ -9561,7 +9561,6 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke: - http://192.168.4.13:8000/metrics - http://192.168.4.13:8001/metrics gpu-telemetry-url: http://192.168.4.13:9400/metrics - aiperf-docker-image: aiperf:0.8.0 scenarios: agentic-replay: - custom-dataset-type: weka_trace @@ -9572,4 +9571,4 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke: num-dataset-entries: 949 duration: 900 search-space: - - { tp: 1, ep: 1, conc-list: [2, 4, 8, 16] } + - { tp: 1, ep: 1, conc-list: [2, 4] } diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 3a0729299..b11b6a390 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1329,6 +1329,16 @@ resolve_trace_source() { install_agentic_deps() { AIPERF_USE_DOCKER=false + # Full-image bypass: when the remote client runs from a pre-built full + # AIPerf image (the remote config points image: at it — see + # docs/REMOTE_AIPERF_DOCKER.md), aiperf is already installed. Skip the slow + # editable install; results are identical since it's the same build. Set + # AIPERF_FORCE_PIP_INSTALL=true to force the source install anyway. + if [[ "${AIPERF_FORCE_PIP_INSTALL:-}" != "true" ]] && command -v aiperf >/dev/null 2>&1; then + echo "[aiperf] aiperf already installed ($(command -v aiperf)); skipping pip install." 
+ return 0 + fi + # Opt-in bypass: if the runner already has a pre-built aiperf image # (see utils/aiperf-mooncake's `make docker`), skip the pip install # entirely instead of re-running the (slow, transformers-from-git) @@ -1388,7 +1398,13 @@ _probe_endpoint() { local url="$1" max_time="$2" retries="$3" attempt for (( attempt=1; attempt<=retries; attempt++ )); do - if curl --output /dev/null --silent --fail --max-time "$max_time" "$url"; then + if command -v curl >/dev/null 2>&1; then + if curl --output /dev/null --silent --fail --max-time "$max_time" "$url"; then + return 0 + fi + # The pre-built AIPerf image is distroless: it ships busybox wget but + # not curl. wget's exit status is a good-enough reachability signal. + elif wget -q -T "$max_time" -O /dev/null "$url"; then return 0 fi sleep 1 diff --git a/docs/REMOTE_AIPERF_DOCKER.md b/docs/REMOTE_AIPERF_DOCKER.md index 08ab2a2e1..7962d79f3 100644 --- a/docs/REMOTE_AIPERF_DOCKER.md +++ b/docs/REMOTE_AIPERF_DOCKER.md @@ -1,8 +1,7 @@ # Remote agentic-replay: running AIPerf from a pre-built image -How the remote-replay benchmark client runs, why it re-installs AIPerf on every -job today, and the future-work plan for using a pre-built AIPerf image on the -`benchmark-client` runner to skip that install. +How the remote-replay benchmark client runs, and how to use a pre-built full +AIPerf image on the `benchmark-client` runner to skip the per-job install. ## How the remote path runs the client @@ -31,10 +30,10 @@ which pip-installs AIPerf from the `utils/aiperf-mooncake` submodule on **every job**. The slow part is the editable install of AIPerf plus its transformers-from-git dependency. 
-## The `aiperf-docker-image` config option +## The `aiperf-docker-image` config option (deprecated, inert) -The config schema and CI plumbing already carry an optional pre-built-image name -end to end: +The config schema and CI plumbing carry an optional pre-built-image name end to +end: - `.github/configs/nvidia-master.yaml` — `remote.aiperf-docker-image: ` - `utils/matrix_logic/validation.py` — `RemoteConfig.aiperf_docker_image` @@ -44,51 +43,45 @@ end to end: the image exists locally, it skips the pip install and marks the run to invoke AIPerf via `docker run ` instead. -**Known limitation (not yet wired end to end).** With the current -`launch_remote.sh`, the whole orchestration already runs *inside* the top-level -`image:` container, so that `docker run ` would be a **nested** -docker call (Docker-in-Docker). That needs a `docker` CLI inside the serving -image and the host's `/var/run/docker.sock` mounted into the container — neither -of which `launch_remote.sh` sets up. Until that is addressed, leave -`aiperf-docker-image` unset so the runner keeps the pip-install path. - -## Why not just point `image:` at the AIPerf image - -This is the clean idea: since `image:` is the client runtime in the remote path, -set it to an AIPerf image that already has the client installed, and drop the -per-job install entirely. It would also avoid pulling the heavy vLLM image onto -the `benchmark-client` runner, which never serves a model in this path. - -The blocker is *which* AIPerf image. `make docker` in `utils/aiperf-mooncake` -builds the default `runtime` target, which is **distroless** -([Dockerfile](../utils/aiperf-mooncake/Dockerfile) `runtime` stage): it ships only -`/bin/bash`, the AIPerf venv, and ffmpeg. It has no `mkdir`, `timeout`, `tee`, -`id`, `git`, `curl`, or `sleep`. Its `ENTRYPOINT ["/bin/bash", "-c"]` is built to -run a single `aiperf …` command string. 
- -But `image:` has to host the **whole** orchestration, not just AIPerf: -`_remote_replay.sh` needs `mkdir`/`timeout`/`tee` and `python3` for result -aggregation and `analyze_benchmark_distributions.py`; the pre-check and pip paths -in `benchmark_lib.sh` need `curl`/`sleep`/`git`. The distroless image would fail -on the first line. It is perfect for running a single AIPerf command, and -unusable as the orchestration host. - -## Future work (decided: defer) - -Preferred direction, to avoid pulling the unused vLLM image on the remote client -runner: - -1. Build a **full** AIPerf image instead of the distroless `runtime` target — - e.g. base it on the Dockerfile's `test`/`local-dev` (Debian) stage, or add - `coreutils`, `git`, and `curl` to a runtime variant. It must have AIPerf - pre-installed plus the shell utilities and `python3` the orchestration uses. -2. Point the remote config's top-level `image:` at that full AIPerf image. -3. Add a one-line "AIPerf already installed → skip the slow editable install" - bypass to `install_agentic_deps` (mirroring the reuse check `ensure_aiperf` - already has). AIPerf then runs directly in the container — no nested docker, - no Docker-in-Docker, results identical to the pip path since it is the same - AIPerf build. - -This is deferred for now. Until it lands, remote-replay configs continue to use -the serving `image:` and pip-install AIPerf per job; leave `aiperf-docker-image` -unset. +**This field is inert and should not be used.** `runners/launch_remote.sh` passes +an explicit allowlist of env vars into the container (`RUN_ENV`), which does not +include `AIPERF_DOCKER_IMAGE` — so the value never reaches the job. 
Even if it +did, the whole orchestration already runs *inside* the top-level `image:` +container, so `docker run <image>` would be a **nested** docker call +(Docker-in-Docker), which would need a `docker` CLI inside the container and the +host's `/var/run/docker.sock` mounted in — neither of which `launch_remote.sh` +sets up. Use the full-image approach below instead. + +## The full-image approach (implemented) + +Since `image:` is already the client runtime in the remote path, point it +directly at a pre-built AIPerf image and drop the per-job install entirely. This +also avoids pulling the heavy vLLM image onto the `benchmark-client` runner, +which never serves a model in this path. + +`make docker` in `utils/aiperf-mooncake` builds the default `runtime` target, +which is **distroless** ([Dockerfile](../utils/aiperf-mooncake/Dockerfile) +`runtime` stage). It's built around `ENTRYPOINT ["/bin/bash", "-c"]` to run a +single `aiperf …` command, but it turns out to be enough to host the whole +orchestration too: on top of `/bin/bash`, the AIPerf venv, and `python3`, the +base distroless image ships busybox, which provides `mkdir`, `timeout`, `tee`, +`sleep`, `id`, and `wget`. The only orchestration dependencies it's missing are +`curl` (used only for the endpoint reachability pre-check) and `git` (used only +by the pip-install path, which the full image skips). No rebuild needed: + +1. **`_probe_endpoint`** ([benchmark_lib.sh](../benchmarks/benchmark_lib.sh)) + prefers `curl` and falls back to busybox `wget` when `curl` is absent. +2. **`install_agentic_deps`** short-circuits with `command -v aiperf` (mirroring + the reuse check `ensure_aiperf` already has) — when the image already has + `aiperf` on `PATH`, the pip install is skipped entirely. Set + `AIPERF_FORCE_PIP_INSTALL=true` to force the source install anyway (e.g. to + pick up submodule changes not yet baked into the image). +3. The remote config's top-level `image:` points at the pre-built AIPerf image + (e.g.
`aiperf:0.8.0`) instead of the serving image, and + `remote.aiperf-docker-image` is removed. + +AIPerf then runs directly in the container — no nested docker, no +Docker-in-Docker, results identical to the pip path since it's the same AIPerf +build. Rebuilding/re-tagging the image is only needed to pick up +`utils/aiperf-mooncake` submodule changes, since the full image pins whatever +AIPerf version was baked in at build time. diff --git a/docs/REMOTE_JOB_SUBMISSION_VI.md b/docs/REMOTE_JOB_SUBMISSION_VI.md index 79ef6f6c6..693052513 100644 --- a/docs/REMOTE_JOB_SUBMISSION_VI.md +++ b/docs/REMOTE_JOB_SUBMISSION_VI.md @@ -196,20 +196,27 @@ set từ secret `REMOTE_ENDPOINT_API_KEY` khi `remote-url` khác rỗng (xem `.github/workflows/benchmark-tmpl.yml`). Nếu endpoint remote không cần API key, có thể để trống secret này — hệ thống sẽ fallback về giá trị `EMPTY`. -## 7. Giới hạn hiện tại (aiperf-docker-image) - -Field `aiperf-docker-image` đã được plumbing đầy đủ từ config → matrix → -workflow inputs → env var `AIPERF_DOCKER_IMAGE` → `install_agentic_deps` trong -`benchmark_lib.sh`, nhưng **hiện tại không có tác dụng thực tế**: -`runners/launch_remote.sh` (script khởi chạy container client) chưa forward -biến `AIPERF_DOCKER_IMAGE` vào bên trong container, nên `install_agentic_deps` -luôn thấy biến này unset và đi theo nhánh pip-install như cũ. Vì vậy: - -- Có thể khai báo `aiperf-docker-image` trong config mà **không gây lỗi hay - ảnh hưởng gì** đến job hiện tại (an toàn, nhưng vô tác dụng). -- Cho tới khi tính năng này được nối dây đầy đủ (xem phần "Future work" trong - `docs/REMOTE_AIPERF_DOCKER.md`), mọi job remote đều sẽ tự cài AIPerf qua pip - trên mỗi lần chạy như bình thường. +## 7. `aiperf-docker-image` đã deprecated — dùng full-image approach + +Field `remote.aiperf-docker-image` **vô tác dụng** (inert): `runners/launch_remote.sh` +không forward biến `AIPERF_DOCKER_IMAGE` vào container, nên `install_agentic_deps` +luôn thấy biến này unset. 
Kể cả khi forward được, cách này cần `docker run` lồng +trong container đang chạy (Docker-in-Docker) — điều `launch_remote.sh` chưa hỗ trợ. +Đừng dùng field này. + +Thay vào đó, dùng **full-image approach** (xem chi tiết trong +`docs/REMOTE_AIPERF_DOCKER.md`): trỏ thẳng `image:` của config vào một image +AIPerf đã build sẵn (`make docker` trong `utils/aiperf-mooncake`) thay vì image +serving, và bỏ hẳn field `aiperf-docker-image`: + +- `install_agentic_deps` tự phát hiện `aiperf` đã có sẵn trên `PATH` (qua + `command -v aiperf`) và bỏ qua bước pip-install chậm. +- `_probe_endpoint` (bước kiểm tra endpoint trước khi chạy) tự fallback sang + `wget` (busybox) nếu container không có `curl` — image distroless mặc định + không có `curl`. +- Không cần Docker-in-Docker, không cần rebuild thêm gì khác. +- Muốn ép chạy lại pip-install (ví dụ khi submodule `utils/aiperf-mooncake` có + thay đổi chưa được bake vào image), set `AIPERF_FORCE_PIP_INSTALL=true`. ## 8. Checklist nhanh From 02b5617854d49200c5bf10cdcfaa9041b1803720 Mon Sep 17 00:00:00 2001 From: Noridom1 Date: Fri, 3 Jul 2026 22:44:29 +0700 Subject: [PATCH 2/4] fix: run remote replay container as host runner user The pre-built full AIPerf image (aiperf:0.8.0) is distroless and runs as non-root UID 1000, so it could not create /workspace/results in the bind-mounted workspace owned by the runner user (mkdir: Permission denied). The old vllm serving image masked this by running as root. Map the container to the host runner's uid/gid and point HOME at the writable workspace so mkdir/results writes succeed and result files stay runner-owned for the upload/cleanup steps. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_remote.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/runners/launch_remote.sh b/runners/launch_remote.sh index cb560d16e..d98cf14be 100755 --- a/runners/launch_remote.sh +++ b/runners/launch_remote.sh @@ -23,11 +23,19 @@ for v in "${RUN_ENV[@]}"; do ENV_ARGS+=(-e "$v") done +# The pre-built full AIPerf image is distroless and runs as non-root UID 1000, +# so it can't write into the bind-mounted workspace (owned by the runner user). +# Map the container to the host runner's uid/gid so mkdir/results writes succeed +# and result files stay runner-owned for the upload/cleanup steps. HOME=/app is +# owned by 1000 and unwritable under the remapped uid, so point HOME at the +# writable workspace (matplotlib is the only remaining HOME writer). docker run --rm \ --init \ --ipc=host \ --network host \ --shm-size=32g \ + --user "$(id -u):$(id -g)" \ + -e HOME=/workspace \ -v "$GITHUB_WORKSPACE:/workspace" \ -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \ -w /workspace \ From bf570d36359e5ab6adf22aa30f436ab05b410882 Mon Sep 17 00:00:00 2001 From: Noridom1 Date: Fri, 3 Jul 2026 22:49:05 +0700 Subject: [PATCH 3/4] fix: use busybox-compatible timeout flags in remote replay The pre-built full AIPerf image is distroless and ships busybox timeout, which rejects GNU long options: `timeout: unrecognized option '--signal=TERM'`. As a result timeout exited immediately, aiperf never ran, and process_agentic_result.py failed with "profile_export.jsonl not found". Switch to the short flags `-s TERM -k 60`, accepted by both busybox and GNU timeout, so the remote-replay path works on the full image and the pip-install path alike. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/single_node/agentic/_remote_replay.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/_remote_replay.sh b/benchmarks/single_node/agentic/_remote_replay.sh index 1890e8ea9..4f64ffb28 100755 --- a/benchmarks/single_node/agentic/_remote_replay.sh +++ b/benchmarks/single_node/agentic/_remote_replay.sh @@ -46,10 +46,14 @@ set +x # AIPERF_DATASET_CONFIGURATION_TIMEOUT (1800s, see build_replay_cmd) plus the # benchmark duration itself, so the default here leaves headroom above that. AIPERF_MAX_RUNTIME="${AIPERF_MAX_RUNTIME:-2400}" +# Use the short flags -s/-k rather than --signal/--kill-after: the pre-built +# full AIPerf image is distroless and ships busybox timeout, which only accepts +# `timeout [-s SIG] [-k KILL_SECS] SECS PROG`. GNU timeout accepts these too, so +# this is portable across both the full-image and pip-install paths. if [[ "$AIPERF_USE_DOCKER" == "true" ]]; then - timeout --signal=TERM --kill-after=60 "$AIPERF_MAX_RUNTIME" "${DOCKER_REPLAY_ARGS[@]}" 2>&1 | tee "$RESULT_DIR/benchmark.log" || true + timeout -s TERM -k 60 "$AIPERF_MAX_RUNTIME" "${DOCKER_REPLAY_ARGS[@]}" 2>&1 | tee "$RESULT_DIR/benchmark.log" || true else - timeout --signal=TERM --kill-after=60 "$AIPERF_MAX_RUNTIME" $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true + timeout -s TERM -k 60 "$AIPERF_MAX_RUNTIME" $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true fi replay_exit="${PIPESTATUS[0]}" set -x From 9e249e40ab80e5382e64d104f54287dc1898e916 Mon Sep 17 00:00:00 2001 From: Noridom1 Date: Fri, 3 Jul 2026 23:21:53 +0700 Subject: [PATCH 4/4] test: sweep remote Weka replay at conc 2/4/8 Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dee12294e..10eb1425b 100644 --- 
a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9571,4 +9571,4 @@ deepseek-coder-v2-lite-weka-fp8-bench-client-vllm-remote-multi-endpoint-smoke: num-dataset-entries: 949 duration: 900 search-space: - - { tp: 1, ep: 1, conc-list: [2, 4] } + - { tp: 1, ep: 1, conc-list: [2, 4, 8] }