diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml index 7ddaca285c..489bca7888 100644 --- a/.github/workflows/collectivex-sweep.yml +++ b/.github/workflows/collectivex-sweep.yml @@ -1,71 +1,300 @@ # CollectiveX Sweep — one structured run instead of thousands of dispatches. # -# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs -# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that -# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked -# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job -# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't -# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer / -# deepep-hybrid / nccl-ep, + deepep_v2) for full parity. +# Shape: setup -> sweep. The matrix unit is a shard: one allocation that sweeps +# cases sharing (sku, backend, nodes). Each cell uploads its privacy-checked raw +# result JSONs. The isolated v1 publisher consumes downloaded shards separately. 
name: CollectiveX Sweep +permissions: + contents: read on: workflow_dispatch: inputs: backend: - description: EP library to sweep (deepep matrix is remapped onto the others, capability-filtered) + description: "EP library to sweep — 'all' runs every EP backend in one matrix" type: choice - default: deepep - options: [deepep, uccl, flashinfer, deepep-hybrid, nccl-ep] - deepep_v2: - description: DeepEP V2 from-source kernels (kernel_gen=v2; deepep backend only) - type: boolean - default: false + default: all + options: [all, deepep, deepep-v2, uccl, deepep-hybrid, mori, nccl-ep] suites: description: "'all' or comma-list of suite names" type: string default: all only_sku: - description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all + description: Restrict to one GHA runner pool (h100-dgxc|h200-dgxc|b300|b200-dgxc|gb200|gb300|mi325x|mi355x); blank = all + type: string + default: '' + min_nodes: + description: Keep only shards with >= this tray count (2 = rack-scale EP8 only; blank = all) + type: string + default: '' + max_nodes: + description: Keep only shards with <= this tray count (1 = single-tray EP4 only; blank = all) type: string default: '' max_cases: - description: Max cases per shard cell (chunk larger shards) + description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites) type: string - default: '14' - + default: '128' + diagnostic_execution: + description: Temporary retained-log execution identity; blank runs the benchmark + type: string + default: '' concurrency: - group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }} + group: ${{ inputs.diagnostic_execution != '' && format('cx-diagnostic-{0}', inputs.diagnostic_execution) || format('cx-sweep-{0}-{1}-{2}', github.ref, inputs.backend, inputs.only_sku) }} cancel-in-progress: false jobs: + diagnostic: + if: ${{ inputs.diagnostic_execution != '' }} + runs-on: ${{ 'h100-dgxc' 
}} + timeout-minutes: 5 + env: + EXECUTION_ID: ${{ inputs.diagnostic_execution }} + steps: + - name: Classify retained private log without disclosing it + run: | + python3 - <<'PY' + import hashlib + import json + import os + import re + import stat + + execution = os.environ.get("EXECUTION_ID", "") + expected = "28706865182_1_h100-dgxc-deepep-v2-n1" + if execution != expected: + raise SystemExit("invalid diagnostic request") + root = f"/tmp/inferencex-collectivex-{os.getuid()}/{expected}" + try: + root_fd = os.open( + root, os.O_RDONLY | os.O_DIRECTORY | os.O_CLOEXEC | os.O_NOFOLLOW + ) + except OSError: + raise SystemExit("retained diagnostic unavailable") from None + metadata = os.fstat(root_fd) + if ( + not stat.S_ISDIR(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != 0o700 + ): + raise SystemExit("private diagnostic directory is unsafe") + + native_status = {} + native_sites = {} + exceptions = {} + trace_sites = {} + terms = {} + digests = [] + total = 0 + logs = 0 + for name in sorted(os.listdir(root_fd)): + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]{0,127}[.]log", name): + continue + fd = -1 + try: + fd = os.open( + name, + os.O_RDONLY | os.O_CLOEXEC | os.O_NOFOLLOW, + dir_fd=root_fd, + ) + item = os.fstat(fd) + if ( + not stat.S_ISREG(item.st_mode) + or item.st_uid != os.getuid() + or stat.S_IMODE(item.st_mode) & 0o077 + or item.st_nlink != 1 + or item.st_size > 64 * 1024 * 1024 + ): + raise RuntimeError + chunks = [] + remaining = item.st_size + while remaining: + chunk = os.read(fd, min(1024 * 1024, remaining)) + if not chunk: + raise RuntimeError + chunks.append(chunk) + remaining -= len(chunk) + if os.read(fd, 1): + raise RuntimeError + payload = b"".join(chunks) + except (OSError, RuntimeError): + raise SystemExit("retained diagnostic validation failed") from None + finally: + if fd >= 0: + os.close(fd) + logs += 1 + total += len(payload) + digests.append(hashlib.sha256(payload).digest()) + for line in 
payload.splitlines(): + lower = line.lower() + match = re.search(rb" exception \([^()\n]*:[0-9]{1,6}\):\s*([0-9]{1,6})", line) + if match: + key = match.group(1).decode("ascii") + native_status[key] = native_status.get(key, 0) + 1 + native_site = re.search( + rb" exception \([^()\n]*/(nccl[.]cu):([0-9]{1,6})\):\s*([0-9]{1,6})", + line, + ) + if native_site: + key = ":".join( + part.decode("ascii") for part in native_site.groups() + ) + native_sites[key] = native_sites.get(key, 0) + 1 + for found in re.finditer( + rb"(? artifact for the cells; slim (no cases) -> the strategy output. - python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $ov $v2 $os --out matrix_full.json >/dev/null - SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))") - echo "matrix=$SLIM" >> "$GITHUB_OUTPUT" - echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT" - python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))" + args=(--suites "$INPUT_SUITES" --max-cases "$INPUT_MAX_CASES") + case "$INPUT_BACKEND" in + all) args+=(--backends all) ;; + *) args+=(--backend "$INPUT_BACKEND") ;; + esac + [ -n "$INPUT_ONLY_SKU" ] && args+=(--only-sku "$INPUT_ONLY_SKU") + [ -n "$INPUT_MIN_NODES" ] && args+=(--min-nodes "$INPUT_MIN_NODES") + [ -n "$INPUT_MAX_NODES" ] && args+=(--max-nodes "$INPUT_MAX_NODES") + python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null + python3 artifact_safety.py matrix_full.json + SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='case_ids'} for x in m['include']]}))") + { + echo "matrix=$SLIM" + echo "n=$(python3 -c "import 
json;print(len(json.load(open('matrix_full.json'))['include']))")" + echo "source_backends=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(' '.join(sorted({x['backend'] for x in m['include']} & {'deepep-v2','deepep-hybrid'})))")" + } >> "$GITHUB_OUTPUT" + unsupported_n=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(sum(x['disposition']=='unsupported' for x in m['requested_cases']))") + echo "unsupported_n=$unsupported_n" >> "$GITHUB_OUTPUT" + if [ "$unsupported_n" -gt 0 ]; then + python3 sweep_matrix.py --emit-unsupported-from matrix_full.json \ + --out-dir unsupported + fi + python3 -c "import json;m=json.load(open('matrix_full.json'));r=m['requested_cases'];print('shard-cells:',len(m['include']),'runnable:',sum(x['disposition']=='runnable' for x in r),'unsupported:',sum(x['disposition']=='unsupported' for x in r))" + - name: Prepare pinned backend source archive + if: ${{ steps.gen.outputs.source_backends != '' }} + working-directory: experimental/CollectiveX + env: + SOURCE_BACKENDS: ${{ steps.gen.outputs.source_backends }} + COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_sources + run: | + set -euo pipefail + source runtime/common.sh + work="$RUNNER_TEMP/collectivex-backend-sources" + archive="$RUNNER_TEMP/collectivex-backend-sources.tar" + rm -rf -- "$work" "$archive" + umask 077 + mkdir -m 700 "$work" + mkdir -p "$work/experimental/CollectiveX" + read -r -a backends <<< "$SOURCE_BACKENDS" + [ "${#backends[@]}" -gt 0 ] + for backend in "${backends[@]}"; do + cx_prepare_backend_source "$work" "$backend" + done + cx_cleanup_private_logs 0 + tar --sort=name --mtime='@1' --owner=0 --group=0 --numeric-owner \ + -C "$work/experimental/CollectiveX" -cf "$archive" .cx_sources + sha256sum "$archive" + rm -rf -- "$work" + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: ${{ steps.gen.outputs.source_backends != '' }} + with: + name: cxbackend-sources-${{ 
github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-backend-sources.tar + if-no-files-found: error + retention-days: 3 - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: cxsweep-matrix-${{ github.run_id }} + name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }} path: experimental/CollectiveX/matrix_full.json if-no-files-found: error + - name: Validate unsupported artifact safety + id: unsupported_safety + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 }} + run: | + python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/unsupported/*.json + - name: Validate unsupported outcomes + id: unsupported_contracts + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 && steps.unsupported_safety.outcome == 'success' }} + env: + COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} + COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported + run: | + python3 experimental/CollectiveX/contracts.py validate-delivery \ + --source experimental/CollectiveX/matrix_full.json \ + --disposition unsupported \ + experimental/CollectiveX/unsupported/*.json + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 && steps.unsupported_contracts.outcome == 'success' && steps.unsupported_safety.outcome == 'success' }} + with: + name: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} + path: experimental/CollectiveX/unsupported/*.json + if-no-files-found: error # ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ---- sweep: @@ -73,82 +302,248 @@ jobs: if: ${{ fromJSON(needs.setup.outputs.n) > 0 }} strategy: fail-fast: false - max-parallel: 10 # don't saturate the ~20-runner fleet; cells queue as slots free + max-parallel: 10 matrix: ${{ 
fromJSON(needs.setup.outputs.matrix) }} - # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental). - runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }} + runs-on: ${{ matrix.sku }} timeout-minutes: 350 env: CX_BENCH: ${{ matrix.backend }} - CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }} CX_NODES: ${{ matrix.nodes }} - CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json + CX_GPUS_PER_NODE: ${{ matrix.gpus_per_node }} + CX_SCALE_UP_DOMAIN: ${{ matrix.scale_up_domain }} + CX_SHARD_FILE: .shards/${{ matrix.id }}.json + CX_SHARD_SKU: ${{ matrix.sku }} + COLLECTIVEX_CANONICAL_GHA: '1' COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} - CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} - CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + COLLECTIVEX_ARTIFACT_NAME: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }} + # Consolidated shards run one bounded build-group in one Slurm allocation, so + # the launcher's default 45-min --time is too short. 300 min covers a cold + # compute-node image import plus the shard. The allocation releases early + # when the shard finishes, so short shards don't waste it. 
+ CX_TIME: '300' + COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.id }} + CX_JOB_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }} + CX_SOURCE_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/source + HOME: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/home steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 - with: { clean: true } + - name: Prepare isolated source + id: source + env: + COLLECTIVEX_REPOSITORY: ${{ github.repository }} + run: | + set -euo pipefail + python3 - <<'PY' + import os + import re + import shutil + import stat + import time + + pattern = re.compile(r"inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+") + cutoff = time.time() - 86400 + for entry in os.scandir("/tmp"): + if not pattern.fullmatch(entry.name): + continue + try: + metadata = entry.stat(follow_symlinks=False) + except FileNotFoundError: + continue + if ( + not stat.S_ISDIR(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != 0o700 + or metadata.st_mtime >= cutoff + ): + continue + marked = False + for marker_name in ("cleanup-safe", "cleanup-unsafe"): + try: + marker = os.stat( + os.path.join(entry.path, marker_name), follow_symlinks=False + ) + except FileNotFoundError: + continue + marked = ( + stat.S_ISREG(marker.st_mode) + and marker.st_uid == os.getuid() + and stat.S_IMODE(marker.st_mode) == 0o600 + ) + if marked: + break + if marked: + shutil.rmtree(entry.path) + PY + [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + || { echo "CollectiveX isolated root is invalid" >&2; exit 1; } + [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \ + || { echo "CollectiveX source root is invalid" >&2; exit 1; } + if [ -e "$CX_JOB_ROOT" ] || [ -L "$CX_JOB_ROOT" ]; then + echo "CollectiveX isolated root 
already exists" >&2 + exit 1 + fi + umask 077 + mkdir -m 700 -- "$CX_JOB_ROOT" + trap 'rc=$?; [ "$rc" = 0 ] || rm -rf -- "$CX_JOB_ROOT"; exit "$rc"' EXIT + mkdir -m 700 -- "$HOME" "$CX_JOB_ROOT/control" "$CX_JOB_ROOT/artifact" "$CX_SOURCE_ROOT" + : > "$CX_JOB_ROOT/cleanup-safe" + if ! { + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null git init -q "$CX_SOURCE_ROOT" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" remote add origin \ + "https://github.com/${COLLECTIVEX_REPOSITORY}.git" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" -c credential.helper= -c protocol.version=2 \ + fetch -q --no-tags --depth=1 origin "$COLLECTIVEX_SOURCE_SHA" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" -c advice.detachedHead=false \ + checkout -q --detach FETCH_HEAD + [ "$(git -C "$CX_SOURCE_ROOT" rev-parse HEAD)" = "$COLLECTIVEX_SOURCE_SHA" ] + } </dev/null >/dev/null 2>&1; then + echo "CollectiveX source preparation failed" >&2 + exit 1 + fi + [ "$(stat -c '%a' "$CX_JOB_ROOT")" = 700 ] \ + || { echo "CollectiveX isolated root has unsafe permissions" >&2; exit 1; } + echo 'prepared=true' >> "$GITHUB_OUTPUT" + trap - EXIT - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: - name: cxsweep-matrix-${{ github.run_id }} - path: experimental/CollectiveX - - name: Extract this shard's cases (stdlib only — no runner deps) - working-directory: experimental/CollectiveX + name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ env.CX_JOB_ROOT }}/control + - name: Download pinned backend source archive + if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }} + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ env.CX_JOB_ROOT }}/control + - name: Install pinned backend source seed + if: ${{ 
matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }} + env: + EXPECTED_BACKEND: ${{ matrix.backend }} run: | set -euo pipefail - python3 -c " - import json - m=json.load(open('matrix_full.json')) - s=[x for x in m['include'] if x['id']=='${{ matrix.id }}'] - assert s, 'shard ${{ matrix.id }} not in matrix' - s=s[0] - json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w')) - print('shard ${{ matrix.id }}:', len(s['cases']), 'cases') - " + archive="$CX_JOB_ROOT/control/collectivex-backend-sources.tar" + destination="$CX_SOURCE_ROOT/experimental/CollectiveX" + seed_root="$destination/.cx_sources" + [ -f "$archive" ] && [ ! -e "$seed_root" ] && [ ! -L "$seed_root" ] + python3 - "$archive" <<'PY' + from pathlib import PurePosixPath + import sys + import tarfile + + with tarfile.open(sys.argv[1]) as archive: + for member in archive.getmembers(): + path = PurePosixPath(member.name) + if ( + not path.parts + or path.parts[0] != ".cx_sources" + or ".." 
in path.parts + or member.issym() + or member.islnk() + or member.isdev() + ): + raise SystemExit("invalid backend source archive") + PY + umask 077 + tar --extract --no-same-owner --no-same-permissions \ + --file "$archive" --directory "$destination" + source "$destination/runtime/common.sh" + source_path="$(cx_backend_source_path "$seed_root" "$EXPECTED_BACKEND")" + cx_backend_source_is_valid "$EXPECTED_BACKEND" "$source_path" + printf 'CX_BACKEND_SOURCE_SEED_ROOT=%s\n' "$seed_root" >> "$GITHUB_ENV" + - name: Extract and validate this shard's cases + run: | + set -euo pipefail + cd "$CX_SOURCE_ROOT/experimental/CollectiveX" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 sweep_matrix.py \ + --extract-from "$CX_JOB_ROOT/control/matrix_full.json" \ + --shard-id '${{ matrix.id }}' \ + --expect-sku '${{ matrix.sku }}' \ + --expect-backend '${{ matrix.backend }}' \ + --expect-nodes '${{ matrix.nodes }}' \ + --out '${{ env.CX_SHARD_FILE }}' >/dev/null - name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation) + id: sweep_shard env: - RUNNER_NAME: ${{ runner.name }} - run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + COLLECTIVEX_OPERATOR_CONFIG_CONTENT: ${{ secrets.COLLECTIVEX_OPERATOR_CONFIG_V1 }} + COLLECTIVEX_OPERATOR_CONFIG_REQUIRED: '1' + run: | + set -euo pipefail + umask 077 + : > "$CX_JOB_ROOT/cleanup-unsafe" + rm -f -- "$CX_JOB_ROOT/cleanup-safe" + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + bash "experimental/CollectiveX/launchers/launch_${{ matrix.launcher }}.sh" + - name: Confirm allocation cleanup + id: allocation_cleanup + if: ${{ always() && steps.source.outputs.prepared == 'true' }} + run: | + set -euo pipefail + [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! 
-e "$CX_JOB_ROOT/cleanup-unsafe" ] \ + || { echo "CollectiveX allocation cleanup was not confirmed" >&2; exit 1; } + - name: Validate shard artifact safety + id: artifact_safety + if: ${{ always() && steps.allocation_cleanup.outcome == 'success' }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json + - name: Validate shard delivery completeness + id: delivery_contracts + if: ${{ always() && steps.artifact_safety.outcome == 'success' }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 experimental/CollectiveX/contracts.py validate-delivery \ + --source "experimental/CollectiveX/${CX_SHARD_FILE}" \ + experimental/CollectiveX/results/*.json - name: Shard summary - if: always() - run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true + if: ${{ always() && steps.artifact_safety.outcome == 'success' && steps.delivery_contracts.outcome == 'success' }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 experimental/CollectiveX/summarize.py \ + --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true + - name: Stage shard artifact + id: stage_artifact + if: ${{ always() && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success' }} + run: | + set -euo pipefail + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + cp -- experimental/CollectiveX/results/*.json "$CX_JOB_ROOT/artifact/" - name: Upload shard results - if: always() + id: upload_artifact + if: always() && steps.stage_artifact.outcome == 'success' && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome 
== 'success' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: cxshard-${{ matrix.id }}-${{ github.run_id }} - path: experimental/CollectiveX/results/*.json # glob skips the hidden .shard_*.json - if-no-files-found: warn - - # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ---- - aggregate: - needs: sweep - if: always() - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 - with: { clean: true } - - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - pattern: cxshard-*-${{ github.run_id }} - path: _shards - merge-multiple: true - - name: Aggregate shards -> one ndjson - working-directory: experimental/CollectiveX + name: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }} + path: | + ${{ env.CX_JOB_ROOT }}/artifact/*.json + if-no-files-found: error + - name: Cleanup isolated workspace + if: ${{ always() && steps.source.outputs.prepared == 'true' }} run: | set -euo pipefail - tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}" - python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson" - { - echo "## CollectiveX sweep aggregate (${tag})" - echo '```' - wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - name: Upload aggregate - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }} - path: experimental/CollectiveX/results/aggregate/*.ndjson - if-no-files-found: warn + [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + || { echo "CollectiveX cleanup root is invalid" >&2; exit 1; } + [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \ + || { echo "CollectiveX 
cleanup source is invalid" >&2; exit 1; } + [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! -e "$CX_JOB_ROOT/cleanup-unsafe" ] \ + || { echo "CollectiveX allocation cleanup was not confirmed; retaining isolated files" >&2; exit 1; } + if [ '${{ steps.sweep_shard.outcome }}' = success ] \ + && [ '${{ steps.allocation_cleanup.outcome }}' = success ] \ + && [ '${{ steps.artifact_safety.outcome }}' = success ] \ + && [ '${{ steps.delivery_contracts.outcome }}' = success ] \ + && [ '${{ steps.stage_artifact.outcome }}' = success ] \ + && [ '${{ steps.upload_artifact.outcome }}' = success ] \ + && [ -f "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" ]; then + # shellcheck source=/dev/null + if source "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" \ + >/dev/null 2>&1; then + cx_cleanup_private_logs 0 + fi + fi + rm -rf -- "$CX_JOB_ROOT" diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore new file mode 100644 index 0000000000..56b307215b --- /dev/null +++ b/experimental/CollectiveX/.gitignore @@ -0,0 +1,15 @@ +__pycache__/ +*.pyc +results/ +unsupported/ +.shards/ +.cx_workloads/ +.cx_backend/ +/matrix_full.json +gpucore.* + +# Local plans and infrastructure inventory. +goal.md +notes.md +configs/platforms.yaml +private-infra.md diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md new file mode 100644 index 0000000000..bd01428974 --- /dev/null +++ b/experimental/CollectiveX/README.md @@ -0,0 +1,115 @@ +# CollectiveX + +
+ +**English** | [中文](./README_zh.md) + +
+ +CollectiveX is an experimental MoE expert-parallel communication benchmark. It measures dispatch, +combine, and paired roundtrip latency across EP libraries and accelerator systems. + +> Publication hold: historical schema 3-5 data is diagnostic. No current dataset is approved for +> rankings, recommendations, or regression baselines. + +## v1 Execution Profile + +Every scheduled case is BF16, normal mode, `layout-and-dispatch-v1`, backend-tuned resources, packed +placement, and `fixed-512-v1` sampling: 64 trials x 8 timed iterations with 32 synchronized full +roundtrip warmups before each measured component at every trial/point. Roundtrip is measured first, +and every backend uses the same phase-specific conditioning ramp and ascending point order. Routing is limited +to uniform and one Zipf sensitivity; EPLB is measured only +as the Zipf remedy. Combine returns activation payload only on every backend; gate weights are verified +at dispatch. A stdlib integer counter produces byte-identical routing and gate weights. + +The current matrix has 38 runnable allocation cells across H100, H200, B200, B300, GB200, GB300, +MI325X, and MI355X. It requests 360 cases / 840 token points: 228 runnable cases / 532 points and +132 explicit unsupported cases / 308 points. `sweep_matrix.py` materializes every token ladder and +rejects missing, stale, malformed, or altered shard controls. Workflow shards are emitted round-robin +by SKU so the bounded GHA matrix can use every available runner pool from its first scheduling cycle. 
+ +| Backend | Current scope | +|---|---| +| DeepEP V1 | Image-pinned `deep_ep.Buffer`: upstream v1.2.1 on x86 and the image's GB fork on arm64 | +| DeepEP V2 | PR #605 `ElasticBuffer` plus the upstream #630 scale-up fix; NCCL Device API LSA and source/SASS-bound reproducible JIT | +| DeepEP Hybrid | Pinned `HybridEPBuffer`; realized auto-tuned config and JIT keys; NVLink/MNNVL domain | +| UCCL | Pinned 0.1.1 wheel and wrapper on Hopper; Blackwell is explicitly unsupported | +| NCCL/RCCL A2A | Portable rank-deduplicated payload plus expert/routing-metadata reference | +| MoRI | MI325X AsyncLL transport and MI355X intranode transport | + +FlashInfer is outside v1 because its exercised EP path failed intermittently at runtime. It is not +misreported as a platform capability limitation and can return after a stable pinned path is proven. + +DeepEP V2 means the `ElasticBuffer` implementation introduced by +[DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605), not a newer legacy `Buffer` build. +The pinned source is the minimal upstream [PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) +follow-up: its parent is the #605 merge tree and its only source change fixes pure scale-up +initialization when GIN is unavailable. Every v1 V2 case fits inside its declared NVLink/MNNVL +scale-up domain, so the adapter requests NCCL Device API LSA and disables network GIN. It then +requires NCCL's realized LSA team to cover the full EP world; a smaller realized domain fails rather +than being mislabeled. A true scale-out case must use and identify GIN separately. The isolated +build records the API, source, loaded libraries, generated JIT source, executable SASS, and raw +CUBIN diagnostics. NVIDIA SKUs remain unvalidated until their GPU outcomes pass the native +correctness and publication gates. 
+ +Removed v1 axes include cached-layout `[cl]`, runtime-visible `[rv]`, LL, FP8, quantized combine, +extra routing distributions, activation profiles, uneven allocation, placement permutations, model +envelopes, and scaling studies. + +## Workflow And Artifacts + +`.github/workflows/collectivex-sweep.yml` generates a public-SKU matrix, extracts a strict ignored +`.shards/<id>.json` control, executes one allocation per shard, privacy-checks result JSON, and uploads +raw GitHub artifacts. Raw producers are diagnostic-only; they cannot self-promote evidence. + +Development publication uses one self-hosted persistent filesystem. GitHub artifacts are +transient input; Vercel storage, GCP, Neon, managed databases, and managed object stores are out of +scope. `publisher.py` ingests complete downloaded workflow artifacts, verifies or promotes explicit +bundle IDs, and writes the atomic content-addressed layout consumed by the frontend. It never runs on +GPU workers. The store contract and promotion gates are in [docs/methodology.md](docs/methodology.md). + +## Runner Configuration + +Runner-local Slurm and storage values use a strict per-SKU JSON document at +`$XDG_CONFIG_HOME/inferencex/collectivex.json` or `COLLECTIVEX_OPERATOR_CONFIG`. The mode-0600, +same-owner, non-symlink file is outside the checkout and never uploaded. Unknown runners, fields, +duplicate keys, endpoint literals, unsafe paths, and non-JSON input fail closed; configuration is +never evaluated as shell. GHA passes encrypted `COLLECTIVEX_OPERATOR_CONFIG_V1` content only to the +launcher, which validates it, exports the selected SKU's allowlisted values, and deletes the +temporary copy before allocation. 
Required JSON fields are: + +| SKU | Variables | +|---|---| +| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir` | +| `h200-dgxc` | `partition`, `squash_dir` | +| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `gb200` | `partition`, `account`, ordered `storage_roots` | +| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` | +| `mi325x`, `mi355x` | `partition`, `squash_dir` | + +Before import, each Docker Hub tag is resolved with bounded registry requests and must match its +pinned digest; digest-qualified overrides are rejected. Enroot imports use a fixed filesystem epoch +and a versioned, registry-digest-bound cache key. Every mounted squash is freshly hashed. The +verified registry digest and local squash hash are both recorded. Image-provided DeepEP is checked +against exact wheel and installed-file fingerprints; source-built backends use pinned commits and +runtime-verified GPU targets. DeepEP V2's mode-0700 cluster-local build cache is keyed by a versioned +build recipe, verified image, architecture, upstream trees, and dependency pins; only its fixed +`/cx-cache` mount reaches the container, and it never enters result artifacts. +Compute containers receive an explicit environment allowlist. Private host, address, device, NIC, +credential, workspace, and path data stays in encrypted config, ignored operator notes, or bounded +mode-0600 runner logs; it is never uploaded. 
+ +## Local Checks + +```bash +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py' +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify +bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh +``` + +Core paths are `capability.py`, `configs/`, `contracts.py`, `schemas/`, `sweep_matrix.py`, +`publisher.py`, `runtime/`, `launchers/`, and `tests/`. diff --git a/experimental/CollectiveX/README_zh.md b/experimental/CollectiveX/README_zh.md new file mode 100644 index 0000000000..bed2172d40 --- /dev/null +++ b/experimental/CollectiveX/README_zh.md @@ -0,0 +1,111 @@ +# CollectiveX + +
+ +[English](./README.md) | **中文** + +
+ +CollectiveX 是实验性的 MoE 专家并行通信基准,用于测量不同 EP 库和加速器系统的 +dispatch、combine 及配对 roundtrip 延迟。 + +> 发布暂停:历史 schema 3-5 数据仅供诊断。目前没有数据集获准用于排名、推荐或回归基线。 + +## v1 执行配置 + +每个调度用例均采用 BF16、normal mode、`layout-and-dispatch-v1`、后端调优资源、packed +placement 以及 `fixed-512-v1` 采样:64 trials x 8 timed iterations;每个 trial/point 的每个 +被测组件前执行 32 次同步完整 roundtrip warmup。先测 roundtrip;所有后端使用相同的分阶段 +conditioning ramp 和升序点位。Routing 仅保留 uniform 和一个 Zipf 敏感性场景,EPLB 只作为 +Zipf 的修正方案测量。所有后端的 combine 仅返回 activation payload,gate weights 在 dispatch +阶段接受校验。stdlib 整数计数器生成逐字节一致的 routing 和 gate weights。 + +当前矩阵覆盖 H100、H200、B200、B300、GB200、GB300、MI325X 和 MI355X,共 38 个可运行 +allocation cells。矩阵请求 360 个 cases / 840 个 token points:228 个可运行 cases / 532 个 +points,以及 132 个显式 unsupported cases / 308 个 points。`sweep_matrix.py` 物化每个 token +ladder,并拒绝缺失、过期、格式错误或被修改的 shard controls。Workflow shards 按 SKU +round-robin 发出,使受限的 GHA matrix 从第一个调度周期起即可使用所有可用 runner pools。 + +| 后端 | 当前范围 | +|---|---| +| DeepEP V1 | 镜像固定的 `deep_ep.Buffer`:x86 使用 upstream v1.2.1,arm64 使用镜像内 GB fork | +| DeepEP V2 | PR #605 `ElasticBuffer` 加 upstream #630 scale-up 修复;NCCL Device API LSA 与 source/SASS 绑定的可复现 JIT | +| DeepEP Hybrid | 固定的 `HybridEPBuffer`;记录实际自动调优配置与 JIT keys;NVLink/MNNVL domain | +| UCCL | Hopper 上固定的 0.1.1 wheel 和 wrapper;Blackwell 显式标为 unsupported | +| NCCL/RCCL A2A | 可移植的 rank-deduplicated payload 加 expert/routing-metadata reference | +| MoRI | MI325X AsyncLL transport 和 MI355X intranode transport | + +FlashInfer 不在 v1 范围内,因为已测试的 EP path 在运行时存在间歇性失败。该问题不会被误报为 +平台能力限制;在证明有稳定的固定实现后可重新加入。 + +DeepEP V2 指 [DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605) 引入的 +`ElasticBuffer` 实现,而不是更新的 legacy `Buffer` build。固定 source 使用最小化的 upstream +[PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) 后续修复:其 parent 是 #605 merge +tree,唯一 source 变更是修复 GIN 不可用时的纯 scale-up 初始化。v1 的所有 V2 cases 都位于各自 +声明的 NVLink/MNNVL scale-up domain 内,因此 adapter 请求 NCCL Device API LSA 并禁用网络 +GIN。随后必须确认 NCCL 实际建立的 LSA team 覆盖整个 EP world;若实际 domain 更小,case 
+会直接失败而不会被错误标注。真正的 scale-out case 必须单独启用并标识 GIN。隔离构建会记录 +API、source、loaded libraries、generated JIT source、executable SASS 与 raw CUBIN +diagnostics。在 GPU outcome 通过 native correctness 和 publication gates 前,各 NVIDIA SKU +仍为 unvalidated。 + +v1 已移除的轴包括 cached-layout `[cl]`、runtime-visible `[rv]`、LL、FP8、quantized combine、 +额外 routing distributions、activation profiles、uneven allocation、placement permutations、 +model envelopes 和 scaling studies。 + +## Workflow 与产物 + +`.github/workflows/collectivex-sweep.yml` 生成 public-SKU matrix,提取严格且被忽略的 +`.shards/.json` control,每个 shard 执行一次 allocation,对结果 JSON 做隐私检查并上传 +raw GitHub artifacts。Raw producers 仅供诊断,不能自行提升 evidence。 + +开发阶段发布使用一个 self-hosted persistent filesystem。GitHub artifacts 仅作为临时输入; +Vercel storage、GCP、Neon、managed databases 和 managed object stores 均不在范围内。 +`publisher.py` 摄取完整下载的 workflow artifacts,验证或提升显式 bundle IDs,并写入供前端 +使用的原子 content-addressed layout。它不会在 GPU workers 上运行。Store contract 和 promotion +gates 见 [docs/methodology_zh.md](docs/methodology_zh.md)。 + +## Runner 配置 + +Runner 本地 Slurm 和 storage 值使用严格的 per-SKU JSON 文档,路径为 +`$XDG_CONFIG_HOME/inferencex/collectivex.json` 或 `COLLECTIVEX_OPERATOR_CONFIG`。该 mode-0600、 +同 owner、非 symlink 文件位于 checkout 外且永不上传。未知 runners、fields、duplicate keys、 +endpoint literals、unsafe paths 和非 JSON 输入均 fail closed;配置绝不作为 shell 执行。GHA +仅将加密的 `COLLECTIVEX_OPERATOR_CONFIG_V1` 内容传给 launcher;launcher 验证后只导出所选 +SKU 的 allowlisted values,并在 allocation 前删除临时副本。必需 JSON fields 如下: + +| SKU | 变量 | +|---|---| +| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir` | +| `h200-dgxc` | `partition`, `squash_dir` | +| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `gb200` | `partition`, `account`, 有序 `storage_roots` | +| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` | +| `mi325x`, `mi355x` | `partition`, `squash_dir` | + +导入前,每个 Docker Hub tag 都通过有界 registry requests 解析,并且必须匹配固定 digest;拒绝 +digest-qualified overrides。Enroot imports 使用固定 
filesystem epoch 和带版本、绑定 registry +digest 的 cache key。每个已挂载 squash 都重新计算 hash,同时记录 verified registry digest 和 +local squash hash。镜像提供的 DeepEP 会按精确 wheel 和 installed-file fingerprints 检查; +source-built backends 使用固定 commits 和 runtime-verified GPU targets。DeepEP V2 的 mode-0700 +cluster-local build cache 由版本化 build recipe、verified image、architecture、upstream +trees 和 dependency pins 共同寻址;container 只看到固定的 `/cx-cache` mount,且该 cache 永不 +进入 result artifacts。 +Compute containers 仅接收显式 environment allowlist。Private host、address、device、NIC、 +credential、workspace 和 path 数据只保留在加密配置、忽略的 operator notes 或有界 mode-0600 +runner logs 中,永不上传。 + +## 本地检查 + +```bash +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py' +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify +bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh +``` + +核心路径为 `capability.py`、`configs/`、`contracts.py`、`schemas/`、`sweep_matrix.py`、 +`publisher.py`、`runtime/`、`launchers/` 和 `tests/`。 diff --git a/experimental/CollectiveX/artifact_safety.py b/experimental/CollectiveX/artifact_safety.py new file mode 100644 index 0000000000..83d522fba8 --- /dev/null +++ b/experimental/CollectiveX/artifact_safety.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +"""Fail-closed privacy check for CollectiveX public result documents.""" +from __future__ import annotations + +import argparse +import ipaddress +import json +import os +import re +import stat + + +SENSITIVE_FIELDS = frozenset({ + "environment", "env", "host", "hostname", "uuid", "gpu_uuid", "device_uuid", + "pci_bus_id", "ip_address", "ip_addresses", "master_addr", 
"ssh", "ssh_target", + "nodelist", "node_list", "nic_guid", "ib_guid", "topology_matrix", "rdma_devices", + "user", "username", "password", "passwd", "secret", "token", "access_token", + "api_token", "auth_token", "api_key", "private_key", "credential", "credentials", + "address", "addresses", "ip", "ips", +}) +SENSITIVE_FIELDS_COMPACT = frozenset(item.replace("_", "") for item in SENSITIVE_FIELDS) +SENSITIVE_FIELD_SUFFIXES = ( + "_host", "_hostname", "_address", "_addresses", "_path", "_paths", "_ip", "_ips", + "_password", "_passwd", "_secret", "_token", "_credential", "_credentials", + "_uuid", "_guid", "_bus_id", +) +SENSITIVE_VALUE_PATTERNS = ( + ("private-path", re.compile( + r"(? str: + normalized = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", str(value).strip()) + normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized) + return normalized.lower().replace("-", "_") + + +def _sensitive_value_rule(value: str, *, contextual: bool = True) -> str | None: + matched = next( + ( + name for name, pattern in SENSITIVE_VALUE_PATTERNS + if (contextual or name not in CONTEXTUAL_VALUE_RULES) and pattern.search(value) + ), + None, + ) + if matched: + return matched + for candidate in IPV6_CANDIDATE.findall(value): + try: + address = candidate.split("%", 1)[0] + if ipaddress.ip_address(address).version == 6: + return "ipv6-address" + except ValueError: + continue + return None + + +def assert_publication_safe(docs: list[dict]) -> None: + """Reject private infrastructure fields and value shapes.""" + def walk(value, doc_index: int, parent_field: str | None = None) -> None: + if isinstance(value, dict): + for key, child in value.items(): + field = _normalized_field(key) + compact = field.replace("_", "") + if ( + field in SENSITIVE_FIELDS + or compact in SENSITIVE_FIELDS_COMPACT + or field.endswith(SENSITIVE_FIELD_SUFFIXES) + ): + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden private field" + ) + key_rule = 
_sensitive_value_rule(str(key)) + if key_rule: + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden {key_rule} key" + ) + walk(child, doc_index, field) + elif isinstance(value, list): + for child in value: + walk(child, doc_index, parent_field) + elif isinstance(value, str): + rule = _sensitive_value_rule(value, contextual=parent_field != "ref") + if rule: + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden {rule} value" + ) + + for index, doc in enumerate(docs): + if not isinstance(doc, dict): + raise ArtifactSafetyError(f"artifact safety: doc[{index}] is not a JSON object") + walk(doc, index) + + +def load_documents(paths: list[str]) -> list[dict]: + docs: list[dict] = [] + for path in paths: + try: + metadata = os.lstat(path) + except OSError as exc: + raise ArtifactSafetyError("artifact safety: result file is unavailable") from exc + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or metadata.st_size <= 0 + or metadata.st_size > MAX_INPUT_BYTES + ): + raise ArtifactSafetyError("artifact safety: result file is unavailable") + descriptor = -1 + try: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + opened = os.fstat(descriptor) + if ( + not stat.S_ISREG(opened.st_mode) + or (opened.st_dev, opened.st_ino, opened.st_size) + != (metadata.st_dev, metadata.st_ino, metadata.st_size) + ): + raise ArtifactSafetyError("artifact safety: result file changed during open") + with os.fdopen(descriptor, encoding="utf-8") as fh: + descriptor = -1 + if path.endswith(".ndjson"): + for line_number, line in enumerate(fh, 1): + if not line.strip(): + continue + try: + docs.append(json.loads(line)) + except json.JSONDecodeError as exc: + raise ArtifactSafetyError( + f"artifact safety: malformed NDJSON at input line {line_number}" + ) from exc + else: + docs.append(json.load(fh)) + except json.JSONDecodeError as exc: + raise ArtifactSafetyError("artifact safety: 
malformed JSON input") from exc + except (OSError, UnicodeError) as exc: + raise ArtifactSafetyError("artifact safety: result file is unreadable") from exc + finally: + if descriptor >= 0: + os.close(descriptor) + if not docs: + raise ArtifactSafetyError("artifact safety: no public result documents found") + return docs + + +def main() -> int: + parser = argparse.ArgumentParser(description="Check CollectiveX result artifacts for private data") + parser.add_argument("paths", nargs="+") + args = parser.parse_args() + try: + docs = load_documents(args.paths) + assert_publication_safe(docs) + except ArtifactSafetyError as exc: + parser.error(str(exc)) + print(f"artifact safety: {len(docs)} public document(s) passed") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/capability.py b/experimental/CollectiveX/capability.py new file mode 100644 index 0000000000..6a069b09b9 --- /dev/null +++ b/experimental/CollectiveX/capability.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +"""Public runner and backend capability registry for CollectiveX v1.""" + +from __future__ import annotations + +import re + + +DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6" +DEEPEP_V2_SKU_CAPABILITIES = { + "h100-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"}, + "h200-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"}, + "b200-dgxc": {"schedulable": True, "basis": "upstream-sm100-result"}, + "gb200": {"schedulable": True, "basis": "upstream-sm100-result"}, + "b300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"}, + "gb300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"}, + "mi325x": {"schedulable": False, "basis": "nvidia-only"}, + "mi355x": {"schedulable": False, "basis": "nvidia-only"}, +} +PLATFORMS = { + "h100-dgxc": dict(vendor="nvidia", arch="sm90", machine="amd64", product="h100", transport="nvlink", topology_class="h100-nvlink-island", + 
gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"), + "h200-dgxc": dict(vendor="nvidia", arch="sm90", machine="amd64", product="h200", transport="nvlink", topology_class="h200-nvlink-island", + gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"), + "b200-dgxc": dict(vendor="nvidia", arch="sm100", machine="amd64", product="b200", transport="nvlink", topology_class="b200-nvlink-island", + gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"), + "b300": dict(vendor="nvidia", arch="sm103", machine="amd64", product="b300", transport="nvlink", topology_class="b300-nvlink-island", + gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"), + "gb200": dict(vendor="nvidia", arch="sm100", machine="arm64", product="gb200", transport="mnnvl", topology_class="gb200-nvl72-mnnvl", + gpus_per_node=4, scale_up_domain=72, ep_degrees=(4, 8), launcher="gb-nv"), + "gb300": dict(vendor="nvidia", arch="sm103", machine="arm64", product="gb300", transport="mnnvl", topology_class="gb300-nvl72-mnnvl", + gpus_per_node=4, scale_up_domain=72, ep_degrees=(4, 8), launcher="gb-nv"), + "mi325x": dict(vendor="amd", arch="gfx942", machine="amd64", product="mi325x", transport="xgmi", topology_class="mi325x-xgmi", + gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="mi-amds"), + "mi355x": dict(vendor="amd", arch="gfx950", machine="amd64", product="mi355x", transport="xgmi", topology_class="mi355x-xgmi", + gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="mi-amds"), +} + +BACKENDS = { + "deepep": {"vendors": {"nvidia"}}, + "deepep-v2": { + "vendors": {"nvidia"}, + "implementation": "deep_ep.ElasticBuffer", + "source": "deepseek-ai/DeepEP#605+#630", + "commit": DEEPEP_V2_COMMIT, + "communication_backend": "nccl-device-lsa", + "torch": "2.10.0+cu130", + "nccl": "2.30.4", + "sku_capabilities": DEEPEP_V2_SKU_CAPABILITIES, + }, + "uccl": { + "vendors": {"nvidia"}, + "machines": 
{"amd64"}, + "excluded_skus": {"b200-dgxc", "b300"}, + }, + "deepep-hybrid": {"vendors": {"nvidia"}}, + "mori": {"vendors": {"amd"}}, + "nccl-ep": {"vendors": {"nvidia", "amd"}}, +} +SWEEP_BACKENDS = tuple(BACKENDS) + + +def runtime_identity_issues( + sku: str, *, vendor: str, arch: str, machine: str, device_name: str, + device_count: int, world_size: int, +) -> list[str]: + """Validate public product identity on every rank without private device identifiers.""" + platform = PLATFORMS.get(sku) + if platform is None: + return [f"unknown runner identity {sku!r}"] + issues = [] + for field, observed in (("vendor", vendor), ("arch", arch), ("machine", machine)): + if observed != platform[field]: + issues.append(f"{field}={observed!r}, expected {platform[field]!r}") + products = set(re.findall(r"[a-z]+\d+[a-z]*", device_name.lower())) + if platform["product"] not in products: + issues.append(f"device product {device_name!r} does not identify {platform['product']}") + if device_count != platform["gpus_per_node"]: + issues.append( + f"visible GPUs={device_count}, expected {platform['gpus_per_node']} per node" + ) + if world_size not in platform["ep_degrees"]: + issues.append(f"EP{world_size} is not registered for {sku}") + return issues + + +def resolve(sku: str, backend: str, *, nodes: int = 1, routing: str = "uniform", + eplb: bool = False) -> tuple[bool, str]: + """Return whether one fixed-v1 case can run on a public GHA runner label.""" + platform, implementation = PLATFORMS.get(sku), BACKENDS.get(backend) + if platform is None: + return False, f"unknown GHA runner label {sku!r}" + if implementation is None: + return False, f"unknown backend {backend!r}" + if nodes < 1 or nodes * platform["gpus_per_node"] not in platform["ep_degrees"]: + return False, f"{sku} does not register a {nodes}-node EP degree" + if routing not in {"uniform", "zipf"} or (eplb and routing != "zipf"): + return False, "v1 routing is uniform or zipf, with EPLB only on zipf" + if platform["vendor"] 
not in implementation["vendors"]: + return False, f"{backend} does not support {platform['vendor']}" + sku_capability = implementation.get("sku_capabilities", {}).get(sku) + if sku_capability is not None and not sku_capability["schedulable"]: + return False, f"{backend} is unsupported on {sku}: {sku_capability['basis']}" + if platform["machine"] not in implementation.get("machines", {platform["machine"]}): + return False, f"{backend} does not support {platform['machine']}" + if sku in implementation.get("excluded_skus", set()): + return False, f"{backend} is unavailable on {sku}" + return True, "ok" diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml new file mode 100644 index 0000000000..0d72ceaae4 --- /dev/null +++ b/experimental/CollectiveX/configs/suites.yaml @@ -0,0 +1,21 @@ +# CollectiveX v1 comparison suites. +schema_version: 1 + +suites: + ep-core-v1: + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + routings: [uniform] + phases: [decode, prefill] + token_points_prefill: [256, 512] + required_publication: official + + ep-routing-v1: + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + routings: [zipf] + eplb: [false, true] + phases: [decode, prefill] + token_points_decode: [128] + token_points_prefill: [512] + required_publication: comparable-experimental diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml new file mode 100644 index 0000000000..b5b68334c4 --- /dev/null +++ b/experimental/CollectiveX/configs/workloads.yaml @@ -0,0 +1,9 @@ +# CollectiveX v1 canonical workload and phase metadata. 
+schema_version: 1 + +model_derived: + deepseek-v3-v1: + hidden: 7168 + topk: 8 + routed_experts: 256 + verified_against: "deepseek-ai/DeepSeek-V3@e815299b0bcbac849fa540c768ef21845365c9eb/config.json" diff --git a/experimental/CollectiveX/contracts.py b/experimental/CollectiveX/contracts.py new file mode 100644 index 0000000000..6089b8d119 --- /dev/null +++ b/experimental/CollectiveX/contracts.py @@ -0,0 +1,2641 @@ +#!/usr/bin/env python3 +"""Strict native attempt contracts and metric validation for CollectiveX v1.""" +from __future__ import annotations + +import argparse +import datetime as dt +from functools import lru_cache +import hashlib +import json +import math +import os +from pathlib import Path, PurePosixPath +import re +import sys +from typing import Any, Iterable + +import artifact_safety +import capability +import identity + +TESTS = Path(__file__).resolve().parent / "tests" +sys.path.insert(0, str(TESTS)) +import eplb as eplb_contract # noqa: E402 +import workload as workload_contract # noqa: E402 + +RAW_FORMAT = "collectivex.ep.v1" +SAMPLES_FORMAT = "collectivex.samples.v1" +TERMINAL_FORMAT = "collectivex.terminal.v1" +TERMINAL_CASE_FIELDS = { + "backend", "canonical", "eplb", "ep", "experts", "gpus_per_node", "hidden", + "ladder", "nodes", "phase", "required_publication", "routing", "samples_per_point", + "scale_up_domain", "suite", "timing", "topk", "warmup_semantics", "workload", +} +ALLOCATION_FACTOR_FIELDS = { + "artifact", "execution_id", "job", "repo", "run_attempt", "run_id", "runner", + "source_sha", +} +GIT_RUN_FIELDS = {"artifact", "job", "ref", "repo", "run_attempt", "run_id", "source_sha"} +PRE_EXECUTION_FAILURE_REASONS = { + "setup": "launcher-setup-failed", + "repository-stage": "repository-staging-failed", + "registry-verification": "container-registry-verification-failed", + "scheduler-allocation": "scheduler-allocation-failed", + "container-import": "container-image-preparation-failed", + "container-hash": 
"container-image-identity-failed", + "container-launch": "container-runtime-launch-failed", + "backend-setup": "backend-setup-failed", + "artifact-collection": "artifact-collection-failed", +} +RUNTIME_FAILURE_REASONS = { + **PRE_EXECUTION_FAILURE_REASONS, + "runtime-identity": "runtime-identity-mismatch", + "timeout": "execution-timeout", + "deadlock": "execution-deadlock", + "execution": "distributed-command-failed", +} +POST_EMIT_FAILURE_REASONS = { + mode: "post-emit-distributed-command-failed" + for mode in ("runtime-identity", "timeout", "deadlock", "execution") +} +CAPABILITY_FAILURE_REASONS = frozenset({ + "backend-platform-unsupported", + "backend-token-capacity", +}) +RETURN_CODE_FAILURE_MODES = { + 5: "runtime-identity", + 124: "timeout", +} +PERCENTILES = ("p50", "p90", "p95", "p99") +V1_CONDITIONING_LADDERS = { + "decode": (1, 2, 4, 8, 16, 32, 64, 128), + "prefill": (1, 2, 4, 8, 16, 32, 64, 128, 256, 512), +} +V1_CONDITIONING_ROUNDS_PER_SHAPE = 8 +DEEPEP_V2_JIT_KERNELS = frozenset({ + "barrier", "combine", "combine_reduce_epilogue", "dispatch", + "dispatch_copy_epilogue", +}) +DEEPEP_V2_V1_PROVENANCE = { + "deepep_version": "2.0.0", + "deepep_distribution_version": "2.0.0+fa8a9b1", + "deepep_commit": "fa8a9b16898204afd347c663b89e65ef87dc6ce6", + "deepep_tree": "29809e75c5874e6609dac4804e7b651d5226959f", + "deepep_pr": 605, + "deepep_fix_pr": 630, + "fmt_commit": "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa", + "torch_version": "2.10.0+cu130", + "nccl_package_version": "2.30.4", + "nccl_version": "2.30.4", + "nvshmem_package_version": "3.3.9", + "allow_hybrid_mode": False, + "gin_enabled": False, + "communication_backend": "nccl-device-lsa", +} +UCCL_DEPENDENCY_VERSIONS = { + "intervaltree": "3.1.0", + "nvidia-cuda-runtime-cu12": "12.9.79", + "sortedcontainers": "2.4.0", +} +SCHEMA_DIR = Path(__file__).resolve().parent / "schemas" +_SCHEMA_CACHE: dict[str, dict[str, Any]] = {} +REQUIRED_BACKEND_PROVENANCE = { + "deepep": ( + "deepep_version", 
"deepep_commit", "backend_lineage", "allow_mnnvl", + "mnnvl_comm", + ), + "deepep-v2": ( + *DEEPEP_V2_V1_PROVENANCE, "api_signature_sha256", "loaded_libraries", + "jit_cubins", "jit_random_seed", "deterministic", "num_experts", + "tuning_num_experts", + ), + "deepep-hybrid": ( + "deepep_commit", "deepep_tree", "branch", "backend_lineage", + "loaded_libraries", "realized_config", "jit_kernel_keys", "jit_shared_objects", + ), + "uccl": ( + "uccl_version", "uccl_commit", "uccl_wrapper_commit", "backend_lineage", + "loaded_libraries", "uccl_dependency_versions", + ), + "mori": ("mori_commit",), + "nccl-ep": ("nccl_version", "collective_library", "backend_lineage"), +} +PROVENANCE_KEYS = { + "allocated_qps", "allow_hybrid_mode", "allow_mnnvl", "allow_multiple_reduction", + "api", "api_signature_sha256", "backend", "backend_lineage", "block_num", + "block_num_floored", "block_num_target", "branch", "collective_library", + "combine_dtype", "combine_warps", "communication_backend", "cuda_version", + "deepep_commit", "deepep_distribution_version", "deepep_fix_pr", "deepep_pr", "deepep_tree", + "deepep_version", "deterministic", "device_cus", + "device_sms", "dispatch_dtype", "dispatch_warps", "enable_sdma", "fmt_commit", + "gin_enabled", + "gpus_per_node", "heap_size", + "impl", "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_random_seed", + "jit_shared_objects", "kernel_type", + "loaded_libraries", "local_experts", + "logical_scaleout_ranks", + "logical_scaleup_ranks", "mapping_variant", "max_num_inp_token_per_rank", + "max_num_tokens", "max_total_recv_tokens", "mnnvl_comm", "mode", "mori_commit", + "nccl_communicator", "nccl_package_version", "nccl_version", "num_experts", + "nvshmem_package_version", + "num_max_tokens_per_rank", "num_nvl_bytes", "num_qps", "num_qps_per_rank", + "num_rdma_bytes", "num_sms", "path", + "physical_nvlink_ranks", "physical_rdma_ranks", "prefer_overlap_with_compute", + "realized_config", "reference_semantics", "requested_num_sms", 
"resource_mode", "routing_factor", + "routing_metadata", "sm_fraction", "top_k", + "torch_git_version", "torch_version", "transport", "trtllm", "tuned_source", + "tuning_num_experts", + "uccl_commit", "uccl_dependency_versions", "uccl_version", "uccl_wrapper_commit", + "workspace", +} + + +class ContractError(ValueError): + """A document differs from the native v1 contract.""" + + +def resolve_deepep_mnnvl( + *, requested: bool, signature_parameters: Iterable[str], deepep_commit: str | None +) -> tuple[dict[str, bool], str]: + """Resolve one explicit DeepEP MNNVL API mode without signature fallbacks.""" + if not requested: + return {}, "not-requested" + if "allow_mnnvl" in set(signature_parameters): + return {"allow_mnnvl": True}, "explicit-allow-mnnvl" + raise ContractError( + f"requested DeepEP MNNVL is unsupported by commit {deepep_commit or 'unknown'}" + ) + + +def collective_kernel_generation(collective_library: Any) -> str: + """Return the public NCCL/RCCL implementation lineage.""" + if collective_library not in {"nccl", "rccl"}: + raise ContractError("reference collective library must be nccl or rccl") + return collective_library + + +def project_resource_profile(provenance: dict[str, Any]) -> dict[str, Any]: + """Project backend provenance into the canonical cross-backend resource vocabulary.""" + device_units = provenance.get("device_sms") or provenance.get("device_cus") + if provenance.get("num_sms") is not None: + kind, configured = "sm", provenance["num_sms"] + elif ( + provenance.get("block_num") is not None + and provenance.get("kernel_type") != "AsyncLL" + ): + kind, configured = "cu_block", provenance["block_num"] + else: + kind, configured = None, None + achieved = configured / device_units if configured and device_units else None + fixed = "fixed-kernel" in str(provenance.get("tuned_source", "")) + source = str(provenance.get("tuned_source", "")) + return { + "achieved_fraction": round(achieved, 4) if achieved else None, + "comm_units_kind": 
kind, + "configured_units": configured, + "conformance_class": ( + "not-applicable" if fixed else "best-known" if "default" not in source + else "backend-default" + ), + "device_units": device_units, + "fixed_kernel": fixed, + "nonconforming": False, + "pareto_eligible": False, + "persistent_bytes": ( + provenance.get("num_nvl_bytes") + or provenance.get("num_rdma_bytes") + or provenance.get("heap_size") + ), + "qps_per_rank": provenance.get("num_qps_per_rank"), + "requested_fraction": None, + "resource_class": "fixed-kernel" if fixed else "backend-tuned", + "target_achieved_within_tol": None, + "tolerance": 0.10, + "tuned_source": provenance.get("tuned_source"), + "warps_combine": provenance.get("combine_warps"), + "warps_dispatch": provenance.get("dispatch_warps"), + } + + +def backend_version(provenance: dict[str, Any]) -> str | None: + """Return the canonical public backend version from implementation provenance.""" + for field in ( + "deepep_version", "uccl_version", "nccl_version", + "mori_commit", "deepep_commit", + ): + value = provenance.get(field) + if value is not None and str(value).strip(): + return str(value)[:160] + return None + + +def public_series_config( + *, kernel_generation: Any, provenance: dict[str, Any], + resource_profile: dict[str, Any], resource_mode: Any, device_product: Any, +) -> dict[str, Any]: + """Project raw implementation facts into the exact public configuration fields.""" + generation = None if kernel_generation == "n-a" else kernel_generation + profile = "profile-" + _sha256_json(resource_profile)[:16] + return { + "backend": { + "generation": generation, + "version": backend_version(provenance), + }, + "resource": { + "mode": resource_mode, + "profile": profile, + "comm_units_kind": resource_profile.get("comm_units_kind"), + "configured_units": resource_profile.get("configured_units"), + }, + "system": {"label": str(device_product)[:160]}, + } + + +def public_series_config_sha256(config: dict[str, Any]) -> str: + """Commit 
the canonical public configuration projection into series identity.""" + return _sha256_json(config) + + +SOURCE_BUILT_LIBRARY_ROLES = frozenset({ + "deepep-extension", "deepep-hybrid-extension", +}) + + +def series_provenance(provenance: dict[str, Any]) -> dict[str, Any]: + """Project stable semantic build identity while retaining raw binaries in private evidence.""" + projected = { + key: value for key, value in provenance.items() + if key not in {"jit_cache_key", "jit_shared_objects", "path", "sm_fraction"} + } + libraries = provenance.get("loaded_libraries") + if isinstance(libraries, list): + projected["loaded_libraries"] = [ + { + "name": item.get("name"), + "role": item.get("role"), + "source_tree": provenance.get("deepep_tree"), + } + if isinstance(item, dict) and item.get("role") in SOURCE_BUILT_LIBRARY_ROLES + else item + for item in libraries + ] + jit_cubins = provenance.get("jit_cubins") + if isinstance(jit_cubins, list): + projected["jit_cubins"] = [ + { + "cache_key": item.get("cache_key"), + "sass_sha256": item.get("sass_sha256"), + "source_sha256": item.get("source_sha256"), + } + if isinstance(item, dict) + else item + for item in jit_cubins + ] + return projected + + +def routing_implementation_control_sha256(implementation: dict[str, Any]) -> str: + """Bind routing cohorts to the same static build/generator and non-treatment configuration.""" + provenance = implementation.get("provenance") + if not isinstance(provenance, dict): + raise ContractError("implementation provenance is unavailable") + semantic = series_provenance(provenance) + treatment_fields = { + "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_shared_objects", + "local_experts", "num_experts", "path", "realized_config", "sm_fraction", + } + return _sha256_json({ + "kernel_generation": implementation.get("kernel_generation"), + "name": implementation.get("name"), + "provenance": { + key: value for key, value in semantic.items() + if key not in treatment_fields + }, + 
"resource_profile": implementation.get("resource_profile"), + }) + + +def _resolved_provenance_value(field: str, value: Any) -> bool: + if value is None or isinstance(value, (dict, list, tuple, set)) and not value: + return False + text = str(value).strip().lower() + if not text or text in {"unknown", "none", "null", "n/a", "?", "capture-failed"}: + return False + if "capture-failed" in text: + return False + if field.endswith("_commit") and ( + text in {"main", "hybrid-ep", "uccl", "pkg-uccl"} + or text.endswith(("-unknown", "-none", "-main", "-hybrid-ep")) + ): + return False + return True + + +def _content_evidence_is_valid(value: Any, required_roles: set[str]) -> bool: + if not isinstance(value, list) or not value: + return False + records: set[tuple[str, str]] = set() + roles: set[str] = set() + for item in value: + if not isinstance(item, dict) or set(item) != {"name", "role", "sha256"}: + return False + name, role, digest = item["name"], item["role"], item["sha256"] + if ( + not isinstance(name, str) + or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name) + or not isinstance(role, str) + or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role) + or not isinstance(digest, str) + or not re.fullmatch(r"[0-9a-f]{64}", digest) + or (role, name) in records + ): + return False + records.add((role, name)) + roles.add(role) + return required_roles <= roles + + +def _deepep_v2_jit_cubins_are_valid(value: Any) -> bool: + if not isinstance(value, list) or len(value) != len(DEEPEP_V2_JIT_KERNELS): + return False + cache_keys = [] + kernel_names = set() + for item in value: + if not isinstance(item, dict) or set(item) != { + "cache_key", "cubin_sha256", "sass_sha256", "source_sha256", + }: + return False + cache_key = item["cache_key"] + match = ( + re.fullmatch(r"kernel\.([A-Za-z0-9_+-]+)\.[0-9a-f]{32}", cache_key) + if isinstance(cache_key, str) + else None + ) + if ( + match is None + or any( + not isinstance(item[field], str) + or not 
re.fullmatch(r"[0-9a-f]{64}", item[field]) + for field in ("cubin_sha256", "sass_sha256", "source_sha256") + ) + ): + return False + cache_keys.append(cache_key) + kernel_names.add(match.group(1)) + return ( + cache_keys == sorted(set(cache_keys)) + and kernel_names == DEEPEP_V2_JIT_KERNELS + ) + + +HYBRID_REALIZED_CONFIG_FIELDS = { + "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank", + "num_of_ranks_per_node", "num_of_nodes", "pad_multiple", + "num_of_tokens_per_chunk_preprocessing_api", + "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api", + "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type", + "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api", + "num_of_in_flight_s2g_dispatch_api", + "num_of_in_flight_s2g_permute_block_dispatch_api", + "num_of_additional_in_flight_s2g_dispatch_api", + "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api", + "forward_dispatch_api", "device_side_sync_dispatch_api", + "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api", + "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api", + "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api", + "backward_combine_api", "device_side_sync_combine_api", +} +HYBRID_REALIZED_BOOL_FIELDS = { + "forward_dispatch_api", "device_side_sync_dispatch_api", "backward_combine_api", + "device_side_sync_combine_api", +} + + +def _hybrid_realized_config_is_valid(value: Any) -> bool: + if not isinstance(value, dict) or set(value) != HYBRID_REALIZED_CONFIG_FIELDS: + return False + for field, field_value in value.items(): + if field in HYBRID_REALIZED_BOOL_FIELDS: + if type(field_value) is not bool: + return False + elif field == "token_data_type": + if field_value not in {"UINT8", "UINT16"}: + return False + elif type(field_value) is not int or field_value < 0: + return False + return all(value[field] > 0 for field in ( + "hidden_dim", 
"max_num_of_tokens_per_rank", "num_of_experts_per_rank", + "num_of_ranks_per_node", "num_of_nodes", + )) + + +def _hybrid_kernel_keys_are_valid(value: Any) -> bool: + return ( + isinstance(value, list) + and len(value) == 3 + and len(set(value)) == 3 + and value == sorted(value) + and all( + isinstance(key, str) + and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}", key) + for key in value + ) + ) + + +def _hybrid_jit_evidence_is_valid(value: Any, kernel_keys: Any) -> bool: + if not _hybrid_kernel_keys_are_valid(kernel_keys) or not isinstance(value, list): + return False + if len(value) != len(kernel_keys): + return False + rank_sets = [] + for expected_key, item in zip(kernel_keys, value): + if not isinstance(item, dict) or set(item) != {"kernel_key", "rank_artifacts"}: + return False + rank_artifacts = item["rank_artifacts"] + if item["kernel_key"] != expected_key or not isinstance(rank_artifacts, list): + return False + ranks = [] + for artifact in rank_artifacts: + if not isinstance(artifact, dict) or set(artifact) != {"bytes", "rank", "sha256"}: + return False + rank, digest, size = artifact["rank"], artifact["sha256"], artifact["bytes"] + if ( + type(rank) is not int + or rank < 0 + or not isinstance(digest, str) + or not re.fullmatch(r"[0-9a-f]{64}", digest) + or type(size) is not int + or size <= 0 + ): + return False + ranks.append(rank) + if not ranks or ranks != list(range(len(ranks))): + return False + rank_sets.append(ranks) + return all(ranks == rank_sets[0] for ranks in rank_sets) + + +def backend_provenance_issues(backend: str, provenance: dict[str, Any]) -> list[str]: + unknown = [ + field for field, value in provenance.items() + if isinstance(value, str) and value.strip().lower() == "unknown" + ] + unresolved = [ + field for field in REQUIRED_BACKEND_PROVENANCE.get(backend, ()) + if not _resolved_provenance_value(field, provenance.get(field)) + ] + if backend == "deepep": + mode = provenance.get("mnnvl_comm") + allow = 
provenance.get("allow_mnnvl") + valid_modes = { + "not-requested": False, + "explicit-allow-mnnvl": True, + } + if type(allow) is not bool or valid_modes.get(mode) is not allow: + unresolved.append("mnnvl_comm") + if provenance.get("backend_lineage") != "deepep-v1": + unresolved.append("backend_lineage") + if backend == "deepep-v2": + for field in ("num_experts", "tuning_num_experts"): + if type(provenance.get(field)) is not int or provenance[field] <= 0: + unresolved.append(field) + if not _deepep_v2_jit_cubins_are_valid(provenance.get("jit_cubins")): + unresolved.append("jit_cubins") + if provenance.get("jit_random_seed") != "collectivex-deepep-v2-fa8a9b1": + unresolved.append("jit_random_seed") + unresolved.extend( + field for field, expected in DEEPEP_V2_V1_PROVENANCE.items() + if provenance.get(field) != expected + ) + content_roles = { + "deepep-v2": {"deepep-extension", "nccl", "nvshmem"}, + "deepep-hybrid": {"deepep-extension", "deepep-hybrid-extension"}, + "uccl": { + "uccl-distribution", "uccl-wrapper", "intervaltree-distribution", + "sortedcontainers-distribution", "cuda-runtime", + }, + }.get(backend) + if content_roles is not None and not _content_evidence_is_valid( + provenance.get("loaded_libraries"), content_roles + ): + unresolved.append("loaded_libraries") + if backend in {"deepep-v2", "deepep-hybrid"} and not re.fullmatch( + r"[0-9a-f]{40}", str(provenance.get("deepep_tree", "")) + ): + unresolved.append("deepep_tree") + if backend == "deepep-hybrid" and provenance.get("backend_lineage") != "deepep-hybrid": + unresolved.append("backend_lineage") + if backend == "deepep-hybrid": + if not _hybrid_realized_config_is_valid(provenance.get("realized_config")): + unresolved.append("realized_config") + if not _hybrid_kernel_keys_are_valid(provenance.get("jit_kernel_keys")): + unresolved.append("jit_kernel_keys") + if not _hybrid_jit_evidence_is_valid( + provenance.get("jit_shared_objects"), provenance.get("jit_kernel_keys") + ): + 
unresolved.append("jit_shared_objects") + if backend == "uccl" and provenance.get("backend_lineage") != "uccl": + unresolved.append("backend_lineage") + if backend == "uccl" and provenance.get("uccl_dependency_versions") != ( + UCCL_DEPENDENCY_VERSIONS + ): + unresolved.append("uccl_dependency_versions") + if backend == "nccl-ep": + collective = provenance.get("collective_library") + if collective not in {"nccl", "rccl"}: + unresolved.append("collective_library") + if provenance.get("backend_lineage") != collective: + unresolved.append("backend_lineage") + return sorted(set(unknown + unresolved)) + + +def provenance_complete( + provenance: dict[str, Any], backend: str, git_run: dict[str, Any] | None, + *, image_digest: Any, image_verified: Any, squash_sha256: Any, +) -> bool: + image = str(image_digest or "") + squash = str(squash_sha256 or "") + return ( + not backend_provenance_issues(backend, provenance) + and image_verified is True + and bool(re.fullmatch(r"sha256:[0-9a-f]{64}", image)) + and bool(re.fullmatch(r"[0-9a-f]{64}", squash)) + and isinstance(git_run, dict) + and all(git_run.get(field) for field in GIT_RUN_FIELDS) + ) + + +def strict_load(path: str | os.PathLike[str]) -> Any: + """Load JSON while rejecting duplicate keys and non-finite constants.""" + def pairs(items): + result = {} + for key, value in items: + if key in result: + raise ContractError(f"duplicate JSON key {key!r}") + result[key] = value + return result + + def constant(value): + raise ContractError(f"non-finite JSON number {value}") + + try: + with open(path) as handle: + return json.load(handle, object_pairs_hook=pairs, parse_constant=constant) + except (OSError, json.JSONDecodeError) as exc: + raise ContractError(f"invalid JSON {path}: {exc}") from exc + + +def canonical_json_bytes(value: Any) -> bytes: + """Canonical finite JSON bytes for checksums and immutable artifacts.""" + _finite_tree(value) + try: + return json.dumps( + value, allow_nan=False, ensure_ascii=False, 
sort_keys=True, + separators=(",", ":"), + ).encode("utf-8") + except (TypeError, ValueError) as exc: + raise ContractError(f"value is not canonical JSON: {exc}") from exc + + +def content_manifest_evidence( + *, role: str, name: str, files: Iterable[tuple[str, str | os.PathLike[str]]] +) -> dict[str, str]: + """Hash a labeled file set without exposing any host path in provenance.""" + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role): + raise ContractError("content evidence role is invalid") + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name): + raise ContractError("content evidence name is invalid") + manifest: list[dict[str, Any]] = [] + labels: set[str] = set() + for label, raw_path in files: + logical = PurePosixPath(label) + if ( + not label + or logical.is_absolute() + or ".." in logical.parts + or label in labels + or any(ord(character) < 0x20 or ord(character) > 0x7E for character in label) + ): + raise ContractError("content evidence label is invalid or duplicated") + path = Path(raw_path) + if not path.is_file(): + raise ContractError("content evidence source is not a file") + digest = hashlib.sha256() + size = 0 + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + size += len(chunk) + labels.add(label) + manifest.append({"bytes": size, "label": label, "sha256": digest.hexdigest()}) + if not manifest: + raise ContractError("content evidence cannot be empty") + digest = hashlib.sha256( + canonical_json_bytes(sorted(manifest, key=lambda item: item["label"])) + ).hexdigest() + return {"name": name, "role": role, "sha256": digest} + + +def _obj(value: Any, path: str) -> dict[str, Any]: + if not isinstance(value, dict): + raise ContractError(f"{path} must be an object") + return value + + +def _keys(value: Any, expected: set[str], path: str) -> dict[str, Any]: + obj = _obj(value, path) + actual = set(obj) + if actual != expected: + raise ContractError( + f"{path} 
fields differ: missing={sorted(expected - actual)}, " + f"extra={sorted(actual - expected)}" + ) + return obj + + +def _text(value: Any, path: str, *, nullable: bool = False) -> str | None: + if nullable and value is None: + return None + if not isinstance(value, str) or not value: + raise ContractError(f"{path} must be a non-empty string") + return value + + +def _integer(value: Any, path: str, *, minimum: int = 0) -> int: + if type(value) is not int or value < minimum: + raise ContractError(f"{path} must be an integer >= {minimum}") + return value + + +def validate_conditioning_contract(value: Any, phase: str) -> dict[str, Any]: + """Validate the exact phase-specific v1 conditioning schedule.""" + if phase not in V1_CONDITIONING_LADDERS: + raise ContractError("raw conditioning phase is invalid") + conditioning = _keys( + value, {"contract", "ladder", "roundtrips_per_shape"}, + "raw.measurement.conditioning", + ) + ladder = conditioning["ladder"] + if ( + conditioning["contract"] != identity.V1_CASE_PROFILE["conditioning_contract"] + or type(ladder) is not list + or any(type(point) is not int for point in ladder) + or ladder != list(V1_CONDITIONING_LADDERS[phase]) + or _integer( + conditioning["roundtrips_per_shape"], + "raw.measurement.conditioning.roundtrips_per_shape", + minimum=1, + ) != V1_CONDITIONING_ROUNDS_PER_SHAPE + ): + raise ContractError(f"raw {phase} conditioning contract differs") + return conditioning + + +def _number(value: Any, path: str, *, minimum: float | None = None) -> float: + if isinstance(value, bool) or not isinstance(value, (int, float)) or not math.isfinite(value): + raise ContractError(f"{path} must be finite") + result = float(value) + if minimum is not None and result < minimum: + raise ContractError(f"{path} must be >= {minimum}") + return result + + +def _finite_tree(value: Any, path: str = "$") -> None: + if isinstance(value, float) and not math.isfinite(value): + raise ContractError(f"{path} contains a non-finite number") + if 
isinstance(value, list): + for index, item in enumerate(value): + _finite_tree(item, f"{path}[{index}]") + elif isinstance(value, dict): + for key, item in value.items(): + _finite_tree(item, f"{path}.{key}") + + +def _typed(value: Any, kind: str, path: str) -> str: + if not identity.is_typed_id(value, kind): + raise ContractError(f"{path} is not a {kind} ID") + return value + + +def _sha256_json(value: Any) -> str: + payload = json.dumps( + value, allow_nan=False, ensure_ascii=False, sort_keys=True, separators=(",", ":") + ).encode() + return hashlib.sha256(payload).hexdigest() + + +@lru_cache(maxsize=None) +def _expected_eplb_plan( + routing: str, + topk: int, + logical_experts: int, + physical_experts: int, + ep_size: int, + seed: int, + reference_tokens_per_rank: int, +) -> dict[str, Any]: + indices, _ = workload_contract.canonical_routing_rows( + reference_tokens_per_rank * ep_size, + logical_experts, + topk, + routing, + seed, + ) + load = [0] * logical_experts + for row in indices: + for expert in row: + load[expert] += 1 + return eplb_contract.build_plan(load, physical_experts, ep_size) + + +@lru_cache(maxsize=None) +def _expected_canonical_trace( + routing: str, + hidden: int, + topk: int, + logical_experts: int, + physical_experts: int, + ep_size: int, + tokens_per_rank: int, + seed: int, + eplb_enabled: bool, + reference_tokens_per_rank: int, +) -> tuple[str, dict[str, str], str, list[list[int]], list[list[float]]]: + member, checksums, indices, weights = workload_contract.canonical_member( + routing, + hidden, + topk, + logical_experts, + ep_size, + tokens_per_rank, + seed, + ) + if eplb_enabled: + plan = _expected_eplb_plan( + routing, + topk, + logical_experts, + physical_experts, + ep_size, + seed, + reference_tokens_per_rank, + ) + indices = eplb_contract.remap_rows(indices, plan) + routing_hash = workload_contract.trace_checksums(indices, weights)["trace"] + return member, checksums, routing_hash, indices, weights + + +def 
_coefficient_of_variation(values: list[int]) -> float: + mean = sum(values) / len(values) + if mean == 0: + return 0.0 + variance = sum((value - mean) ** 2 for value in values) / len(values) + return variance**0.5 / mean + + +def _expected_routing_summary( + indices: list[list[int]], + weights: list[list[float]], + *, + physical_experts: int, + ep_size: int, + tokens_per_rank: int, + gpus_per_node: int, + scale_up_domain: int, +) -> dict[str, Any]: + """Recompute every published routing/load statistic without torch.""" + experts_per_rank = physical_experts // ep_size + expert_load = [0] * physical_experts + assignment_load = [0] * ep_size + payload_load = [0] * ep_size + fanouts: list[int] = [] + local = same_node = same_domain = copies = 0 + for token, row in enumerate(indices): + destinations = {expert // experts_per_rank for expert in row} + source = token // tokens_per_rank + fanouts.append(len(destinations)) + for expert in row: + expert_load[expert] += 1 + assignment_load[expert // experts_per_rank] += 1 + for destination in destinations: + payload_load[destination] += 1 + copies += 1 + local += destination == source + same_node += destination // gpus_per_node == source // gpus_per_node + same_domain += destination // scale_up_domain == source // scale_up_domain + fanout_histogram = [fanouts.count(value) for value in range(1, ep_size + 1)] + expert_mean = sum(expert_load) / len(expert_load) + return { + "empty_expert_count": expert_load.count(0), + "empty_rank_count": payload_load.count(0), + "expert_assignment_rank_cv": _coefficient_of_variation(assignment_load), + "expert_assignments_per_rank": assignment_load, + "expert_load_cv": _coefficient_of_variation(expert_load), + "expert_load_max": max(expert_load), + "expert_load_mean": expert_mean, + "expert_load_min": min(expert_load), + "fanout_histogram": fanout_histogram, + "fanout_max": max(fanouts), + "fanout_mean": sum(fanouts) / len(fanouts), + "fanout_min": min(fanouts), + "hash": 
workload_contract.trace_checksums(indices, weights)["trace"], + "hotspot_ratio": max(expert_load) / expert_mean if expert_mean else 0.0, + "locality": { + "placement": "packed", + "local_rank_fraction": local / copies, + "same_node_fraction": same_node / copies, + "same_scaleup_domain_fraction": same_domain / copies, + "cross_node_fraction": 1 - same_node / copies, + "cross_domain_fraction": 1 - same_domain / copies, + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + "copies": copies, + }, + "payload_copies_per_rank": payload_load, + "payload_rank_cv": _coefficient_of_variation(payload_load), + "routed_copies": copies, + "source_token_stats": { + "min": tokens_per_rank, + "mean": float(tokens_per_rank), + "max": tokens_per_rank, + "cv": 0.0, + "empty_ranks": 0, + "total": tokens_per_rank * ep_size, + "ranks": ep_size, + }, + } + + +def _expected_histogram(samples: list[float], bins: int = 40) -> dict[str, Any]: + low, high = min(samples), max(samples) + if high <= low: + return {"n": len(samples), "min": low, "max": high, "bins": bins, "counts": [len(samples)]} + counts = [0] * bins + span = high - low + for sample in samples: + index = min(bins - 1, int((sample - low) / span * bins)) + counts[index] += 1 + return { + "n": len(samples), + "min": round(low, 3), + "max": round(high, 3), + "bins": bins, + "counts": counts, + } + + +def _expected_anomalies( + tokens: int, components: dict[str, Any] +) -> list[dict[str, Any]]: + dispatch = components["dispatch"]["percentiles_us"] + combine = components["combine"]["percentiles_us"] + roundtrip = components["roundtrip"]["percentiles_us"] + isolated = components["isolated_sum"]["percentiles_us"] + anomalies: list[dict[str, Any]] = [] + if isolated is not None and roundtrip["p99"] > 3.0 * isolated["p99"]: + anomalies.append({ + "type": "roundtrip_gt_isolated_sum", + "T": tokens, + "roundtrip_p99": round(roundtrip["p99"], 2), + "isolated_sum_p99": round(isolated["p99"], 2), + "ratio": 
round(roundtrip["p99"] / isolated["p99"], 2), + "threshold": 3.0, + }) + floor = max(dispatch["p50"], combine["p50"]) if dispatch and combine else None + if floor and roundtrip["p50"] < 0.95 * floor: + anomalies.append({ + "type": "roundtrip_lt_component_floor", + "T": tokens, + "roundtrip_p50": round(roundtrip["p50"], 2), + "component_floor_p50": round(floor, 2), + }) + return anomalies + + +def _validate_canonical_workload( + workload: dict[str, Any], + scheduled_case: dict[str, Any], + rows: list[dict[str, Any]], + eplb: dict[str, Any], +) -> None: + """Bind every canonical member and measured routing hash to its scheduled token row.""" + profile = identity.V1_CASE_PROFILE + if eplb["enabled"]: + plan = _expected_eplb_plan( + scheduled_case["routing"], + scheduled_case["topk"], + scheduled_case["experts"], + eplb["num_physical_experts"], + scheduled_case["ep"], + profile["seed"], + profile["eplb_reference_tokens_per_rank"], + ) + if eplb["mapping_hash"] != eplb_contract.mapping_hash(plan): + raise ContractError("raw EPLB mapping differs from the frozen canonical plan") + + expected: dict[str, dict[str, str]] = {} + for index, row in enumerate(rows): + member, checksums, routing_hash, _, _ = _expected_canonical_trace( + scheduled_case["routing"], + scheduled_case["hidden"], + scheduled_case["topk"], + scheduled_case["experts"], + eplb["num_physical_experts"], + scheduled_case["ep"], + row["tokens_per_rank"], + profile["seed"], + eplb["enabled"], + profile["eplb_reference_tokens_per_rank"], + ) + if row["routing"]["hash"] != routing_hash: + raise ContractError( + f"raw.measurement.rows[{index}].routing.hash differs from its canonical member" + ) + expected[member] = checksums + if ( + len(expected) != len(rows) + or workload["members"] != sorted(expected) + or workload["manifest_checksums"] != expected + ): + raise ContractError("raw canonical member set/checksums differ from scheduled rows") + expected_workload_id = identity.workload_id({ + "members": [ + 
{"checksums": expected[member], "workload_id": member} + for member in sorted(expected) + ] + }) + if workload["workload_id"] != expected_workload_id: + raise ContractError("raw composite workload identity differs from scheduled rows") + + +def _nearest_rank(samples: list[float], q: int) -> float: + ordered = sorted(samples) + return ordered[max(0, min(len(ordered) - 1, math.ceil(q / 100 * len(ordered)) - 1))] + + +def _close(observed: Any, expected: float, path: str, tolerance: float = 1e-6) -> None: + value = _number(observed, path) + if not math.isclose(value, expected, rel_tol=tolerance, abs_tol=tolerance): + raise ContractError(f"{path}={value} differs from recomputed {expected}") + + +def _equivalent( + observed: Any, expected: Any, path: str, *, tolerance: float = 1e-6 +) -> None: + """Compare a recomputed JSON subtree while allowing only float roundoff.""" + if isinstance(expected, dict): + value = _keys(observed, set(expected), path) + for key, child in expected.items(): + _equivalent(value[key], child, f"{path}.{key}", tolerance=tolerance) + return + if isinstance(expected, list): + if not isinstance(observed, list) or len(observed) != len(expected): + raise ContractError(f"{path} differs from recomputed evidence") + for index, child in enumerate(expected): + _equivalent(observed[index], child, f"{path}[{index}]", tolerance=tolerance) + return + if isinstance(expected, float): + _close(observed, expected, path, tolerance) + return + if type(observed) is not type(expected) or observed != expected: + raise ContractError(f"{path} differs from recomputed evidence") + + +def _schema_equal(left: Any, right: Any) -> bool: + """JSON Schema equality: booleans are distinct from numbers.""" + if isinstance(left, bool) or isinstance(right, bool): + return type(left) is type(right) and left == right + if isinstance(left, dict) and isinstance(right, dict): + return set(left) == set(right) and all( + _schema_equal(left[key], right[key]) for key in left + ) + if 
isinstance(left, list) and isinstance(right, list): + return len(left) == len(right) and all( + _schema_equal(a, b) for a, b in zip(left, right, strict=True) + ) + return left == right + + +def _schema_ref(root: dict[str, Any], reference: str) -> dict[str, Any]: + if not reference.startswith("#/"): + raise ContractError("native artifact schema contains a non-local reference") + value: Any = root + for part in reference[2:].split("/"): + part = part.replace("~1", "/").replace("~0", "~") + if not isinstance(value, dict) or part not in value: + raise ContractError("native artifact schema contains a broken reference") + value = value[part] + if not isinstance(value, dict): + raise ContractError("native artifact schema reference is not an object") + return value + + +def _schema_type_matches(value: Any, expected: str) -> bool: + if expected == "null": + return value is None + if expected == "boolean": + return type(value) is bool + if expected == "object": + return isinstance(value, dict) + if expected == "array": + return isinstance(value, list) + if expected == "string": + return isinstance(value, str) + if expected == "number": + return ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + ) + if expected == "integer": + return ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + and float(value).is_integer() + ) + raise ContractError(f"native artifact schema uses unsupported type {expected!r}") + + +def _validate_schema_value( + value: Any, schema: dict[str, Any], root: dict[str, Any], path: str +) -> None: + """Validate the bounded JSON Schema subset used by native artifact contracts.""" + if "$ref" in schema: + _validate_schema_value(value, _schema_ref(root, schema["$ref"]), root, path) + return + if "oneOf" in schema: + matches = 0 + for candidate in schema["oneOf"]: + try: + _validate_schema_value(value, candidate, root, path) + except ContractError: + continue + matches 
+= 1 + if matches != 1: + raise ContractError(f"{path} must match exactly one native schema alternative") + return + expected_type = schema.get("type") + if expected_type is not None and not _schema_type_matches(value, expected_type): + raise ContractError(f"{path} is not a schema {expected_type}") + if "const" in schema and not _schema_equal(value, schema["const"]): + raise ContractError(f"{path} differs from its schema constant") + if "enum" in schema and not any(_schema_equal(value, item) for item in schema["enum"]): + raise ContractError(f"{path} is outside its schema enum") + + if isinstance(value, dict): + required = set(schema.get("required", ())) + properties = schema.get("properties", {}) + missing = required - set(value) + if missing: + raise ContractError(f"{path} lacks schema fields {sorted(missing)}") + additional = schema.get("additionalProperties", True) + extra = set(value) - set(properties) + if additional is False and extra: + raise ContractError(f"{path} has extra schema fields {sorted(extra)}") + for key, item in value.items(): + if key in properties: + _validate_schema_value(item, properties[key], root, f"{path}.{key}") + elif isinstance(additional, dict): + _validate_schema_value(item, additional, root, f"{path}.{key}") + property_names = schema.get("propertyNames") + if property_names is not None: + for key in value: + _validate_schema_value(key, property_names, root, f"{path}.") + + if isinstance(value, list): + if len(value) < schema.get("minItems", 0): + raise ContractError(f"{path} has too few schema items") + maximum = schema.get("maxItems") + if maximum is not None and len(value) > maximum: + raise ContractError(f"{path} has too many schema items") + if schema.get("uniqueItems") and any( + _schema_equal(item, prior) + for index, item in enumerate(value) + for prior in value[:index] + ): + raise ContractError(f"{path} schema items are not unique") + if "items" in schema: + for index, item in enumerate(value): + 
_validate_schema_value(item, schema["items"], root, f"{path}[{index}]") + + if isinstance(value, str): + if len(value) < schema.get("minLength", 0): + raise ContractError(f"{path} is shorter than its schema minimum") + maximum = schema.get("maxLength") + if maximum is not None and len(value) > maximum: + raise ContractError(f"{path} is longer than its schema maximum") + if "pattern" in schema and re.search(schema["pattern"], value) is None: + raise ContractError(f"{path} does not match its schema pattern") + if schema.get("format") == "date-time": + try: + parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError as exc: + raise ContractError(f"{path} is not a schema date-time") from exc + if parsed.tzinfo is None: + raise ContractError(f"{path} schema date-time lacks a timezone") + + if ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + ): + if "minimum" in schema and value < schema["minimum"]: + raise ContractError(f"{path} is below its schema minimum") + if "maximum" in schema and value > schema["maximum"]: + raise ContractError(f"{path} is above its schema maximum") + + +def _validate_native_schema(name: str, value: Any) -> None: + schema = _SCHEMA_CACHE.get(name) + if schema is None: + loaded = strict_load(SCHEMA_DIR / name) + if not isinstance(loaded, dict): + raise ContractError(f"native artifact schema {name} is not an object") + schema = loaded + _SCHEMA_CACHE[name] = schema + _validate_schema_value(value, schema, schema, "$") + + +def validate_samples_document(document: Any) -> dict[str, Any]: + _validate_native_schema("samples-v1.schema.json", document) + doc = _keys( + document, + {"allocation_id", "attempt_id", "case_id", "format", "points", "sampling", + "schema_version", "series_id"}, + "samples", + ) + if doc["format"] != SAMPLES_FORMAT or doc["schema_version"] != 1: + raise ContractError("samples format/schema differs from v1") + for field, kind in ( + ("allocation_id", 
"allocation"), ("attempt_id", "attempt"), + ("case_id", "case"), ("series_id", "series"), + ): + _typed(doc[field], kind, f"samples.{field}") + sampling = _keys( + doc["sampling"], {"iterations_per_trial", "reduction", "trials"}, "samples.sampling" + ) + if ( + _integer(sampling["iterations_per_trial"], "samples.sampling.iterations_per_trial", minimum=1) != 8 + or _integer(sampling["trials"], "samples.sampling.trials", minimum=1) != 64 + or sampling["reduction"] != identity.V1_CASE_PROFILE["rank_reduction"] + ): + raise ContractError("samples must use the fixed 8x64 cross-rank-max contract") + points = doc["points"] + if not isinstance(points, list) or not points: + raise ContractError("samples.points must be non-empty") + seen = set() + for index, point_value in enumerate(points): + path = f"samples.points[{index}]" + point = _keys( + point_value, + {"components", "evidence_id", "point_id", "sample_sha256", "tokens_per_rank"}, + path, + ) + tokens = _integer(point["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1) + if tokens in seen: + raise ContractError(f"duplicate sample token point {tokens}") + seen.add(tokens) + _typed(point["point_id"], "point", f"{path}.point_id") + _typed(point["evidence_id"], "evidence", f"{path}.evidence_id") + components = _keys(point["components"], {"combine", "dispatch", "roundtrip"}, f"{path}.components") + for name, component_value in components.items(): + component = _keys( + component_value, {"availability", "sample_count", "trials"}, + f"{path}.components.{name}", + ) + availability = component["availability"] + count = _integer(component["sample_count"], f"{path}.components.{name}.sample_count") + trials = component["trials"] + if availability == "unavailable": + if count != 0 or trials is not None or name == "roundtrip": + raise ContractError(f"{path}.components.{name} has invalid unavailability") + continue + if availability != "measured" or not isinstance(trials, list) or len(trials) != 64: + raise 
ContractError(f"{path}.components.{name} must contain 64 measured trials") + if any(not isinstance(trial, list) or len(trial) != 8 for trial in trials): + raise ContractError(f"{path}.components.{name} trials must each contain 8 samples") + flattened = [ + _number(sample, f"{path}.components.{name}.trials", minimum=0.0) + for trial in trials for sample in trial + ] + if count != 512 or len(flattened) != 512: + raise ContractError(f"{path}.components.{name} must contain 512 samples") + sample_base = {"components": components, "tokens_per_rank": tokens} + if point["sample_sha256"] != _sha256_json(sample_base): + raise ContractError(f"{path}.sample_sha256 differs") + return doc + + +def _validate_component( + component_value: Any, + sample_component: dict[str, Any] | None, + path: str, + *, + derived: bool = False, +) -> None: + component = _keys( + component_value, {"availability", "origin", "percentiles_us", "sample_count"}, path + ) + availability = component["availability"] + if availability == "unavailable": + if component != { + "availability": "unavailable", "origin": None, + "percentiles_us": None, "sample_count": 0, + }: + raise ContractError(f"{path} has invalid unavailable representation") + if sample_component and sample_component["availability"] != "unavailable": + raise ContractError(f"{path} disagrees with samples") + return + expected_availability = "derived" if derived else "measured" + expected_origin = "derived-percentile-sum" if derived else "measured" + if availability != expected_availability or component["origin"] != expected_origin: + raise ContractError(f"{path} has invalid availability/origin") + percentiles = _keys(component["percentiles_us"], set(PERCENTILES), f"{path}.percentiles_us") + if derived: + if component["sample_count"] != 0: + raise ContractError(f"{path}.sample_count must be zero for a derived value") + return + if sample_component is None or sample_component["availability"] != "measured": + raise ContractError(f"{path} lacks 
measured sample evidence") + flattened = [sample for trial in sample_component["trials"] for sample in trial] + if component["sample_count"] != len(flattened): + raise ContractError(f"{path}.sample_count differs from exact samples") + for name, percentile in zip(PERCENTILES, (50, 90, 95, 99), strict=True): + _close(percentiles[name], _nearest_rank(flattened, percentile), f"{path}.{name}") + + +def _validate_oracle(value: Any, path: str) -> dict[str, Any]: + oracle = _keys( + value, + {"atol", "checks", "combine_weight_semantics", "contract", "dispatch_sha256", + "max_absolute_error", "max_elementwise_relative_error", "max_relative_error", + "max_weight_error", "order_sha256", "ordering_contract", "passed", "receive_count", + "rtol"}, + path, + ) + if oracle["contract"] != identity.V1_CASE_PROFILE["oracle_contract"]: + raise ContractError(f"{path}.contract differs") + checks = _keys( + oracle["checks"], + {"combine_values", "counts", "metadata", "multiplicity", "payload", "source_set", + "weights"}, + f"{path}.checks", + ) + if any(type(value) is not bool for value in checks.values()): + raise ContractError(f"{path}.checks must be boolean") + if type(oracle["passed"]) is not bool: + raise ContractError(f"{path}.passed must be boolean") + _integer(oracle["receive_count"], f"{path}.receive_count") + _text(oracle["ordering_contract"], f"{path}.ordering_contract") + if oracle["combine_weight_semantics"] != "unweighted-rank-sum": + raise ContractError(f"{path}.combine_weight_semantics differs from v1") + _close(oracle["rtol"], 5e-2, f"{path}.rtol") + _close(oracle["atol"], 2e-2, f"{path}.atol") + for field in ("dispatch_sha256", "order_sha256"): + digest = oracle[field] + if digest is not None and ( + not isinstance(digest, str) or len(digest) != 64 + or any(character not in "0123456789abcdef" for character in digest) + ): + raise ContractError(f"{path}.{field} is not a SHA-256 digest") + for field in ( + "max_absolute_error", "max_elementwise_relative_error", 
"max_relative_error", + "max_weight_error", + ): + if oracle[field] is not None: + _number(oracle[field], f"{path}.{field}", minimum=0.0) + expected_pass = ( + all(checks.values()) + and oracle["max_relative_error"] is not None + and oracle["max_relative_error"] < 5e-2 + ) + if oracle["passed"] != expected_pass: + raise ContractError(f"{path}.passed differs from its evidence") + return oracle + + +def validate_raw_document(document: Any, samples_document: Any) -> dict[str, Any]: + """Validate identities, exact samples, formulas, privacy, and the native raw shape.""" + _validate_native_schema("raw-case-v1.schema.json", document) + doc = _keys( + document, + {"case", "format", "generated_at", "identity", "implementation", "measurement", + "outcome", "provenance", "record_type", "runtime_fingerprint", "sample_artifact", + "schema_version", "topology", "workload"}, + "raw", + ) + _finite_tree(doc) + if doc["format"] != RAW_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "case-attempt": + raise ContractError("raw format/schema/record type differs from v1") + _text(doc["generated_at"], "raw.generated_at") + identifiers = _keys( + doc["identity"], + {"allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", "case_factors", + "case_id", "series_factors", "series_id"}, + "raw.identity", + ) + for field, kind in ( + ("allocation_id", "allocation"), ("attempt_id", "attempt"), + ("case_id", "case"), ("series_id", "series"), + ): + _typed(identifiers[field], kind, f"raw.identity.{field}") + ordinal = _integer(identifiers["attempt_ordinal"], "raw.identity.attempt_ordinal", minimum=1) + allocation_factors = _keys( + identifiers["allocation_factors"], ALLOCATION_FACTOR_FIELDS, + "raw.identity.allocation_factors", + ) + case_factors = _keys( + identifiers["case_factors"], {"case", "profile", "sku"}, + "raw.identity.case_factors", + ) + scheduled_case = _keys( + case_factors["case"], TERMINAL_CASE_FIELDS, "raw.identity.case_factors.case" + ) + if 
case_factors["profile"] != identity.V1_CASE_PROFILE: + raise ContractError("raw case profile differs from CollectiveX v1") + _text(case_factors["sku"], "raw.identity.case_factors.sku") + series_factors = _keys( + identifiers["series_factors"], + {"backend", "case_id", "image_digest", "implementation_contract_sha256", + "public_config_sha256", "routing_control_sha256", + "runtime_fingerprint_sha256", "source_sha", "squash_sha256", "workload_id"}, + "raw.identity.series_factors", + ) + if identity.allocation_id(identifiers["allocation_factors"]) != identifiers["allocation_id"]: + raise ContractError("allocation identity differs") + if identity.digest("case", identifiers["case_factors"]) != identifiers["case_id"]: + raise ContractError("case identity differs") + if identity.series_id(identifiers["series_factors"]) != identifiers["series_id"]: + raise ContractError("series identity differs") + if identity.attempt_id( + allocation=identifiers["allocation_id"], case=identifiers["case_id"], ordinal=ordinal + ) != identifiers["attempt_id"]: + raise ContractError("attempt identity differs") + + samples = validate_samples_document(samples_document) + for field in ("allocation_id", "attempt_id", "case_id", "series_id"): + if samples[field] != identifiers[field]: + raise ContractError(f"samples.{field} differs from raw identity") + sample_by_token = {point["tokens_per_rank"]: point for point in samples["points"]} + + case = _keys( + doc["case"], + {"attempt_ordinal", "backend", "eplb", "ep_size", "mode", "phase", + "required_publication", "resource_mode", "runner", "shape", "suite", "workload_name"}, + "raw.case", + ) + ep_size = _integer(case["ep_size"], "raw.case.ep_size", minimum=1) + if case["attempt_ordinal"] != ordinal: + raise ContractError("case attempt ordinal differs") + for field in ("backend", "mode", "phase", "required_publication", "resource_mode", "runner", + "suite", "workload_name"): + _text(case[field], f"raw.case.{field}") + shape = _keys( + case["shape"], + 
{"activation_profile", "dispatch_dtype", "eplb", "experts", "experts_per_rank", + "hidden", "kernel_gen", "num_logical_experts", "quant", "routing", "topk"}, + "raw.case.shape", + ) + hidden = _integer(shape["hidden"], "raw.case.shape.hidden", minimum=1) + topk = _integer(shape["topk"], "raw.case.shape.topk", minimum=1) + physical_experts = _integer( + shape["experts"], "raw.case.shape.experts", minimum=1 + ) + logical_experts = _integer( + shape["num_logical_experts"], + "raw.case.shape.num_logical_experts", + minimum=1, + ) + experts_per_rank = _integer( + shape["experts_per_rank"], "raw.case.shape.experts_per_rank", minimum=1 + ) + quant = _keys( + shape["quant"], + {"combine_accum_dtype", "combine_input_dtype", "combine_output_dtype", + "combine_quant_mode", "scale_layout"}, + "raw.case.shape.quant", + ) + eplb = _keys( + case["eplb"], + {"enabled", "imbalance_after", "imbalance_before", "mapping_hash", "max_replicas", + "num_logical_experts", "num_physical_experts", "num_redundant", "planner", + "reference_tokens_per_rank", "replicated_experts"}, + "raw.case.eplb", + ) + if not isinstance(eplb["enabled"], bool): + raise ContractError("raw.case.eplb.enabled must be boolean") + expected_redundant = ( + identity.V1_CASE_PROFILE["eplb_redundant_experts"] if eplb["enabled"] else 0 + ) + expected_physical = eplb_contract.physical_count( + scheduled_case["experts"], expected_redundant, ep_size + ) + if ( + shape["eplb"] != eplb["enabled"] + or logical_experts != scheduled_case["experts"] + or physical_experts != expected_physical + or experts_per_rank * ep_size != physical_experts + or eplb["num_logical_experts"] != logical_experts + or eplb["num_physical_experts"] != physical_experts + or eplb["num_redundant"] != expected_redundant + ): + raise ContractError("raw EPLB/shape dimensions differ from the frozen profile") + if eplb["enabled"]: + expected_plan = _expected_eplb_plan( + scheduled_case["routing"], + topk, + logical_experts, + physical_experts, + ep_size, + 
identity.V1_CASE_PROFILE["seed"], + identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"], + ) + expected_eplb = { + "enabled": True, + "imbalance_after": expected_plan["imbalance_after"], + "imbalance_before": expected_plan["imbalance_before"], + "mapping_hash": eplb_contract.mapping_hash(expected_plan), + "max_replicas": expected_plan["max_replicas"], + "num_logical_experts": logical_experts, + "num_physical_experts": physical_experts, + "num_redundant": expected_redundant, + "planner": identity.V1_CASE_PROFILE["eplb_planner"], + "reference_tokens_per_rank": identity.V1_CASE_PROFILE[ + "eplb_reference_tokens_per_rank" + ], + "replicated_experts": expected_plan["replicated_experts"], + } + else: + expected_eplb = { + "enabled": False, + "imbalance_after": None, + "imbalance_before": None, + "mapping_hash": None, + "max_replicas": None, + "num_logical_experts": logical_experts, + "num_physical_experts": physical_experts, + "num_redundant": 0, + "planner": None, + "reference_tokens_per_rank": None, + "replicated_experts": 0, + } + _equivalent(eplb, expected_eplb, "raw.case.eplb", tolerance=1e-9) + if case_factors["sku"] != case["runner"]: + raise ContractError("raw case runner differs from case identity") + + workload = _keys( + doc["workload"], + {"activation_generator", "activation_identity", "activation_profile", + "cross_rank_consistent", "manifest_checksums", "members", "routing_generator", "source", + "trace_hashes", "trace_signature", "workload_id"}, + "raw.workload", + ) + if workload["source"] not in {"canonical-serialized", "seeded-runtime"}: + raise ContractError("raw workload source is invalid") + if workload["source"] == "canonical-serialized": + _typed(workload["workload_id"], "workload", "raw.workload.workload_id") + members = workload["members"] + checksums = workload["manifest_checksums"] + if ( + not isinstance(members, list) + or not members + or members != sorted(set(members)) + or not all(identity.is_typed_id(member, "workload") for member 
in members) + or not isinstance(checksums, dict) + or set(checksums) != set(members) + ): + raise ContractError("raw canonical workload members/checksums are invalid") + for member, values in checksums.items(): + if ( + not isinstance(values, dict) + or set(values) != {"topk_idx", "topk_weights", "trace"} + or any(not re.fullmatch(r"[0-9a-f]{64}", str(value)) for value in values.values()) + ): + raise ContractError(f"raw canonical workload checksums differ for {member}") + expected_workload_id = identity.workload_id({ + "members": [ + {"checksums": checksums[member], "workload_id": member} + for member in members + ] + }) + if workload["workload_id"] != expected_workload_id: + raise ContractError("raw composite workload identity differs from its members") + elif any(workload[field] is not None for field in ("members", "manifest_checksums", "workload_id")): + raise ContractError("raw seeded workload cannot claim serialized members") + if workload["cross_rank_consistent"] is not True: + raise ContractError("raw workload is not consistent across ranks") + + measurement = _keys( + doc["measurement"], + {"component_order_contract", "conditioning", "contract", "rows", + "sampling", "source_allocation"}, + "raw.measurement", + ) + validate_conditioning_contract(measurement["conditioning"], case["phase"]) + sampling = _keys( + measurement["sampling"], + {"contract", "iterations_per_trial", "percentile_method", "reduction", + "samples_per_component", "trials", "warmup_iterations", "warmup_semantics"}, + "raw.measurement.sampling", + ) + expected_sampling = { + "contract": identity.V1_CASE_PROFILE["sampling_contract"], "iterations_per_trial": 8, + "percentile_method": identity.V1_CASE_PROFILE["percentile_method"], + "reduction": identity.V1_CASE_PROFILE["rank_reduction"], + "samples_per_component": 512, "trials": 64, "warmup_iterations": 32, + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + } + if sampling != expected_sampling: + raise 
ContractError("raw sampling contract differs from fixed-512-v1") + profile = identity.V1_CASE_PROFILE + if ( + case["mode"] != profile["mode"] + or case["resource_mode"] != profile["resource_mode"] + or measurement["contract"] != profile["contract"] + or measurement["component_order_contract"] != profile["component_order_contract"] + or measurement["source_allocation"] != "even" + or shape["activation_profile"] != profile["activation_profile"] + or shape["dispatch_dtype"] != profile["dtype"] + or quant["combine_input_dtype"] != profile["combine_dtype"] + or quant["combine_output_dtype"] != profile["combine_dtype"] + or quant["combine_quant_mode"] != profile["combine_quant_mode"] + or quant["scale_layout"] is not None + or workload["activation_generator"] != profile["activation_generator"] + or workload["activation_profile"] != profile["activation_profile"] + or workload["routing_generator"] != profile["routing_generator"] + ): + raise ContractError("raw case differs from the frozen v1 profile") + expected_activation = hashlib.sha256( + ( + f"counter|seed={profile['seed']}|hidden={hidden}|" + f"gen={profile['activation_generator']}" + ).encode() + ).hexdigest() + if workload["activation_identity"] != expected_activation: + raise ContractError("raw activation identity differs from the frozen seed/profile") + rows = measurement["rows"] + if not isinstance(rows, list) or not rows: + raise ContractError("raw.measurement.rows must be non-empty") + seen_points = set() + row_tokens = [] + recomputed_anomalies = 0 + for index, row_value in enumerate(rows): + path = f"raw.measurement.rows[{index}]" + row = _keys( + row_value, + {"anomalies", "components", "correctness", "evidence_id", "global_tokens", + "logical_bytes", "point_id", "receive", "routing", + "sample_histograms", "sample_sha256", "token_rate_at_latency_percentile", + "tokens_per_rank"}, + path, + ) + tokens = _integer(row["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1) + row_tokens.append(tokens) + if 
tokens in seen_points or tokens not in sample_by_token: + raise ContractError(f"{path} token point is duplicate or missing samples") + seen_points.add(tokens) + if row["global_tokens"] != tokens * ep_size: + raise ContractError(f"{path}.global_tokens formula differs") + sample_point = sample_by_token[tokens] + expected_point = identity.point_id(series=identifiers["series_id"], tokens_per_rank=tokens) + if row["point_id"] != expected_point or sample_point["point_id"] != expected_point: + raise ContractError(f"{path}.point_id differs") + expected_evidence = identity.evidence_id( + point=expected_point, allocation=identifiers["allocation_id"], + attempt=identifiers["attempt_id"], sample_sha256=sample_point["sample_sha256"], + ) + if row["evidence_id"] != expected_evidence or sample_point["evidence_id"] != expected_evidence: + raise ContractError(f"{path}.evidence_id differs") + if row["sample_sha256"] != sample_point["sample_sha256"]: + raise ContractError(f"{path}.sample_sha256 differs") + components = _keys( + row["components"], {"combine", "dispatch", "isolated_sum", "roundtrip"}, + f"{path}.components", + ) + for name in ("combine", "dispatch", "roundtrip"): + _validate_component( + components[name], sample_point["components"][name], f"{path}.components.{name}" + ) + _validate_component( + components["isolated_sum"], None, f"{path}.components.isolated_sum", derived=True + ) + _, _, _, expected_indices, expected_weights = _expected_canonical_trace( + scheduled_case["routing"], + hidden, + topk, + logical_experts, + physical_experts, + ep_size, + tokens, + profile["seed"], + eplb["enabled"], + profile["eplb_reference_tokens_per_rank"], + ) + expected_routing = _expected_routing_summary( + expected_indices, + expected_weights, + physical_experts=physical_experts, + ep_size=ep_size, + tokens_per_rank=tokens, + gpus_per_node=scheduled_case["gpus_per_node"], + scale_up_domain=scheduled_case["scale_up_domain"], + ) + _equivalent( + row["routing"], expected_routing, 
f"{path}.routing", tolerance=1e-5 + ) + expected_payload_counts = expected_routing["payload_copies_per_rank"] + throughput = _keys( + row["token_rate_at_latency_percentile"], set(PERCENTILES), + f"{path}.token_rate_at_latency_percentile", + ) + for percentile in PERCENTILES: + latency = components["roundtrip"]["percentiles_us"][percentile] + if latency <= 0: + raise ContractError(f"{path} roundtrip latency must be positive") + _close( + throughput[percentile], row["global_tokens"] / (latency * 1e-6), + f"{path}.token_rate_at_latency_percentile.{percentile}", 1e-9, + ) + correctness = _keys( + row["correctness"], + {"contract", "max_relative_error", "passed", "rank_evidence", "scope"}, + f"{path}.correctness", + ) + if ( + correctness["contract"] != identity.V1_CASE_PROFILE["oracle_contract"] + or correctness["scope"] != "dispatch-metadata-and-transformed-combine" + or type(correctness["passed"]) is not bool + ): + raise ContractError(f"{path}.correctness contract differs") + _number( + correctness["max_relative_error"], + f"{path}.correctness.max_relative_error", + minimum=0.0, + ) + rank_evidence = correctness["rank_evidence"] + if not isinstance(rank_evidence, list) or len(rank_evidence) != ep_size: + raise ContractError(f"{path}.correctness.rank_evidence must cover every rank") + ranks = set() + observed_max_error = 0.0 + evidence_passed = True + for evidence_index, evidence_value in enumerate(rank_evidence): + evidence_path = f"{path}.correctness.rank_evidence[{evidence_index}]" + evidence = _keys( + evidence_value, + {"input_unchanged", "order_stable", "post_timing", "pre_timing", "rank"}, + evidence_path, + ) + evidence_rank = _integer(evidence["rank"], f"{evidence_path}.rank") + if evidence_rank >= ep_size: + raise ContractError(f"{evidence_path}.rank is outside the EP group") + ranks.add(evidence_rank) + if type(evidence["input_unchanged"]) is not bool or type(evidence["order_stable"]) is not bool: + raise ContractError(f"{evidence_path} stability fields 
must be boolean") + pre = _validate_oracle(evidence["pre_timing"], f"{evidence_path}.pre_timing") + post = _validate_oracle(evidence["post_timing"], f"{evidence_path}.post_timing") + if ( + pre["receive_count"] != expected_payload_counts[evidence_rank] + or post["receive_count"] != expected_payload_counts[evidence_rank] + ): + raise ContractError( + f"{evidence_path}.receive_count differs from canonical routing" + ) + expected_stability = all( + pre[field] == post[field] + for field in ("ordering_contract", "order_sha256", "dispatch_sha256") + ) + if evidence["order_stable"] != expected_stability: + raise ContractError(f"{evidence_path}.order_stable differs from the evidence") + errors = [ + oracle["max_relative_error"] + for oracle in (pre, post) + if oracle["max_relative_error"] is not None + ] + observed_max_error = max([observed_max_error, *errors]) + evidence_passed = evidence_passed and all( + (evidence["input_unchanged"], evidence["order_stable"], pre["passed"], post["passed"]) + ) + if ranks != set(range(ep_size)) or correctness["passed"] != evidence_passed: + raise ContractError(f"{path}.correctness rank coverage or outcome differs") + _close( + correctness["max_relative_error"], observed_max_error, + f"{path}.correctness.max_relative_error", + ) + if components["dispatch"]["availability"] == "measured": + for percentile in PERCENTILES: + expected = ( + components["dispatch"]["percentiles_us"][percentile] + + components["combine"]["percentiles_us"][percentile] + ) + _close( + components["isolated_sum"]["percentiles_us"][percentile], expected, + f"{path}.components.isolated_sum.{percentile}", + ) + routed_copies = expected_routing["routed_copies"] + expected_bytes = routed_copies * hidden * 2 + expected_logical = { + "combine": expected_bytes, + "dispatch": expected_bytes, + "roundtrip": expected_bytes * 2, + } + _equivalent(row["logical_bytes"], expected_logical, f"{path}.logical_bytes") + + max_receive = max(expected_payload_counts) + expected_receive = { 
+ "max": max_receive, + "mean": sum(expected_payload_counts) / ep_size, + "min": min(expected_payload_counts), + "total": sum(expected_payload_counts), + } + _equivalent(row["receive"], expected_receive, f"{path}.receive") + expected_histograms = { + name: ( + _expected_histogram([ + sample + for trial in sample_point["components"][name]["trials"] + for sample in trial + ]) + if sample_point["components"][name]["availability"] == "measured" + else None + ) + for name in ("dispatch", "combine", "roundtrip") + } + _equivalent( + row["sample_histograms"], expected_histograms, f"{path}.sample_histograms" + ) + expected_anomalies = _expected_anomalies(tokens, components) + _equivalent(row["anomalies"], expected_anomalies, f"{path}.anomalies") + recomputed_anomalies += len(expected_anomalies) + if seen_points != set(sample_by_token): + raise ContractError("raw rows and sample points differ") + if row_tokens != sorted(row_tokens): + raise ContractError("raw rows must follow the scheduled token ladder") + expected_trace_hashes = sorted(row["routing"]["hash"] for row in rows) + if workload["trace_hashes"] != expected_trace_hashes: + raise ContractError("raw workload trace hashes differ from measured rows") + expected_trace_signature = hashlib.sha256( + "|".join(expected_trace_hashes).encode() + ).hexdigest() + if workload["trace_signature"] != expected_trace_signature: + raise ContractError("raw workload trace signature differs from measured rows") + + implementation = _keys( + doc["implementation"], {"kernel_generation", "name", "provenance", "resource_profile"}, + "raw.implementation", + ) + if ( + implementation["name"] != case["backend"] + or implementation["kernel_generation"] != shape["kernel_gen"] + ): + raise ContractError("raw implementation identity differs from the case") + provenance_fields = _obj(implementation["provenance"], "raw.implementation.provenance") + unknown = set(provenance_fields) - PROVENANCE_KEYS + if unknown: + raise ContractError(f"raw 
implementation provenance has unknown fields {sorted(unknown)}") + if ( + implementation["name"] == "deepep-v2" + and provenance_fields.get("deterministic") is not False + ): + raise ContractError("DeepEP V2 deterministic mode differs from the v1 kernel contract") + if implementation["name"] == "deepep-v2" and ( + _integer( + provenance_fields.get("tuning_num_experts"), + "raw.implementation.provenance.tuning_num_experts", + minimum=1, + ) != logical_experts + or _integer( + provenance_fields.get("num_experts"), + "raw.implementation.provenance.num_experts", + minimum=1, + ) != physical_experts + ): + raise ContractError("DeepEP V2 expert-count provenance differs from the case") + if implementation["name"] == "deepep-hybrid": + realized_config = provenance_fields.get("realized_config") + jit_kernel_keys = provenance_fields.get("jit_kernel_keys") + jit_shared_objects = provenance_fields.get("jit_shared_objects") + if ( + not _hybrid_realized_config_is_valid(realized_config) + or not _hybrid_jit_evidence_is_valid(jit_shared_objects, jit_kernel_keys) + or realized_config["hidden_dim"] != shape["hidden"] + or realized_config["num_of_experts_per_rank"] * ep_size != physical_experts + or realized_config["num_of_ranks_per_node"] != ep_size + or realized_config["num_of_nodes"] != 1 + or realized_config["token_data_type"] != "UINT16" + or any( + len(artifact["rank_artifacts"]) != ep_size + for artifact in jit_shared_objects + ) + ): + raise ContractError("DeepEP Hybrid realized config/JIT evidence differs from the case") + if implementation["name"] == "nccl-ep" and implementation["kernel_generation"] != ( + collective_kernel_generation(provenance_fields.get("collective_library")) + ): + raise ContractError("NCCL/RCCL kernel generation differs from collective lineage") + resource_profile = _obj( + implementation["resource_profile"], "raw.implementation.resource_profile" + ) + expected_resource_profile = project_resource_profile(provenance_fields) + if resource_profile != 
expected_resource_profile: + raise ContractError("raw resource profile differs from implementation provenance") + topology = _keys( + doc["topology"], + {"device_count", "device_product", "gpus_per_node", "nodes", "placement", + "realized_placement", "scale_up_domain", "topology_class", "transport", "world_size"}, + "raw.topology", + ) + for field in ("device_count", "gpus_per_node", "nodes", "scale_up_domain", "world_size"): + _integer(topology[field], f"raw.topology.{field}", minimum=1) + realized = _keys( + topology["realized_placement"], + {"gpus_per_node", "nodes", "ranks_per_node", "unique_local_ranks", "valid"}, + "raw.topology.realized_placement", + ) + if realized != { + "gpus_per_node": topology["gpus_per_node"], + "nodes": topology["nodes"], + "ranks_per_node": topology["gpus_per_node"], + "unique_local_ranks": True, + "valid": True, + }: + raise ContractError("raw realized placement differs from requested topology") + if ( + topology["world_size"] != ep_size + or topology["nodes"] * topology["gpus_per_node"] != ep_size + or topology["device_count"] != topology["gpus_per_node"] + or topology["placement"] != profile["placement"] + or topology["scale_up_domain"] < ep_size + ): + raise ContractError("raw topology dimensions differ from the case") + if implementation["name"] == "deepep-v2": + if ( + provenance_fields.get("allow_hybrid_mode"), + provenance_fields.get("gin_enabled"), + provenance_fields.get("communication_backend"), + ) != (False, False, "nccl-device-lsa"): + raise ContractError("DeepEP V2 communication policy differs from the v1 contract") + lsa_topology = tuple( + _integer( + provenance_fields.get(field), + f"raw.implementation.provenance.{field}", + minimum=1, + ) + for field in ( + "physical_rdma_ranks", "physical_nvlink_ranks", + "logical_scaleout_ranks", "logical_scaleup_ranks", + ) + ) + if lsa_topology != (1, ep_size, 1, ep_size): + raise ContractError("DeepEP V2 no-GIN provenance is outside one realized LSA domain") + runtime = _keys( 
+ doc["runtime_fingerprint"], + {"accelerator_runtime", "collective_library", "device", "driver_version", "framework", + "machine", "python_version", "vendor"}, + "raw.runtime_fingerprint", + ) + for field in ("machine", "python_version", "vendor"): + _text(runtime[field], f"raw.runtime_fingerprint.{field}") + runtime_device = _keys( + runtime["device"], {"arch", "compute_units", "memory_bytes", "product", "warp_size"}, + "raw.runtime_fingerprint.device", + ) + if topology["device_product"] != runtime_device["product"]: + raise ContractError("raw topology and runtime device products differ") + platform = capability.PLATFORMS.get(case["runner"]) + if platform is not None: + identity_issues = capability.runtime_identity_issues( + case["runner"], vendor=runtime["vendor"], arch=runtime_device["arch"], + machine=runtime["machine"], device_name=runtime_device["product"], + device_count=topology["device_count"], world_size=topology["world_size"], + ) + expected_topology_class = ( + f"{case['runner']}-nvl72-mnnvl" + if case["runner"] in {"gb200", "gb300"} + else f"{case['runner']}-xgmi" + if platform["vendor"] == "amd" + else f"{platform['product']}-nvlink-island" + ) + if identity_issues or ( + topology["transport"] != platform["transport"] + or topology["gpus_per_node"] != platform["gpus_per_node"] + or topology["scale_up_domain"] != platform["scale_up_domain"] + or topology["topology_class"] != expected_topology_class + ): + raise ContractError( + "raw runtime/topology differs from the scheduled SKU: " + + "; ".join(identity_issues) + ) + raw_provenance = _keys( + doc["provenance"], {"command", "distributed_launcher", "git_run", "image", "redaction"}, + "raw.provenance", + ) + image = _keys( + raw_provenance["image"], + {"arch", "digest", "digest_verified", "reference", "squash_sha256"}, + "raw.provenance.image", + ) + if ( + image["digest_verified"] is not True + or not isinstance(image["digest"], str) + or not re.fullmatch(r"sha256:[0-9a-f]{64}", image["digest"]) + ): 
+ raise ContractError("raw image digest was not registry-verified") + if raw_provenance["redaction"] != "sanitized-v1": + raise ContractError("raw provenance redaction contract differs") + git_run = raw_provenance["git_run"] + if git_run is not None: + git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run") + expected_provenance_complete = provenance_complete( + provenance_fields, + case["backend"], + git_run, + image_digest=image["digest"], + image_verified=image["digest_verified"], + squash_sha256=image["squash_sha256"], + ) + + actual_scheduled_case = { + "backend": case["backend"], + "canonical": workload["source"] == "canonical-serialized", + "eplb": eplb["enabled"], + "ep": ep_size, + "experts": shape["num_logical_experts"], + "gpus_per_node": topology["gpus_per_node"], + "hidden": hidden, + "ladder": " ".join(map(str, row_tokens)), + "nodes": topology["nodes"], + "phase": case["phase"], + "required_publication": case["required_publication"], + "routing": shape["routing"], + "samples_per_point": sampling["samples_per_component"], + "scale_up_domain": topology["scale_up_domain"], + "suite": case["suite"], + "timing": ( + f"{sampling['iterations_per_trial']}:{sampling['trials']}:" + f"{sampling['warmup_iterations']}" + ), + "topk": shape["topk"], + "warmup_semantics": sampling["warmup_semantics"], + "workload": case["workload_name"], + } + if scheduled_case != actual_scheduled_case: + mismatches = sorted( + field for field in scheduled_case + if scheduled_case[field] != actual_scheduled_case[field] + ) + raise ContractError(f"raw data differs from scheduled case fields {mismatches}") + + if workload["source"] == "canonical-serialized": + _validate_canonical_workload(workload, scheduled_case, rows, eplb) + + expected_series = { + "backend": case["backend"], + "case_id": identifiers["case_id"], + "image_digest": image["digest"], + "implementation_contract_sha256": _sha256_json({ + "kernel_generation": implementation["kernel_generation"], + "name": 
implementation["name"], + "provenance": series_provenance(provenance_fields), + "resource_profile": resource_profile, + }), + "public_config_sha256": public_series_config_sha256(public_series_config( + kernel_generation=implementation["kernel_generation"], + provenance=provenance_fields, + resource_profile=resource_profile, + resource_mode=case["resource_mode"], + device_product=topology["device_product"], + )), + "routing_control_sha256": routing_implementation_control_sha256(implementation), + "runtime_fingerprint_sha256": _sha256_json(runtime), + "source_sha": git_run["source_sha"] if git_run is not None else None, + "squash_sha256": image["squash_sha256"], + "workload_id": workload["workload_id"] or workload["trace_signature"], + } + if series_factors != expected_series: + raise ContractError("raw series factors differ from measured implementation/runtime") + expected_allocation = { + "artifact": git_run["artifact"] if git_run is not None else None, + "execution_id": allocation_factors["execution_id"], + "job": git_run["job"] if git_run is not None else None, + "repo": git_run["repo"] if git_run is not None else None, + "run_attempt": git_run["run_attempt"] if git_run is not None else None, + "run_id": git_run["run_id"] if git_run is not None else None, + "runner": case["runner"], + "source_sha": git_run["source_sha"] if git_run is not None else None, + } + if allocation_factors != expected_allocation: + raise ContractError("raw allocation factors differ from provenance") + artifact = _keys(doc["sample_artifact"], {"bytes", "format", "path", "sha256"}, "raw.sample_artifact") + if artifact["format"] != SAMPLES_FORMAT or Path(artifact["path"]).name != artifact["path"]: + raise ContractError("raw.sample_artifact format/path is invalid") + if not isinstance(artifact["sha256"], str) or len(artifact["sha256"]) != 64: + raise ContractError("raw.sample_artifact.sha256 is invalid") + _integer(artifact["bytes"], "raw.sample_artifact.bytes", minimum=1) + outcome = 
_keys(doc["outcome"], {"publication_status", "reasons", "status", "validity"}, "raw.outcome") + if outcome["status"] not in {"success", "invalid"} or outcome["publication_status"] not in {"diagnostic", "invalid"}: + raise ContractError("raw outcome status is invalid") + if not isinstance(outcome["reasons"], list) or not all(isinstance(x, str) for x in outcome["reasons"]): + raise ContractError("raw outcome reasons must be strings") + validity = _keys( + outcome["validity"], + {"anomaly_free", "execution_status", "measurement_conformance", "provenance_complete", + "resource_conformance", "sampling_conformance", "semantic_correctness", + "workload_identity", "workload_source"}, + "raw.outcome.validity", + ) + correctness_passed = all(row["correctness"]["passed"] for row in rows) + workload_consistent = workload["cross_rank_consistent"] is True + expected_status = "success" if correctness_passed and workload_consistent else "invalid" + expected_publication = "diagnostic" if expected_status == "success" else "invalid" + if ( + outcome["status"] != expected_status + or outcome["publication_status"] != expected_publication + or bool(outcome["reasons"]) == (expected_status == "success") + or validity["execution_status"] != "complete" + or validity["semantic_correctness"] != ("pass" if correctness_passed else "fail") + or validity["workload_identity"] != ( + "consistent-across-ranks" if workload_consistent else "inconsistent" + ) + or validity["workload_source"] != workload["source"] + or validity["measurement_conformance"] != "conformant" + or validity["sampling_conformance"] != "conformant" + or validity["resource_conformance"] != resource_profile["conformance_class"] + or validity["anomaly_free"] != (recomputed_anomalies == 0) + or validity["provenance_complete"] is not expected_provenance_complete + ): + raise ContractError("raw outcome differs from its measurement evidence") + artifact_safety.assert_publication_safe([doc]) + return doc + + +def make_terminal_document( 
+ *, + allocation_factors: dict[str, Any], + attempt_ordinal: int, + case: dict[str, Any], + case_factors: dict[str, Any], + control_sha256: str | None, + failure_mode: str, + generated_at: str, + git_run: dict[str, Any], + reason: str, + return_code: int, + source: str, + status: str, + expected_case_id: str | None = None, +) -> dict[str, Any]: + """Build and self-validate one attributable non-success attempt.""" + case_id = identity.digest("case", case_factors) + if expected_case_id is not None and expected_case_id != case_id: + raise ContractError( + f"scheduled case ID differs from terminal factors: {expected_case_id} != {case_id}" + ) + allocation_id = identity.allocation_id(allocation_factors) + attempt_id = identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=attempt_ordinal + ) + document = { + "format": TERMINAL_FORMAT, + "schema_version": 1, + "record_type": "terminal-outcome", + "generated_at": generated_at, + "identity": { + "allocation_factors": allocation_factors, + "allocation_id": allocation_id, + "attempt_id": attempt_id, + "attempt_ordinal": attempt_ordinal, + "case_factors": case_factors, + "case_id": case_id, + }, + "case": case, + "provenance": { + "git_run": git_run, + "control_sha256": control_sha256, + "redaction": "sanitized-v1", + "source": source, + }, + "outcome": { + "status": status, + "failure_mode": failure_mode, + "reason": reason, + "return_code": return_code, + }, + } + return validate_terminal_document(document) + + +def validate_terminal_document(document: Any) -> dict[str, Any]: + _validate_native_schema("terminal-outcome-v1.schema.json", document) + doc = _keys( + document, + {"case", "format", "generated_at", "identity", "outcome", "provenance", "record_type", + "schema_version"}, + "terminal", + ) + if doc["format"] != TERMINAL_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "terminal-outcome": + raise ContractError("terminal format/schema/record type differs from v1") + ids = 
_keys(doc["identity"], { + "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", + "case_factors", "case_id", + }, "terminal.identity") + for field, kind in (("allocation_id", "allocation"), ("attempt_id", "attempt"), ("case_id", "case")): + _typed(ids[field], kind, f"terminal.identity.{field}") + ordinal = _integer(ids["attempt_ordinal"], "terminal.identity.attempt_ordinal", minimum=1) + case = _keys(doc["case"], TERMINAL_CASE_FIELDS, "terminal.case") + factors = _keys(ids["case_factors"], {"case", "profile", "sku"}, "terminal.identity.case_factors") + if factors["case"] != case or factors["profile"] != identity.V1_CASE_PROFILE: + raise ContractError("terminal case factors differ from the scheduled case/profile") + _text(factors["sku"], "terminal.identity.case_factors.sku") + allocation = _keys( + ids["allocation_factors"], ALLOCATION_FACTOR_FIELDS, + "terminal.identity.allocation_factors", + ) + expected_case = identity.digest("case", factors) + expected_allocation = identity.allocation_id(allocation) + expected_attempt = identity.attempt_id( + allocation=expected_allocation, case=expected_case, ordinal=ordinal + ) + if (ids["case_id"], ids["allocation_id"], ids["attempt_id"]) != ( + expected_case, expected_allocation, expected_attempt + ): + raise ContractError("terminal typed identities do not match their factors") + provenance = _keys( + doc["provenance"], {"git_run", "control_sha256", "redaction", "source"}, + "terminal.provenance", + ) + git_run = _keys(provenance["git_run"], GIT_RUN_FIELDS, "terminal.provenance.git_run") + control = provenance["control_sha256"] + if control is not None and ( + not isinstance(control, str) or len(control) != 64 + or any(char not in "0123456789abcdef" for char in control) + ): + raise ContractError("terminal control_sha256 is invalid") + if provenance["redaction"] != "sanitized-v1": + raise ContractError("terminal redaction contract differs") + source = _text(provenance["source"], 
"terminal.provenance.source") + outcome = _keys( + doc["outcome"], {"failure_mode", "reason", "return_code", "status"}, "terminal.outcome" + ) + if outcome["status"] not in {"failed", "invalid", "unsupported"}: + raise ContractError("terminal outcome status is invalid") + failure_mode = _text(outcome["failure_mode"], "terminal.outcome.failure_mode") + reason = _text(outcome["reason"], "terminal.outcome.reason") + _integer(outcome["return_code"], "terminal.outcome.return_code") + if source == "runtime-emitter": + expected_runner = factors["sku"] + expected_reason = RUNTIME_FAILURE_REASONS.get(failure_mode) + valid_outcome = outcome["status"] == "failed" and reason == expected_reason + elif source == "post-emit-command": + expected_runner = factors["sku"] + expected_reason = POST_EMIT_FAILURE_REASONS.get(failure_mode) + valid_outcome = outcome["status"] == "failed" and reason == expected_reason + elif source == "matrix-capability-resolver": + expected_runner = "capability-resolver" + valid_outcome = ( + outcome["status"] == "unsupported" + and failure_mode == "capability" + and reason in CAPABILITY_FAILURE_REASONS + ) + else: + raise ContractError("terminal provenance source is not registered") + if not valid_outcome: + raise ContractError("terminal source and outcome are not registered") + expected_allocation = { + "artifact": git_run["artifact"], + "execution_id": allocation["execution_id"], + "job": git_run["job"], + "repo": git_run["repo"], + "run_attempt": git_run["run_attempt"], + "run_id": git_run["run_id"], + "runner": expected_runner, + "source_sha": git_run["source_sha"], + } + if allocation != expected_allocation: + raise ContractError("terminal allocation factors differ from provenance or source") + artifact_safety.assert_publication_safe([doc]) + return doc + + +def load_raw_attempt(path: str | os.PathLike[str]) -> dict[str, Any]: + document = strict_load(path) + artifact = _obj(document, "raw").get("sample_artifact") + artifact = _obj(artifact, 
"raw.sample_artifact") + sample_path = Path(path).with_name(_text(artifact.get("path"), "raw.sample_artifact.path")) + payload = sample_path.read_bytes() + if len(payload) != artifact.get("bytes") or hashlib.sha256(payload).hexdigest() != artifact.get("sha256"): + raise ContractError("sample artifact bytes or digest differ") + samples = strict_load(sample_path) + return validate_raw_document(document, samples) + + +def load_attempt(path: str | os.PathLike[str]) -> dict[str, Any]: + """Fully validate and return one native raw or terminal attempt.""" + document = strict_load(path) + if isinstance(document, dict) and document.get("format") == RAW_FORMAT: + return load_raw_attempt(path) + if isinstance(document, dict) and document.get("format") == TERMINAL_FORMAT: + return validate_terminal_document(document) + raise ContractError("unknown native attempt format") + + +def quarantine_invalid_attempt(path: str | os.PathLike[str]) -> bool: + """Move an invalid attempt and its basename-safe sample outside JSON upload globs.""" + destination = Path(path) + if not destination.is_file(): + return False + try: + load_attempt(destination) + return False + except (ContractError, OSError, ValueError): + try: + document = json.loads(destination.read_bytes()) + except (OSError, json.JSONDecodeError): + document = {} + artifact = document.get("sample_artifact") if isinstance(document, dict) else None + sample_name = artifact.get("path") if isinstance(artifact, dict) else None + if isinstance(sample_name, str) and Path(sample_name).name == sample_name: + sample_path = destination.with_name(sample_name) + if sample_path.is_file(): + os.replace(sample_path, sample_path.with_name(sample_path.name + ".quarantine")) + os.replace(destination, destination.with_name(destination.name + ".quarantine")) + return True + + +def normalize_attempt(document: dict[str, Any]) -> dict[str, Any]: + """Return the publisher-facing projection after native validation.""" + if document.get("format") == 
RAW_FORMAT: + ids = document["identity"] + return { + "allocation_id": ids["allocation_id"], + "attempt_id": ids["attempt_id"], + "case": document["case"], + "case_id": ids["case_id"], + "generated_at": document["generated_at"], + "outcome": document["outcome"], + "points": document["measurement"]["rows"], + "runtime_fingerprint": document["runtime_fingerprint"], + "series_id": ids["series_id"], + } + if document.get("format") == TERMINAL_FORMAT: + ids = document["identity"] + return { + "allocation_id": ids["allocation_id"], + "attempt_id": ids["attempt_id"], + "case": document["case"], + "case_id": ids["case_id"], + "generated_at": document["generated_at"], + "outcome": document["outcome"], + "points": [], + "runtime_fingerprint": None, + "series_id": None, + } + raise ContractError("unknown attempt format") + + +def _env_integer(name: str, default: int) -> int: + try: + return int(os.environ.get(name, str(default))) + except ValueError: + return default + + +def _env_enabled(name: str) -> bool: + return os.environ.get(name, "").lower() in {"1", "true", "yes"} + + +def _terminal_case_from_environment(backend: str, phase: str) -> dict[str, Any]: + ep = _env_integer("CX_EP", _env_integer("CX_NGPUS", 1)) + gpus_per_node = _env_integer("CX_GPUS_PER_NODE", ep) + ladder = os.environ.get("CX_TOKENS_LADDER", "") or ( + "1 2 4 8 16 32 64 128" + if phase == "decode" + else "128 256 512 1024 2048 4096" + ) + return { + "suite": os.environ.get("CX_SUITE") or "manual", + "workload": os.environ.get("CX_WORKLOAD_NAME") or "manual", + "required_publication": os.environ.get("CX_REQUIRED_PUBLICATION") or "diagnostic", + "backend": backend, + "routing": os.environ.get("CX_ROUTING", "uniform"), + "phase": phase, + "ep": ep, + "eplb": _env_enabled("CX_EPLB"), + "hidden": _env_integer("CX_HIDDEN", 7168), + "topk": _env_integer("CX_TOPK", 8), + "experts": _env_integer("CX_EXPERTS", 256), + "samples_per_point": _env_integer("CX_SAMPLES_PER_POINT", 512), + "warmup_semantics": 
os.environ.get( + "CX_WARMUP_SEMANTICS", + "full-roundtrip-before-each-component-trial-point-v1", + ), + "ladder": ladder, + "timing": ( + f'{_env_integer("CX_ITERS", 8)}:{_env_integer("CX_TRIALS", 64)}:' + f'{_env_integer("CX_WARMUP", 32)}' + ), + "canonical": _env_enabled("CX_CANONICAL"), + "nodes": _env_integer("CX_NODES", _env_integer("SLURM_NNODES", 1)), + "gpus_per_node": gpus_per_node, + "scale_up_domain": _env_integer("CX_SCALE_UP_DOMAIN", gpus_per_node), + } + + +def _git_run_from_environment() -> dict[str, Any]: + def value(name: str) -> str | None: + return os.environ.get(name) or None + + return { + "run_id": value("GITHUB_RUN_ID"), + "run_attempt": value("GITHUB_RUN_ATTEMPT"), + "ref": value("GITHUB_REF_NAME") or value("GITHUB_REF"), + "source_sha": value("COLLECTIVEX_SOURCE_SHA") or value("GITHUB_SHA"), + "repo": value("GITHUB_REPOSITORY"), + "job": value("GITHUB_JOB"), + "artifact": value("COLLECTIVEX_ARTIFACT_NAME"), + } + + +def _allocation_factors_from_environment( + runner: str, git_run: dict[str, Any] +) -> dict[str, Any]: + return { + "artifact": git_run["artifact"], + "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID") or None, + "job": git_run["job"], + "repo": git_run["repo"], + "run_attempt": git_run["run_attempt"], + "run_id": git_run["run_id"], + "runner": runner, + "source_sha": git_run["source_sha"], + } + + +def make_terminal_from_environment( + *, backend: str, phase: str, return_code: int, failure_mode: str | None = None +) -> dict[str, Any]: + """Build a terminal document from the same exported case coordinates as run_ep.""" + mode = failure_mode or RETURN_CODE_FAILURE_MODES.get(return_code, "execution") + reason = RUNTIME_FAILURE_REASONS.get(mode) + if reason is None: + raise ContractError("runtime failure mode is not registered") + runner = os.environ.get("CX_RUNNER", "") + case = _terminal_case_from_environment(backend, phase) + case_factors = {"case": case, "profile": identity.V1_CASE_PROFILE, "sku": runner} + git_run = 
_git_run_from_environment() + control = os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None + return make_terminal_document( + allocation_factors=_allocation_factors_from_environment(runner, git_run), + attempt_ordinal=_env_integer("CX_ATTEMPT_ID", 1), + case=case, + case_factors=case_factors, + control_sha256=control, + failure_mode=mode, + generated_at=dt.datetime.now(dt.timezone.utc).isoformat(), + git_run=git_run, + reason=reason, + return_code=return_code, + source="runtime-emitter", + status="failed", + expected_case_id=os.environ.get("CX_CASE_ID") or None, + ) + + +def _write_document(path: str | os.PathLike[str], document: dict[str, Any]) -> None: + destination = Path(path) + destination.parent.mkdir(parents=True, exist_ok=True) + temporary = destination.with_name(destination.name + ".tmp") + temporary.write_text(json.dumps(document, indent=2, sort_keys=True) + "\n") + os.replace(temporary, destination) + + +def demote_raw_attempt(path: str | os.PathLike[str], return_code: int) -> dict[str, Any]: + """Replace a rank-zero raw result when the distributed command later fails.""" + destination = Path(path) + raw = strict_load(destination) + if not isinstance(raw, dict) or raw.get("format") != RAW_FORMAT: + raise ContractError("only a native raw attempt can be demoted") + ids = _obj(raw.get("identity"), "raw.identity") + required = { + "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", + "case_factors", "case_id", + } + if not required.issubset(ids): + raise ContractError("raw identity lacks terminal factors") + mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution") + git_run = _obj(_obj(raw.get("provenance"), "raw.provenance").get("git_run"), "raw.provenance.git_run") + terminal = make_terminal_document( + allocation_factors=ids["allocation_factors"], + attempt_ordinal=ids["attempt_ordinal"], + case=ids["case_factors"]["case"], + case_factors=ids["case_factors"], + control_sha256=os.environ.get("COLLECTIVEX_CONTROL_SHA256") or 
None, + failure_mode=mode, + generated_at=dt.datetime.now(dt.timezone.utc).isoformat(), + git_run=git_run, + reason=POST_EMIT_FAILURE_REASONS[mode], + return_code=return_code, + source="post-emit-command", + status="failed", + expected_case_id=ids["case_id"], + ) + artifact = raw.get("sample_artifact") or {} + sample_name = artifact.get("path") + if isinstance(sample_name, str) and Path(sample_name).name == sample_name: + destination.with_name(sample_name).unlink(missing_ok=True) + _write_document(destination, terminal) + return terminal + + +def validate_attempt_paths(paths: list[str]) -> int: + """Fully validate a result directory's attempts and paired sample artifacts.""" + if not paths or len(paths) != len(set(paths)): + raise ContractError("validate-many requires unique result paths") + sample_paths: set[Path] = set() + referenced_samples: set[Path] = set() + attempt_count = 0 + for raw_path in paths: + path = Path(raw_path).resolve() + document = strict_load(path) + if isinstance(document, dict) and document.get("format") == RAW_FORMAT: + document = load_raw_attempt(path) + referenced_samples.add(path.with_name(document["sample_artifact"]["path"])) + attempt_count += 1 + elif isinstance(document, dict) and document.get("format") == TERMINAL_FORMAT: + validate_terminal_document(document) + attempt_count += 1 + elif isinstance(document, dict) and document.get("format") == SAMPLES_FORMAT: + validate_samples_document(document) + sample_paths.add(path) + else: + raise ContractError(f"unknown result artifact {path.name}") + if sample_paths != referenced_samples: + raise ContractError("sample artifacts are missing, orphaned, or outside the validated set") + if attempt_count == 0: + raise ContractError("result set contains no native attempts") + return attempt_count + + +def validate_delivery( + paths: list[str], source_path: str, *, disposition: str | None = None +) -> int: + """Reconcile a shard or matrix disposition with its complete native attempt set.""" + 
source_file = Path(source_path).resolve() + source = strict_load(source_file) + if isinstance(source, dict) and source.get("format") == "collectivex.matrix.v1": + if disposition is None: + raise ContractError("matrix delivery validation requires a disposition") + wrappers = [ + item for item in source.get("requested_cases", []) + if isinstance(item, dict) and item.get("disposition") == disposition + ] + expected = { + item["case"]["case_id"]: (item["sku"], item["case"]) + for item in wrappers + } + expected_count = len(wrappers) + require_one_allocation = disposition == "unsupported" + elif isinstance(source, dict) and isinstance(source.get("cases"), list): + expected = { + case["case_id"]: (source.get("sku"), case) + for case in source["cases"] + } + expected_count = len(source["cases"]) + require_one_allocation = True + else: + raise ContractError("delivery source is not a matrix or shard control") + if not expected or len(expected) != expected_count: + raise ContractError("delivery source has empty or duplicate case coverage") + + validate_attempt_paths(paths) + attempts = [] + for raw_path in paths: + document = strict_load(raw_path) + if isinstance(document, dict) and document.get("format") in {RAW_FORMAT, TERMINAL_FORMAT}: + attempts.append(load_attempt(raw_path)) + by_case: dict[str, list[dict[str, Any]]] = {} + attempt_ids = set() + allocation_ids = set() + source_sha256 = hashlib.sha256(source_file.read_bytes()).hexdigest() + for document in attempts: + ids = document["identity"] + case_id = ids["case_id"] + if case_id not in expected or ids["attempt_id"] in attempt_ids: + raise ContractError("delivery contains an extra case or duplicate attempt") + attempt_ids.add(ids["attempt_id"]) + allocation_ids.add(ids["allocation_id"]) + sku, scheduled = expected[case_id] + scheduled_case = {key: value for key, value in scheduled.items() if key != "case_id"} + if ids["case_factors"] != { + "case": scheduled_case, "profile": identity.V1_CASE_PROFILE, "sku": sku + }: 
+ raise ContractError("delivery attempt differs from its scheduled case") + factors = ids["allocation_factors"] + expected_environment = { + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID"), + "job": os.environ.get("GITHUB_JOB"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"), + "run_id": os.environ.get("GITHUB_RUN_ID"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"), + } + expected_runner = ( + "capability-resolver" + if document["format"] == TERMINAL_FORMAT + and document["provenance"]["source"] == "matrix-capability-resolver" + else sku + ) + if any( + value is not None and factors[field] != value + for field, value in expected_environment.items() + ) or factors["runner"] != expected_runner: + raise ContractError("delivery allocation factors differ from the workflow") + if document["format"] == TERMINAL_FORMAT: + control = document["provenance"]["control_sha256"] + if control != source_sha256: + raise ContractError("terminal outcome does not reference its exact control document") + by_case.setdefault(case_id, []).append(document) + if set(by_case) != set(expected): + raise ContractError("delivery case coverage is incomplete") + for case_id, documents in by_case.items(): + ordinals = sorted(document["identity"]["attempt_ordinal"] for document in documents) + if ordinals != list(range(1, len(ordinals) + 1)): + raise ContractError(f"delivery attempt ordinals are not contiguous for {case_id}") + if require_one_allocation and len(allocation_ids) != 1: + raise ContractError("one shard must use exactly one allocation identity") + return len(attempts) + + +def main() -> int: + parser = argparse.ArgumentParser(description="CollectiveX native attempt contracts") + subparsers = parser.add_subparsers(dest="command", required=True) + probe = subparsers.add_parser("probe") + probe.add_argument("path") + 
probe.add_argument("--status", choices=("success", "invalid")) + emit = subparsers.add_parser("emit-terminal") + emit.add_argument("--out", required=True) + emit.add_argument("--backend", required=True) + emit.add_argument("--phase", required=True, choices=("decode", "prefill")) + emit.add_argument("--return-code", required=True, type=int) + emit.add_argument("--failure-mode") + demote = subparsers.add_parser("demote") + demote.add_argument("path") + demote.add_argument("--return-code", required=True, type=int) + validate_many = subparsers.add_parser("validate-many") + validate_many.add_argument("paths", nargs="+") + quarantine = subparsers.add_parser("quarantine-invalid") + quarantine.add_argument("path") + delivery = subparsers.add_parser("validate-delivery") + delivery.add_argument("--source", required=True) + delivery.add_argument("--disposition") + delivery.add_argument("paths", nargs="+") + args = parser.parse_args() + try: + if args.command == "probe": + document = load_attempt(args.path) + if args.status is None: + return 0 + if document.get("format") != RAW_FORMAT: + return 1 + outcome = document["outcome"] + validity = outcome.get("validity") + return int( + not ( + isinstance(validity, dict) + and validity.get("execution_status") == "complete" + and outcome.get("status") == args.status + ) + ) + if args.command == "emit-terminal": + document = make_terminal_from_environment( + backend=args.backend, + phase=args.phase, + return_code=args.return_code, + failure_mode=args.failure_mode, + ) + _write_document(args.out, document) + print(f"preserved terminal outcome ({document['outcome']['failure_mode']})") + return 0 + if args.command == "validate-many": + print(f"validated {validate_attempt_paths(args.paths)} native attempts") + return 0 + if args.command == "quarantine-invalid": + quarantine_invalid_attempt(args.path) + return 0 + if args.command == "validate-delivery": + print( + f"validated {validate_delivery(args.paths, args.source, 
disposition=args.disposition)} " + "delivery attempts" + ) + return 0 + demote_raw_attempt(args.path, args.return_code) + return 0 + except (ContractError, identity.IdentityError, OSError, ValueError) as exc: + print(f"terminal contract error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md new file mode 100644 index 0000000000..f68ef89a5c --- /dev/null +++ b/experimental/CollectiveX/docs/methodology.md @@ -0,0 +1,256 @@ +# CollectiveX EP v1 Contract + +
+ +**English** | [中文](./methodology_zh.md) + +
+ +This document defines new CollectiveX results. Historical run notes are evidence, not contract. + +## Product Boundary + +CollectiveX is a communication microbenchmark for: + +- comparing EP libraries on one chip/topology; +- comparing EP latency and logical payload bandwidth across systems under the same workload; and +- exposing unsupported, failed, invalid, and unstable evidence without contaminating decisions. + +It does not predict serving throughput without a separate correlation study. + +## Matrix + +The promoted workload is `deepseek-v3-v1`: hidden 7168, top-k 8, 256 routed experts, BF16 dispatch +and combine, normal mode, packed placement, backend-tuned resources, and +`layout-and-dispatch-v1`. + +- `ep-core-v1`: uniform routing; decode T=1..128 powers of two; prefill T=256/512. +- `ep-routing-v1`: Zipf with EPLB off/on; decode T=128; prefill T=512. +- Current runnable surface: 38 cells, 228 cases, 532 token points before repeats. + +Unsupported combinations are terminal outcomes, not silently skipped coverage. DeepEP V2 is the +`ElasticBuffer` introduced by PR #605, pinned with upstream PR #630's minimal pure-scale-up fix. +Current V2 cases request NCCL Device API LSA because their world size does not exceed the declared +scale-up domain, then fail closed unless NCCL's realized LSA team covers the full EP world. GIN is +reserved for a separately identified true scale-out cohort. Source-declared NVIDIA capabilities +remain unvalidated until GPU outcomes pass the native oracle and publisher gates. Removed axes +include `[cl]`, `[rv]`, LL, +quantization, alternate activation/routing profiles, +uneven allocation, placement permutations, model envelopes, and scaling. +FlashInfer is excluded from v1 after repeatable intermittent execution failures; those failures are +not converted into planned-unsupported coverage. 
+MoRI AsyncLL and intranode paths publish distinct kernel generations rather than masquerading as the +same controlled implementation in cross-chip cohorts. + +## Workload Identity + +One canonical workload is generated over the global token batch and sliced by source rank. Expert +indices and gate weights are serialized. Activations use a versioned integer counter formula whose +BF16 values are exact across runtimes; its full identity is bound into the manifest. The manifest +also binds shape/EP coordinates and oracle version. SHA-256 covers canonical bytes and parameters; +library RNG regeneration is not proof of identity. + +Routing traffic distinguishes: + +- token-expert assignments, which determine expert compute load; and +- rank-deduplicated token payload copies, which determine EP activation traffic. + +Adapters may not generate routing or reinterpret one quantity as the other. + +## Measurement + +`layout-and-dispatch-v1` times dispatch layout plus communication. Expert-output staging is outside +isolated combine timing and inside measured paired roundtrip. Each component declares availability, +origin, start/end states, stage scope, and sample count. A paired-only API reports null isolated +components. Combine is activation-only for every adapter: dispatch gate weights are verified but are +not returned over the timed combine path. `isolated_sum` is derived and never used for throughput or +recommendations. + +Every measured component uses `fixed-512-v1`: + +- 64 trials x 8 timed iterations = 512 observations; +- 32 synchronized full dispatch-stage-combine warmups before each available measured component at + every trial/point; +- roundtrip first, then isolated dispatch and combine, with a fixed per-phase conditioning ladder; and +- per-iteration maximum latency across ranks before nearest-rank p50/p90/p95/p99. + +Measured roundtrip p99 is the headline latency. Retries remain separate attempts; a later success +does not erase earlier failures. 
Decode and prefill identify the serving regime represented by one +MoE-layer collective; they do not change the timed primitive at an otherwise identical shape. + +The NCCL/RCCL reference is an end-to-end Python adapter, not a bare fabric primitive. Its dispatch +boundary includes layout, count exchange, a device-to-host split synchronization, fresh receive +allocation, and four payload/metadata all-to-all calls; activation-only combine adds one all-to-all plus +scatter/reduction. Its p99 therefore measures the complete reference-adapter boundary and can be +host/scheduler-sensitive. It is useful for portable system controls but must not be labeled fabric, +link, bus, or single-collective latency. + +The versioned conditioning and EPLB planner contracts (reference trace, redundant count, and +placement/remap version) are part of scheduled and evidence identity. + +Logical payload bandwidth is: + +`logical_payload_bytes / measured_latency_seconds` + +Payload bytes use rank-deduplicated activations plus required scale bytes at the named boundary. +They exclude expert metadata, padding, and backend buffer capacity. Algorithm bandwidth, bus +bandwidth, wire utilization, and physical-link utilization are not published without a defined +primitive model or transport counters. Logical bandwidth must never be labeled physical bandwidth. +Published payload and token rates are named `rate_at_latency_percentile`: bytes or tokens divided by +the matching latency percentile. They are lower-tail service rates at p99 latency, not p99 +percentiles of an inverted rate distribution. + +## Correctness + +An implementation-independent oracle uses an expert-specific deterministic transform so wrong +expert routing cannot pass an identity roundtrip. For every rank and point it verifies: + +1. destination rank/expert, source token, multiplicity, gate weight, and receive counts; +2. dispatched payload and metadata before timing; +3. combined output before timing; +4. 
unchanged semantic inputs through all timed samples; and +5. dispatched payload/metadata and combined output again after timing. + +Every v1 adapter uses activation-only, unweighted rank-sum combine. The oracle builds each rank's +gate-weighted expert aggregate before combine, independently derives `sum(gate * expert(token))`, +and checks every element with recorded `rtol=0.05` and `atol=0.02`. Any failed rank or point makes +the case ineligible. +Pre/post dispatch evidence is hashed in canonical source-token order. Native receive slots may be +assigned nondeterministically, so physical receive order is not treated as a correctness property. + +## Native Result + +One raw case document uses `format: "collectivex.ep.v1"`, rejects unknown fields, and contains: + +- `case`: stable case ID, suite, required tier, and coordinate; +- `workload`: canonical identity and logical MoE shape; +- `measurement`: sampling, component states, timing, and byte accounting; +- `implementation`: instantiated class/API, pinned source, loaded libraries, and resources; +- `topology`: requested and realized SKU, devices, placement, scale-up domain, and transport; +- `provenance`: source SHA, image/squash hashes, allocation, run, and attempt; +- `rows`: point latency, byte accounting, token rate, correctness, load, fanout, and anomaly evidence; and +- `outcome`: `success`, `failed`, `invalid`, `diagnostic`, or `unsupported`, with reasons. + +Raw result documents and exact samples pass through transient GitHub delivery artifacts before the +publisher archives them in the private bundle; they never enter the public tree. Private environment +details remain in local mode-0600 logs and ignored operator notes; they are never archived or +published. Every expected case has one terminal selected outcome while every attempt remains retained. 
+ +## Identity And Comparisons + +Canonical JSON produces three full SHA-256 IDs: + +- `series_id`: all locked factors except token coordinate and repeat allocation; +- `point_id`: `series_id` plus token coordinate; and +- `evidence_id`: `point_id` plus allocation/run/attempt/sample checksum. + +Locked factors include workload bytes, measurement and sampling contract, resources, realized +topology, implementation/build, loaded libraries, image/squash, runtime, and source SHA. +Deferred code generation is captured before measurement and recaptured afterward. DeepEP V2 uses a +fixed NVCC random seed and binds final cache keys plus generated-source and executable-SASS hashes; +raw CUBIN bytes remain private diagnostics. Hybrid binds its realized auto-tuned config and complete +kernel-key set while retaining rank-local shared-object hashes as private diagnostics. Locally built +extension hashes are diagnostic; their pinned source trees, build recipe, runtime, and dependencies +remain series-bound. +The series identity includes the case ID, which binds the complete scheduled token ladder and the +frozen percentile, rank-reduction, conditioning, warmup, and correctness semantics. + +A controlled comparison declares one contrast: + +- `library`: backend implementation and its tuned resource profile may differ; the realized system, + workload, EP, resource policy, source, and measurement remain matched; +- `chip`: a controlled platform contrast. The full realized system/topology and tuned resource + profile may differ while workload, EP, placement class, resource policy, backend lineage, source, + and measurement remain matched. It is not a silicon-only comparison; +- `system`: all hardware/backend differences stay visible while workload, EP, and measurement match; +- `routing`: routing distribution/EPLB differs while the static implementation build/generator, + system, model shape, resource profile, and measurement remain matched. 
Uniform and Zipf without + EPLB reuse the same generated implementation; EPLB's physical-expert/JIT configuration remains an + explicit treatment difference. + +Any undeclared mismatch rejects the overlay. Chip/system results describe measured systems, not +silicon alone. + +## Evidence Policy + +Capability declarations say what may be attempted; artifacts determine evidence status. Promotion +requires exact expected coverage with no missing, extra, duplicate, malformed, or heterogeneous +case. Public coverage preserves each matrix disposition; promotion requires every runnable case to +succeed and every planned-unsupported case to remain unsupported in every selected run. Only the +pinned canonical full-v1 matrix, with a decision-grade library, chip, system, and routing cohort, +may advance `dev-latest`; partial matrices remain diagnostic. The full-matrix digest intentionally +pins the exact workflow shard grouping as well as the requested cases, so changing `--max-cases` +or the SKU round-robin scheduling order produces diagnostic-only runs even when case coverage is +unchanged. Superseded retries, +planned-unsupported outcomes, and unstable comparison cohorts may render diagnostically but cannot +rank or recommend; every successful required series in a promoted dataset remains decision-grade. +Any failed, invalid, or diagnostic retry of a runnable case blocks promotion even if a later retry +succeeds. Routing cohorts are comparable-experimental sensitivities and never produce configuration +recommendations; official library/platform/system cohorts own actionable recommendations. + +A point becomes decision-grade only after three independent workflow runs and allocation IDs pass +correctness, identity, provenance, tail gates, p50/p99 repeat-stability thresholds, and stable ordering. The +publisher, not the frontend, computes eligibility, controlled cohorts, sensitivity pairs, and +recommendations. 
+ +## Isolated Artifact Store + +Development uses one self-hosted persistent filesystem. There is no Vercel storage, GCP, Neon, +managed database, or managed object store. + +```text +$COLLECTIVEX_STORE_ROOT/ + private/incoming/ # write-once downloaded GHA attempts + private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums + private/quarantine/ # rejected attempts plus machine-readable reasons + public/datasets/<dataset-digest>/ # immutable sanitized frontend datasets + public/channels/ # small atomic pointers: latest-attempt, dev-latest + locks/ +``` + +Private and public trees use separate permissions. JSON manifests and checksums are authoritative; +a rebuildable catalog is only an index. GitHub artifacts are transient delivery input. + +Container tags are checked against pinned registry digests. Enroot imports use a fixed +`SOURCE_DATE_EPOCH` and versioned cache generation; every mounted squash is freshly hashed into +series identity. Image-provided DeepEP is also checked against exact per-architecture wheel and +installed-file fingerprints, so a stale cache cannot inherit the pinned source identity. +Source-built DeepEP V2 uses a separate mode-0700 cluster-local cache mounted only as `/cx-cache`. +Its content key binds a versioned build recipe, verified image digest, CPU/GPU architecture, +upstream source trees, and pinned build dependencies. The cache is never an artifact or publisher +input; per-execution source/results stages remain isolated and disposable, and marker plus runtime +probes fail closed before reuse. The runner UID is inside the trusted cluster boundary: this cache +guards against stale or accidental mutation, not hostile same-UID jobs. Only an unpublished partial +build may be reset automatically; a published cache that fails integrity or runtime checks is left +intact and rejected so a concurrent allocation cannot lose files it is using. + +Publication is fail-closed: + +1. 
acquire an exclusive filesystem lock and stage on the destination filesystem; +2. archive source bytes before parsing; +3. require the exact matrix-declared artifact set and reject every unconsumed archive member; +4. validate strict schemas, privacy, checksums, identities, timing, and exact matrix outcomes; +5. write checksums and `COMPLETE`, fsync, then atomically rename the private bundle; +6. build and validate the sanitized content-addressed dataset, fsync, then atomically rename it; +7. atomically replace `dev-latest.json` only when every promotion gate passes. + +Rejected attempts may update `latest-attempt` but never `dev-latest`. Channel responses use +`no-cache`; immutable datasets use content hashes and long-lived caching. A same-host read-only HTTP +route in the InferenceX frontend exposes only the two channel documents and digest-addressed +datasets under `public/`; it rejects incomplete objects, directory listing, and client-supplied +filesystem paths. + +`publisher.py ingest` accepts the exact matrix plus one `--artifact` directory or ZIP per GitHub +artifact. `promote` accepts explicit immutable bundle IDs. Default `verify` requires +`latest-attempt`; it also verifies `dev-latest` when present, while an explicit +`--channel dev-latest` requires it. The frontend process receives the same absolute, +non-symlinked `COLLECTIVEX_STORE_ROOT` and performs the only HTTP serving. + +The frontend fetches the channel pointer, validates it at runtime, resolves the immutable dataset, +verifies its digest/format, and renders terminal coverage. It never invents missing values, selects +retries, or recomputes decision eligibility. + +## Legacy Data + +Numeric schemas 3-5 are outside the v1 publisher and frontend reader. They remain historical +diagnostic evidence and cannot seed `dev-latest` or drive v1 decisions. 
diff --git a/experimental/CollectiveX/docs/methodology_zh.md b/experimental/CollectiveX/docs/methodology_zh.md new file mode 100644 index 0000000000..c9124dc4e4 --- /dev/null +++ b/experimental/CollectiveX/docs/methodology_zh.md @@ -0,0 +1,247 @@ +# CollectiveX EP v1 契约 + +
+ +[English](./methodology.md) | **中文** + +
+ +本文档定义新的 CollectiveX 结果。历史运行笔记是 evidence,不是 contract。 + +## 产品边界 + +CollectiveX 是通信 microbenchmark,用于: + +- 在同一 chip/topology 上比较 EP libraries; +- 在相同 workload 下比较不同系统的 EP latency 和 logical payload bandwidth; +- 展示 unsupported、failed、invalid 和 unstable evidence,同时避免污染决策。 + +若没有单独的 correlation study,它不能预测 serving throughput。 + +## 矩阵 + +提升后的 workload 为 `deepseek-v3-v1`:hidden 7168、top-k 8、256 routed experts、BF16 +dispatch 和 combine、normal mode、packed placement、backend-tuned resources,以及 +`layout-and-dispatch-v1`。 + +- `ep-core-v1`:uniform routing;decode T=1..128 的 2 次幂;prefill T=256/512。 +- `ep-routing-v1`:Zipf,EPLB off/on;decode T=128;prefill T=512。 +- 当前可运行范围:38 cells、228 cases、重复前 532 token points。 + +Unsupported combinations 是 terminal outcomes,不会被静默跳过。DeepEP V2 指 PR #605 +引入的 `ElasticBuffer`,并固定使用 upstream PR #630 的最小纯 scale-up 修复。当前 V2 cases +的 world size 均未超过声明的 scale-up domain,因此请求 NCCL Device API LSA;若 NCCL +实际建立的 LSA team 未覆盖整个 EP world,则直接失败。GIN 只用于单独标识的真正 scale-out +cohort。其 source 声明的 NVIDIA capabilities 在 GPU outcomes 通过 native oracle 和 publisher +gates 前仍为 unvalidated。已移除的轴包括 `[cl]`、`[rv]`、LL、 +quantization、alternate activation/routing profiles、uneven allocation、placement +permutations、model envelopes 和 scaling。 +FlashInfer 因可重复出现的间歇性执行失败而排除在 v1 外;这些失败不会转为 planned-unsupported +coverage。 +MoRI AsyncLL 和 intranode paths 发布不同的 kernel generations,不会在 cross-chip cohorts 中 +伪装成相同的 controlled implementation。 + +## Workload 身份 + +一个 canonical workload 在 global token batch 上生成,再按 source rank 切分。Expert indices +和 gate weights 会序列化。Activations 使用带版本的整数计数器公式,其 BF16 值在不同 runtime +中精确一致;完整身份绑定到 manifest。Manifest 还绑定 shape/EP coordinates 和 oracle version。 +SHA-256 覆盖 canonical bytes 和 parameters;重新生成 library RNG 不能证明身份一致。 + +Routing traffic 区分: + +- token-expert assignments,决定 expert compute load; +- rank-deduplicated token payload copies,决定 EP activation traffic。 + +Adapters 不得生成 routing,也不得将两种量相互解释。 + +## 测量 + +`layout-and-dispatch-v1` 计时 dispatch layout 加 
communication。Expert-output staging 不计入 +isolated combine timing,但计入被测 paired roundtrip。每个 component 声明 availability、origin、 +start/end states、stage scope 和 sample count。仅有 paired API 时,isolated components 报 null。 +所有 adapter 的 combine 均采用 activation-only 边界:dispatch gate weights 会接受校验,但不会 +通过被测 combine 路径返回。`isolated_sum` 为派生值,不用于 throughput 或 recommendations。 + +每个被测 component 均使用 `fixed-512-v1`: + +- 64 trials x 8 timed iterations = 512 observations; +- 每个 trial/point 的每个可用被测 component 前,执行 32 次同步完整 + dispatch-stage-combine warmups; +- 先测 roundtrip,再测 isolated dispatch 和 combine,并使用固定的 per-phase conditioning ladder; +- 每次 iteration 先取跨 rank 最大 latency,再以 nearest-rank 计算 p50/p90/p95/p99。 + +被测 roundtrip p99 是 headline latency。Retries 保持为独立 attempts;后续成功不会抹除早期失败。 +Decode 和 prefill 表示一个 MoE-layer collective 所代表的 serving regime;在其他 shape 相同时, +它们不会改变 timed primitive。 + +NCCL/RCCL reference 是 end-to-end Python adapter,而不是 bare fabric primitive。其 dispatch +boundary 包含 layout、count exchange、device-to-host split synchronization、fresh receive +allocation,以及四次 payload/metadata all-to-all;activation-only combine 还包含一次 all-to-all 和 +scatter/reduction。因此其 p99 测量完整 reference-adapter boundary,可能对 host/scheduler 敏感。 +它可作为 portable system control,但不得标记为 fabric、link、bus 或 single-collective latency。 + +带版本的 conditioning 和 EPLB planner contracts(reference trace、redundant count 和 +placement/remap version)属于 scheduled 和 evidence identity。 + +Logical payload bandwidth 为: + +`logical_payload_bytes / measured_latency_seconds` + +Payload bytes 使用命名边界上的 rank-deduplicated activations 加必需 scale bytes,不包含 expert +metadata、padding 和 backend buffer capacity。若没有定义 primitive model 或 transport counters, +不发布 algorithm bandwidth、bus bandwidth、wire utilization 或 physical-link utilization。 +Logical bandwidth 绝不能标为 physical bandwidth。已发布 payload 和 token rates 命名为 +`rate_at_latency_percentile`:bytes 或 tokens 除以对应 latency percentile。它们是 p99 latency +下的 lower-tail service rates,不是 inverted rate 
distribution 的 p99 percentiles。 + +## 正确性 + +与实现无关的 oracle 使用 expert-specific deterministic transform,使错误 expert routing 无法 +通过 identity roundtrip。它对每个 rank 和 point 验证: + +1. destination rank/expert、source token、multiplicity、gate weight 和 receive counts; +2. timing 前的 dispatched payload 和 metadata; +3. timing 前的 combined output; +4. 所有 timed samples 期间 semantic inputs 不变; +5. timing 后再次验证 dispatched payload/metadata 和 combined output。 + +v1 的所有 adapter 均使用 activation-only、unweighted rank-sum combine。Oracle 在 combine 前 +构造每个 rank 的 gate-weighted expert aggregate,独立计算 `sum(gate * expert(token))`, +并使用已记录的 `rtol=0.05` 和 `atol=0.02` 检查每个 element。任一 rank 或 +point 失败都会使 case 不合格。Pre/post dispatch evidence 按 +canonical source-token order 计算 hash。Native receive slots 可能非确定性分配,因此 physical +receive order 不作为 correctness property。 + +## Native 结果 + +单个 raw case document 使用 `format: "collectivex.ep.v1"`,拒绝未知 fields,并包含: + +- `case`:稳定 case ID、suite、required tier 和 coordinate; +- `workload`:canonical identity 和 logical MoE shape; +- `measurement`:sampling、component states、timing 和 byte accounting; +- `implementation`:实例化 class/API、固定 source、loaded libraries 和 resources; +- `topology`:requested 和 realized SKU、devices、placement、scale-up domain 和 transport; +- `provenance`:source SHA、image/squash hashes、allocation、run 和 attempt; +- `rows`:point latency、byte accounting、token rate、correctness、load、fanout 和 anomaly evidence; +- `outcome`:`success`、`failed`、`invalid`、`diagnostic` 或 `unsupported`,以及 reasons。 + +Raw result documents 和 exact samples 会先经过临时 GitHub delivery artifacts,再由 publisher +归档到 private bundle;它们不会进入 public tree。Private environment details 只保留在本地 +mode-0600 logs 和忽略的 operator notes 中;不会归档或发布。每个 expected case 有一个 terminal +selected outcome,同时保留每次 attempt。 + +## 身份与比较 + +Canonical JSON 生成三个完整 SHA-256 IDs: + +- `series_id`:除 token coordinate 和 repeat allocation 外的所有 locked factors; +- `point_id`:`series_id` 加 token coordinate; +- `evidence_id`:`point_id` 加 
allocation/run/attempt/sample checksum。 + +Locked factors 包括 workload bytes、measurement 和 sampling contract、resources、realized +topology、implementation/build、loaded libraries、image/squash、runtime 和 source SHA。 +Deferred code generation 会在 measurement 前捕获,并在之后再次捕获。DeepEP V2 使用固定的 +NVCC random seed,并绑定最终 cache keys、generated-source hashes 与 executable-SASS hashes; +raw CUBIN bytes 仅保留为 private diagnostics。Hybrid 绑定实际自动调优配置与完整 kernel-key +set,同时将各 rank 的 shared-object hashes 仅保留为 private diagnostics。本地构建的 extension +hashes 属于 diagnostic;其固定 source trees、build recipe、runtime 与 dependencies 仍绑定到 +series。 +Series identity 包含 case ID;case ID 绑定完整 scheduled token ladder,以及固定的 percentile、 +rank-reduction、conditioning、warmup 和 correctness semantics。 + +Controlled comparison 只声明一个 contrast: + +- `library`:backend implementation 及其 tuned resource profile 可以不同;realized system、 + workload、EP、resource policy、source 和 measurement 必须匹配; +- `chip`:受控 platform contrast。完整 realized system/topology 和 tuned resource profile 可以不同, + 但 workload、EP、placement class、resource policy、backend lineage、source 和 measurement 必须 + 匹配。它不是 silicon-only comparison; +- `system`:保留所有 hardware/backend 差异,同时匹配 workload、EP 和 measurement; +- `routing`:routing distribution/EPLB 可以不同,但 static implementation build/generator、system、 + model shape、resource profile 和 measurement 必须匹配。未启用 EPLB 的 Uniform 和 Zipf 复用 + 同一 generated implementation;EPLB 的 physical-expert/JIT configuration 是显式 treatment + difference。 + +任何未声明的 mismatch 都会拒绝 overlay。Chip/system results 描述 measured systems,而非仅描述 +silicon。 + +## Evidence 策略 + +Capability declarations 说明可以尝试什么;artifacts 决定 evidence status。Promotion 要求完整的 +expected coverage,不能有 missing、extra、duplicate、malformed 或 heterogeneous case。Public +coverage 保留每个 matrix disposition;promotion 要求每个 runnable case 在所有 selected runs 中 +成功,且每个 planned-unsupported case 始终为 unsupported。只有固定 canonical full-v1 matrix, +且具有 decision-grade library、chip、system 和 routing cohort,才能推进 
`dev-latest`;partial +matrices 仍为 diagnostic。Full-matrix digest 有意绑定精确 workflow shard grouping 和 requested +cases,因此即使 case coverage 不变,修改 `--max-cases` 或 SKU round-robin scheduling order 也只 +会产生 diagnostic-only runs。Superseded retries、planned-unsupported outcomes 和 unstable +comparison cohorts 可以用于诊断展示,但不能排名或推荐;promoted dataset 中每个成功的 required +series 都必须保持 decision-grade。Runnable case 的任何 failed、invalid 或 diagnostic retry 都会 +阻止 promotion,即使后续 retry 成功。Routing cohorts 是 comparable-experimental sensitivities, +不会产生 configuration recommendations;official library/platform/system cohorts 才能产生可执行 +recommendations。 + +一个 point 只有在三个独立 workflow runs 和 allocation IDs 均通过 correctness、identity、 +provenance、tail gates、p50/p99 repeat-stability thresholds 和 stable ordering 后才成为 +decision-grade。Eligibility、controlled cohorts、sensitivity pairs 和 recommendations 由 +publisher 而非 frontend 计算。 + +## 隔离产物存储 + +开发阶段使用一个 self-hosted persistent filesystem,不使用 Vercel storage、GCP、Neon、 +managed database 或 managed object store。 + +```text +$COLLECTIVEX_STORE_ROOT/ + private/incoming/ # write-once downloaded GHA attempts + private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums + private/quarantine/ # rejected attempts plus machine-readable reasons + public/datasets/<dataset-digest>/ # immutable sanitized frontend datasets + public/channels/ # small atomic pointers: latest-attempt, dev-latest + locks/ +``` + +Private 和 public trees 使用不同 permissions。JSON manifests 和 checksums 是权威记录;可重建 +catalog 仅为 index。GitHub artifacts 是临时 delivery input。 + +Container tags 会与固定 registry digests 核对。Enroot imports 使用固定 +`SOURCE_DATE_EPOCH` 和 versioned cache generation;每个 mounted squash 都重新计算 hash 并纳入 +series identity。Image-provided DeepEP 也按精确 per-architecture wheel 和 installed-file +fingerprints 检查,因此 stale cache 不能继承固定 source identity。 +Source-built DeepEP V2 使用独立的 mode-0700 cluster-local cache,并且只以 `/cx-cache` 挂载。 +其 content key 绑定版本化 build recipe、verified image digest、CPU/GPU
architecture、 +upstream source trees 和固定 build dependencies。该 cache 既不是 artifact,也不是 publisher +input;每次执行的 source/results stage 仍然隔离且可丢弃,并在复用前以 marker 和 runtime probe +fail closed。Runner UID 属于受信任的 cluster boundary:该 cache 用于防止 stale 或意外修改, +不防御恶意的同 UID job。只有从未发布的 partial build 才能自动重置;已发布 cache 一旦未通过 +integrity 或 runtime 检查,将保持原样并被拒绝,避免并发 allocation 正在使用的文件被删除。 + +Publication 采用 fail-closed: + +1. 获取 exclusive filesystem lock,并在 destination filesystem 上 stage; +2. 解析前归档 source bytes; +3. 要求精确 matrix-declared artifact set,并拒绝每个未消费 archive member; +4. 验证 strict schemas、privacy、checksums、identities、timing 和精确 matrix outcomes; +5. 写入 checksums 和 `COMPLETE`,fsync,然后原子 rename private bundle; +6. 构建并验证 sanitized content-addressed dataset,fsync,然后原子 rename; +7. 仅在全部 promotion gates 通过后原子替换 `dev-latest.json`。 + +Rejected attempts 可以更新 `latest-attempt`,但不能更新 `dev-latest`。Channel responses 使用 +`no-cache`;immutable datasets 使用 content hashes 和 long-lived caching。InferenceX 前端中的 +same-host read-only HTTP route 只暴露 `public/` 下两个 channel documents 和 digest-addressed +datasets;它拒绝 incomplete objects、directory listing 和 client-supplied filesystem paths。 + +`publisher.py ingest` 接受精确 matrix,并为每个 GitHub artifact 接受一个 `--artifact` directory +或 ZIP。`promote` 接受显式 immutable bundle IDs。默认 `verify` 要求 `latest-attempt`;若存在 +`dev-latest` 也会验证,而显式 `--channel dev-latest` 则要求其存在。Frontend process 接收相同的 +absolute、non-symlinked `COLLECTIVEX_STORE_ROOT`,并执行唯一的 HTTP serving。 + +Frontend 获取 channel pointer,在 runtime 验证它,解析 immutable dataset,验证其 +digest/format,并渲染 terminal coverage。它不会虚构 missing values、选择 retries,或重新计算 +decision eligibility。 + +## 历史数据 + +Numeric schemas 3-5 不在 v1 publisher 和 frontend reader 范围内。它们仍是 historical +diagnostic evidence,不能作为 `dev-latest` 初始数据或驱动 v1 decisions。 diff --git a/experimental/CollectiveX/identity.py b/experimental/CollectiveX/identity.py new file mode 100644 index 0000000000..f3cec953a3 --- /dev/null +++ b/experimental/CollectiveX/identity.py @@ -0,0 +1,163 @@ 
#!/usr/bin/env python3
"""Canonical, cross-runtime identities for CollectiveX v1.

An identity is a typed prefix followed by the SHA-256 hex digest of a compact,
key-sorted JSON document.  Payloads are restricted to a small portable subset
(``None``, booleans, printable-ASCII strings, integers of magnitude at most
``MAX_SAFE_INTEGER``, lists, and string-keyed dicts); anything else raises
``IdentityError`` instead of being serialized.
"""
from __future__ import annotations

import hashlib
import json
import re
from typing import Any

# Version number embedded in every hashed identity body.
IDENTITY_VERSION = 1
# Largest integer magnitude accepted in an identity payload (2**53 - 1).
MAX_SAFE_INTEGER = (1 << 53) - 1
# Identity kind -> textual prefix of the resulting identifier.
PREFIXES = {
    "case": "cxcase-v1-",
    "workload": "cxwork-v1-",
    "series": "cxseries-v1-",
    "point": "cxpoint-v1-",
    "evidence": "cxevidence-v1-",
    "allocation": "cxallocation-v1-",
    "attempt": "cxattempt-v1-",
}
V1_CASE_PROFILE = {
    "activation_generator": "collectivex-activation-counter-v3",
    "activation_profile": "canonical-counter-source-v3",
    "combine_dtype": "bf16",
    "combine_quant_mode": "none",
    "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
    "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
    "contract": "layout-and-dispatch-v1",
    "dtype": "bf16",
    "eplb_planner": "greedy-rank-major-v1",
    "eplb_redundant_experts": 32,
    "eplb_reference_tokens_per_rank": 2048,
    "mode": "normal",
    "oracle_contract": "expert-specific-transform-v1",
    "oracle_tolerances": "rtol=0.05,atol=0.02",
    "placement": "packed",
    "percentile_method": "nearest-rank",
    "rank_reduction": "cross-rank-max-per-iteration",
    "resource_mode": "tuned",
    "routing_generator": "collectivex-routing-counter-v3",
    "sampling_contract": "fixed-512-v1",
    "seed": 67,
}


class IdentityError(ValueError):
    """An identity payload cannot be represented consistently across runtimes."""


def _require_printable_ascii(text: str, message: str) -> None:
    """Raise ``IdentityError(message)`` unless every character is in 0x20..0x7E."""
    if any(ord(character) < 0x20 or ord(character) > 0x7E for character in text):
        raise IdentityError(message)


def _validate(value: Any, path: str = "$", _ancestors: tuple[int, ...] = ()) -> None:
    """Check that *value* only uses the portable identity value subset.

    Args:
        value: Payload (or nested element) to check.
        path: JSON-path-like location of *value*, used in error messages.
        _ancestors: ``id()`` of every container enclosing *value*; internal,
            used to reject containers that contain themselves.

    Raises:
        IdentityError: On a non-printable/non-ASCII string or key, an integer
            beyond ``MAX_SAFE_INTEGER`` in magnitude, a non-string dict key, a
            self-referential container, or any other type (floats, tuples, ...).
    """
    if value is None or isinstance(value, bool):
        return
    if isinstance(value, str):
        _require_printable_ascii(
            value, f"{path}: string must contain printable ASCII only"
        )
        return
    # Exact type on purpose: bool was handled above and int subclasses fall
    # through to the "unsupported" error at the bottom.
    if type(value) is int:
        if abs(value) > MAX_SAFE_INTEGER:
            raise IdentityError(f"{path}: integer exceeds the cross-runtime safe range")
        return
    if isinstance(value, (list, dict)):
        # A container reachable from itself has no finite JSON form; report it
        # as an identity problem instead of recursing until the stack runs out.
        if id(value) in _ancestors:
            raise IdentityError(f"{path}: container contains itself")
        enclosing = _ancestors + (id(value),)
        if isinstance(value, list):
            for index, item in enumerate(value):
                _validate(item, f"{path}[{index}]", enclosing)
            return
        for key, item in value.items():
            if not isinstance(key, str):
                raise IdentityError(f"{path}: object key is not a string")
            _require_printable_ascii(
                key, f"{path}: object key must contain printable ASCII only"
            )
            _validate(item, f"{path}.{key}", enclosing)
        return
    raise IdentityError(f"{path}: unsupported identity value {type(value).__name__}")


def canonical_bytes(value: Any) -> bytes:
    """Return compact UTF-8 JSON after enforcing the portable value subset.

    Keys are sorted and no insignificant whitespace is emitted.

    Raises:
        IdentityError: If *value* is outside the portable subset, contains
            itself, or is nested too deeply to traverse.
    """
    try:
        _validate(value)
        encoded = json.dumps(
            value,
            ensure_ascii=False,
            allow_nan=False,
            sort_keys=True,
            separators=(",", ":"),
        )
    except RecursionError as exc:
        # Acyclic but extremely deep payloads must fail as identity errors too.
        raise IdentityError("$: identity value is nested too deeply") from exc
    return encoded.encode("utf-8")


def digest(kind: str, value: Any) -> str:
    """Hash a typed v1 identity payload and return its typed identifier.

    The hashed body is ``{"kind": kind, "value": value, "version": IDENTITY_VERSION}``.

    Raises:
        IdentityError: If *kind* is not a key of ``PREFIXES`` or *value* cannot
            be canonicalized.
    """
    # The isinstance guard keeps unhashable kinds from surfacing as TypeError.
    prefix = PREFIXES.get(kind) if isinstance(kind, str) else None
    if prefix is None:
        raise IdentityError(f"unknown identity kind {kind!r}")
    body = {"kind": kind, "value": value, "version": IDENTITY_VERSION}
    return prefix + hashlib.sha256(canonical_bytes(body)).hexdigest()


def is_typed_id(value: Any, kind: str) -> bool:
    """Return True when *value* is the prefix for *kind* plus 64 lowercase hex digits."""
    prefix = PREFIXES.get(kind) if isinstance(kind, str) else None
    if not prefix or not isinstance(value, str):
        return False
    return re.fullmatch(re.escape(prefix) + r"[0-9a-f]{64}", value) is not None


def case_id(*, sku: str, profile: dict[str, Any], case: dict[str, Any]) -> str:
    """Return the ``case`` identifier over ``{case, profile, sku}``."""
    return digest("case", {"case": case, "profile": profile, "sku": sku})


def workload_id(value: dict[str, Any]) -> str:
    """Return the ``workload`` identifier of *value*."""
    return digest("workload", value)


def series_id(value: dict[str, Any]) -> str:
    """Return the ``series`` identifier of *value*."""
    return digest("series", value)


def point_id(*, series: str, tokens_per_rank: int) -> str:
    """Return the ``point`` identifier over ``{series_id, tokens_per_rank}``."""
    return digest("point", {"series_id": series, "tokens_per_rank": tokens_per_rank})


def allocation_id(value: dict[str, Any]) -> str:
    """Return the ``allocation`` identifier of *value*."""
    return digest("allocation", value)


def attempt_id(*, allocation: str, case: str, ordinal: int) -> str:
    """Return the ``attempt`` identifier over ``{allocation_id, case_id, ordinal}``."""
    return digest(
        "attempt", {"allocation_id": allocation, "case_id": case, "ordinal": ordinal}
    )


def evidence_id(
    *, point: str, allocation: str, attempt: str, sample_sha256: str
) -> str:
    """Return the ``evidence`` identifier.

    The payload is ``{allocation_id, attempt_id, point_id, sample_sha256}``.
    """
    return digest(
        "evidence",
        {
            "allocation_id": allocation,
            "attempt_id": attempt,
            "point_id": point,
            "sample_sha256": sample_sha256,
        },
    )


# Fixed payload and the series identifier it is expected to hash to.
IDENTITY_TEST_VECTOR = {
    "payload": {"backend": "deepep", "ep": 8, "shape": [7168, 8, 256]},
    "series_id": "cxseries-v1-a79bf758488e3edd50f5531f3af825f371bf42aae7c4097e461fd2a32615af81",
}


def verify_test_vector() -> None:
    """Recompute the test-vector series ID and raise IdentityError on mismatch."""
    observed = series_id(IDENTITY_TEST_VECTOR["payload"])
    if observed != IDENTITY_TEST_VECTOR["series_id"]:
        raise IdentityError(
            f"identity implementation differs: {observed} != {IDENTITY_TEST_VECTOR['series_id']}"
        )


if __name__ == "__main__":
    verify_test_vector()
    print(IDENTITY_TEST_VECTOR["series_id"])
&& pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-${CX_PUBLIC_RUNNER:-}}}" +case "$PRODUCT" in + gb200|gb300) ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to gb200 or gb300" ;; +esac +RUNNER="$PRODUCT" +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}" +export CX_IMAGE_PLATFORM=linux/arm64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" +NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-4}" +SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}" +EXPECTED_WORLD=$((NODES * GPN)) +NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}" +if [ "$PRODUCT" = gb200 ]; then default_time=30; else default_time=90; fi +TIME_MIN="${CX_TIME:-$default_time}" +[ "$NODES" = 1 ] || [ "$NODES" = 2 ] || cx_die "$PRODUCT supports one or two four-GPU trays" +[ "$GPN" = 4 ] || cx_die "$PRODUCT requires four GPUs per tray" +[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "$PRODUCT requires the NVL72 scale-up domain" +[ "$NGPUS" = "$EXPECTED_WORLD" ] || cx_die "$PRODUCT world size must equal nodes x GPUs per tray" +cx_apply_timing_profile +# CX_IMAGE is a Docker tag; cx_ensure_squash derives the local squash filename. 
+IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}" +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="${PRODUCT}-nvl72-mnnvl" CX_TRANSPORT="mnnvl" +export CX_NODES="$NODES" CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN" +export CX_NGPUS="$NGPUS" +case "$CX_BENCH" in + deepep|deepep-v2|deepep-hybrid|nccl-ep) ;; + *) cx_die "unsupported $PRODUCT EP backend: $CX_BENCH" ;; +esac +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR +[ "$PRODUCT" != gb300 ] || cx_require_vars CX_ENROOT_CACHE_PATH +PARTITION="$CX_PARTITION"; ACCOUNT="$CX_ACCOUNT"; SQUASH_DIR="$CX_SQUASH_DIR" +[ -z "${CX_ENROOT_CACHE_PATH:-}" ] || export ENROOT_CACHE_PATH="$CX_ENROOT_CACHE_PATH" +export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 + +cx_log "$PRODUCT runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH phase=${CX_PHASE:-decode}" +[ "${CX_DRYRUN:-0}" = "1" ] && { cx_log "DRYRUN"; exit 0; } +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "$CX_STAGE_DIR")" +cx_prepare_runtime_marker "$MOUNT_SRC" +CONTAINER_MOUNTS="$MOUNT_SRC:/ix" +if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then + cx_set_failure_stage backend-setup + cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \ + || cx_die "cannot stage the pinned backend source" + export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources +fi +if [ "$CX_BENCH" = deepep-v2 ]; then + cx_prepare_backend_cache "$CX_SQUASH_DIR" \ + || cx_die "cannot prepare the isolated backend cache" + BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE" + CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$BACKEND_CACHE:/cx-cache" + export CX_BACKEND_CACHE_ROOT=/cx-cache +fi +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found" + +if [ "$NODES" -le 1 ]; then + 
cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" \ + --gres=gpu:"$GPN" --exclusive --mem=0 --cpus-per-task=72 \ + --time="$TIME_MIN" --job-name="$RUNNER" +else + cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPN" --ntasks-per-node="$GPN" --exclusive --mem=0 --cpus-per-task=35 \ + --time="$TIME_MIN" --job-name="$RUNNER" +fi +[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc" +cx_set_failure_stage container-import +SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$SQUASH_DIR" "$IMAGE")" +cx_set_failure_stage container-hash +cx_export_squash_identity "$SQUASH_FILE" +cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \ + "${CX_SHARD_FILE:-}" + +if [ "$NODES" -le 1 ]; then # ---- EP4: single tray, run_in_container (torchrun -g 4) ---- + run_rc=0 + cx_set_failure_stage container-launch + runtime_log="$(cx_private_log_path runtime-ep4)" + srun --jobid="$JOB_ID" --chdir=/tmp --container-image="$SQUASH_FILE" \ + --container-mounts="$CONTAINER_MOUNTS" \ + --no-container-mount-home --container-writable --container-remap-root \ + --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \ + --export="$(cx_container_exports)" bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \ + >"$runtime_log" 2>&1 || run_rc=$? + cx_adopt_runtime_stage "$MOUNT_SRC" + [ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true + collect_rc=0 + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? 
+ [ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection + final_rc="$run_rc" + [ "$final_rc" != 0 ] || final_rc="$collect_rc" + exit "$final_rc" +fi + +# ---- EP8: 2 trays, run_ep.py directly across 8 ranks (no torchrun; MNNVL intranode path) ---- +cx_set_failure_stage scheduler-allocation +MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)" 2>/dev/null | head -1)" +MP="${CX_MASTER_PORT:-29551}" +[[ "$MA" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]] \ + || cx_die "could not resolve the allocated primary node" +[[ "$MP" =~ ^[1-9][0-9]*$ ]] && [ "$MP" -le 65535 ] \ + || cx_die "invalid distributed rendezvous port" +mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results" +# Restore process-local loader/import paths and exact backend build identity from build-only. +SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66' +BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; esac' +WRAP="$SOURCE_BACKEND_ENV"'; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"' + +# Prepare the backend once per node in the persistent container reused by every case. 
+CNAME="cxep8_${JOB_ID}" +CMOUNT=(--container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home + --container-writable --container-remap-root + --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint) +cx_log "EP backend preparation: bench=$CX_BENCH" +cx_set_failure_stage backend-setup +build_log="$(cx_private_log_path backend-prepare)" +set +e +srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \ + --container-name="$CNAME" --container-image="$SQUASH_FILE" "${CMOUNT[@]}" \ + --export="$(cx_container_exports),CX_BUILD_ONLY=1" \ + bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \ + "$build_log" 2>&1 +build_rc=$? +if [ "$build_rc" = 0 ]; then + srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \ + --container-name="$CNAME" "${CMOUNT[@]}" --export="$(cx_container_exports)" \ + bash -c "$BACKEND_PROBE" \ + >"$build_log" 2>&1 + build_rc=$? +fi +set -e +if [ "$build_rc" != 0 ]; then + cx_fail_stage backend-setup "$build_log" || true + cx_log "ERROR: EP backend preparation failed rc=$build_rc" + cx_emit_setup_failures "$CX_DIR" "$MOUNT_SRC/experimental/CollectiveX/results" \ + "$CX_BENCH" "$build_rc" + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || true + exit "$build_rc" +fi +cx_set_failure_stage execution + +# The EP8 case list as pipe-delimited records. SWEEP (CX_SHARD_FILE set): one line per shard case, +# so the rack-scale EP8 path sweeps EVERY case of its shard (parity with run_in_container's single- +# node SHARD loop). MANUAL (no shard file) emits one line per requested phase. +cx_ep8_cases() { + # CX_SHARD_FILE is workflow-relative (.shards/.json, written by the Extract step with + # working-directory=experimental/CollectiveX). This EP8 path runs on the SUBMIT HOST where cwd is + # the repo root, so resolve it against $CX_DIR (=experimental/CollectiveX) when not found as-is — + # else the SHARD branch is skipped and only ONE default case runs instead of the shard's N. 
+ local sf="${CX_SHARD_FILE:-}" + [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf" + if [ -n "$sf" ]; then + [ -f "$sf" ] || { cx_log "ERROR: shard control disappeared"; return 1; } + # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` would collapse consecutive tabs and + # swallow empty fields like a false eplb, shifting every column. No case field contains '|'.) + python3 - "$sf" <<'PY' +import json, sys +d = json.load(open(sys.argv[1])) +for c in d["cases"]: + g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv) + print("|".join([g("phase","decode"), g("routing","uniform"), + ("1" if c.get("eplb") else ""), g("hidden","7168"), g("topk","8"), g("experts","256"), + g("ladder",""), g("suite",""), g("workload",""), g("required_publication",""), + ("1" if c.get("canonical") else ""), g("case_id",""), g("ep",""), + g("timing","8:64:32")])) +PY + else + local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill" + local ph; local -a fields + for ph in $phases; do + fields=("$ph" "${CX_ROUTING:-uniform}" "${CX_EPLB:+1}" + "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}" + "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" "${CX_REQUIRED_PUBLICATION:-}" + "${CX_CANONICAL:+1}" "${CX_CASE_ID:-}" "$NGPUS" + "${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}") + (IFS='|'; printf '%s\n' "${fields[*]}") + done + fi +} + +# Per-rank env for the EP8 case sruns. DeepEP main's Buffer gates multi-tray NVLink behind allow_mnnvl, which defaults +# False -> DeepEP then sets NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer takes the intranode-only CUDA-IPC +# peer path, faulting across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). CX_ALLOW_MNNVL=1 +# makes tests/ep_deepep.py pass allow_mnnvl=True so the NVL buffer spans both trays over the fabric API. +# The pinned V1 exposes this flag explicitly; the adapter fails closed if that API changes. 
+EP8_EXPORTS="$(cx_container_exports),MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1" +[ "$CX_BENCH" = "deepep" ] && EP8_EXPORTS="$EP8_EXPORTS,CX_ALLOW_MNNVL=1" + +ci=0 +failed_cases=0 +cases_file="$(mktemp)" +if ! cx_ep8_cases > "$cases_file"; then + rm -f "$cases_file" + cx_die "could not enumerate validated shard cases" +fi +expected_cases="$(wc -l < "$cases_file" | tr -d ' ')" +[ "$expected_cases" -gt 0 ] || { rm -f "$cases_file"; cx_die "case list is empty"; } +while IFS='|' read -r ph routing eplb hidden topk experts lad suite workload required_pub \ + canonical case_id ep timing; do + [ -n "$ph" ] || continue + ci=$((ci+1)) + case_stem="${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")" + IFS=':' read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}" + case_iters="${case_iters:-8}"; case_trials="${case_trials:-64}"; case_warmup="${case_warmup:-32}" + ep="${ep:-$NGPUS}" + export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload" + export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep" + export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_TOKENS_LADDER="$lad" + export CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts" + export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup" + export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))" + export CX_WARMUP_SEMANTICS="full-roundtrip-before-each-component-trial-point-v1" + cx_log "EP${NGPUS}[$ci] id=${case_id:-manual} $ph $CX_BENCH routing=$routing eplb=${eplb:-0}" + if [ "$ep" != "$NGPUS" ]; then + cx_log "ERROR: case EP$ep does not match allocated world size $NGPUS" + export CX_ATTEMPT_ID=1 + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5 + failed_cases=$((failed_cases + 1)) + continue + fi + + workload_dir="" + if [ -n "$canonical" ]; then + 
workload_dir=".cx_workloads/ep${ep}_${routing}" + workload_ladder="$lad" + [ -n "$workload_ladder" ] || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096" + workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir" --routing "$routing" + --ep "$ep" --hidden "$hidden" --topk "$topk" --experts "$experts" + --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder") + workload_log="$(cx_private_log_path "workload-c$(printf '%03d' "$ci")")" + stage_rc=0 + set +e + srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \ + --container-name="$CNAME" "${CMOUNT[@]}" \ + --export="$EP8_EXPORTS" "${workload_args[@]}" \ + "$workload_log" 2>&1 + stage_rc=$? + set -e + if [ "$stage_rc" != 0 ]; then + cx_log "ERROR: canonical workload staging failed rc=$stage_rc" + export CX_ATTEMPT_ID=1 + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc" + failed_cases=$((failed_cases + 1)) + continue + fi + fi + + ep_args=(--backend "$CX_BENCH" --phase "$ph" --routing "$routing" + --gpus-per-node "$GPN" --scale-up-domain "$SCALE_UP_DOMAIN" + --tokens-ladder "$lad" + --hidden "$hidden" --topk "$topk" --experts "$experts" + --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials" + --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$CX_TOPO" + --transport "$CX_TRANSPORT" --case-id "$case_id" --suite "$suite" + --workload-name "$workload" --required-publication "$required_pub") + [ -n "$eplb" ] && ep_args+=(--eplb) + [ -n "$workload_dir" ] && ep_args+=(--workload-dir "$workload_dir") + attempt=1 + case_ok=0 + export CX_ATTEMPT_ID="$attempt" + attempt_tag="a01" + out="results/${case_stem}_${attempt_tag}.json" + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json" + runtime_log="$(cx_private_log_path "runtime-c$(printf '%03d' "$ci")-$attempt_tag")" + set +e + timeout -k 30 "${CX_RUN_TIMEOUT:-900}" 
srun --jobid="$JOB_ID" --nodes="$NODES" \ + --ntasks="$NGPUS" --chdir=/tmp \ + --ntasks-per-node="$GPN" --container-name="$CNAME" "${CMOUNT[@]}" \ + --export="$EP8_EXPORTS" \ + bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" \ + "$runtime_log" 2>&1 + run_rc=$? + set -e + expected_out="$MOUNT_SRC/experimental/CollectiveX/$out" + if [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" success; then + case_ok=1 + elif [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" invalid; then + cx_log "ERROR: EP${NGPUS}[$ci] completed with invalid semantic evidence" + else + [ "$run_rc" = 0 ] && run_rc=1 + if cx_has_result_doc "$expected_out"; then + cx_demote_result_doc "$expected_out" "$run_rc" \ + || { cx_quarantine_result_doc "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; } + else + cx_quarantine_result_doc "$expected_out" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc" + fi + fi + if [ "$case_ok" = 0 ]; then + failed_cases=$((failed_cases + 1)) + cx_log "ERROR: EP${NGPUS}[$ci] failed" + fi +done < "$cases_file" +rm -f "$cases_file" +[ "$ci" -eq "$expected_cases" ] || cx_die "enumerated $expected_cases cases but executed $ci" +run_rc=0 +if [ "$failed_cases" -ne 0 ]; then + summary_log="$(cx_private_log_path shard-summary)" + printf 'SHARD done: %s/%s case(s) failed\n' "$failed_cases" "$expected_cases" > "$summary_log" + cx_fail_stage execution "$summary_log" || true + run_rc=1 +fi +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? 
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +exit "$final_rc" diff --git a/experimental/CollectiveX/launchers/launch_mi-amds.sh b/experimental/CollectiveX/launchers/launch_mi-amds.sh new file mode 100644 index 0000000000..5f3de33078 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_mi-amds.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +# CollectiveX shared MI325X/MI355X AMD Slurm launcher. +# +# The ROCm path imports its squash in the allocation and uses writable/remapped +# pyxis containers. Scheduling, exclusions, node pins, and storage come from the +# runner-local config. +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}" +case "$RUNNER" in + mi325x) CPUS_PER_TASK=256; DEVICE_MOUNTS=",/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" ;; + mi355x) CPUS_PER_TASK=128; DEVICE_MOUNTS="" ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to mi325x or mi355x" ;; +esac +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-mori}" +export CX_IMAGE_PLATFORM=linux/amd64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-60}" # generous: a cold enroot import of the large ROCm image +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}" +# Optional node pin overrides the exclusion list. +NODELIST="${CX_NODELIST:-}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +# AMD EP backends: MoRI and the portable NCCL/RCCL all-to-all reference. 
+case "$CX_BENCH" in + mori|nccl-ep) ;; + *) cx_die "unsupported AMD EP backend: $CX_BENCH" ;; +esac +if [ "$RUNNER" = mi325x ]; then + export MORI_DISABLE_AUTO_XGMI="${MORI_DISABLE_AUTO_XGMI:-0}" + export MORI_ENABLE_SDMA="${MORI_ENABLE_SDMA:-1}" + export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-info}" + export MORI_SHMEM_LOG_LEVEL="${MORI_SHMEM_LOG_LEVEL:-info}" + export MORI_IO_LOG_LEVEL="${MORI_IO_LOG_LEVEL:-info}" + if [ "$CX_BENCH" = mori ]; then + export CX_IMAGE="${CX_IMAGE:-$CX_IMAGE_AMD_MORI_MI325}" + export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-asyncll}" + fi +fi +# Resolve the image now that CX_BENCH and RUNNER are both final (see note at IMAGE decl). +IMAGE="${CX_IMAGE:-$(cx_default_image "$RUNNER")}" +export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES=1 CX_GPUS_PER_NODE="$NGPUS" +export CX_SCALE_UP_DOMAIN="$NGPUS" CX_TS="$TS" +# topology_class is part of comparison_key; label the actual SKU when the MI325X wrapper calls this. +case "${RUNNER}" in + mi325x*) export CX_TOPO="mi325x-xgmi" ;; + *) export CX_TOPO="mi355x-xgmi" ;; +esac +export CX_TRANSPORT="xgmi" +# Allow a longer per-phase guard for large MoRI prefill points. +export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}" +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_SQUASH_DIR +PARTITION="$CX_PARTITION" +SQUASH_DIR="$CX_SQUASH_DIR" +cx_log "runner=$RUNNER ngpus=$NGPUS bench=$CX_BENCH" +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_prepare_runtime_marker "$MOUNT_SRC" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found on this runner" +cx_require_single_node "$RUNNER" + +# Pin to specific nodes when configured, otherwise apply the optional exclusion list. 
+allocation=(--partition="$PARTITION" --gres=gpu:"$NGPUS" --exclusive + --cpus-per-task="$CPUS_PER_TASK" + --time="$TIME_MIN" --job-name="$RUNNER") +if [ -n "$NODELIST" ]; then + cx_log "using configured node pin" + allocation+=(--nodelist="$NODELIST") +elif [ -n "$EXCLUDE_NODES" ]; then + allocation+=(--exclude="$EXCLUDE_NODES") +fi +cx_salloc_jobid "${allocation[@]}" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" + +cx_set_failure_stage container-import +SQUASH_FILE="$(cx_ensure_squash_on_job \ + "$JOB_ID" "$SQUASH_DIR" "$IMAGE" "${CX_LOCK_DIR:-}")" +cx_set_failure_stage container-hash +import_log="$(cx_private_log_path image-hash)" +if ! COLLECTIVEX_SQUASH_SHA256="$( + srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \ + --export="$(cx_host_exports)" \ + sha256sum "$SQUASH_FILE" \ + 2>>"$import_log" | awk 'NR==1 {print $1}' +)"; then + cx_fail_stage container-hash "$import_log" +fi +[[ "$COLLECTIVEX_SQUASH_SHA256" =~ ^[0-9a-f]{64}$ ]] \ + || cx_fail_stage container-hash "$import_log" +export COLLECTIVEX_SQUASH_SHA256 +cx_preflight_allocation "$JOB_ID" 1 "$MOUNT_SRC" "$SQUASH_FILE" "${CX_SHARD_FILE:-}" + +run_rc=0 +cx_set_failure_stage container-launch +runtime_log="$(cx_private_log_path runtime)" +srun --jobid="$JOB_ID" --chdir=/tmp \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR$DEVICE_MOUNTS" \ + --container-writable --container-remap-root --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export="$(cx_container_exports)" \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" \ + >"$runtime_log" 2>&1 || run_rc=$? + +cx_adopt_runtime_stage "$MOUNT_SRC" +[ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? 
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the +# next checkout on this runner is clean. +rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true +cx_log "done — result artifacts collected" +exit "$final_rc" diff --git a/experimental/CollectiveX/launchers/launch_single-slurm.sh b/experimental/CollectiveX/launchers/launch_single-slurm.sh new file mode 100644 index 0000000000..b9b1ef9e8d --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_single-slurm.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# CollectiveX shared single-node NVIDIA Slurm launcher. +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}" +ALLOC_EXTRA=(); SRUN_EXTRA=(); LOCAL_IMPORT=0 +case "$RUNNER" in + h100-dgxc) PRODUCT=h100; TOPO=h100-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 ;; + h200-dgxc) + PRODUCT=h200; TOPO=h200-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=0 + SRUN_EXTRA=(--container-remap-root) + ;; + b200-dgxc) + PRODUCT=b200; TOPO=b200-nvlink-island; DEFAULT_TIME=30; REQUIRE_ACCOUNT=1 + ALLOC_EXTRA=(--mem=0) + ;; + b300) + PRODUCT=b300; TOPO=b300-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 + ALLOC_EXTRA=(-N 1 --mem=0) + SRUN_EXTRA=(--mpi=none --container-remap-root) + LOCAL_IMPORT=1 + ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to a registered single-node SKU" ;; +esac +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}" +export CX_IMAGE_PLATFORM=linux/amd64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" + +NGPUS="${CX_NGPUS:-8}" 
+TIME_MIN="${CX_TIME:-$DEFAULT_TIME}" +IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}" +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES=1 CX_GPUS_PER_NODE="$NGPUS" +export CX_SCALE_UP_DOMAIN="$NGPUS" CX_TS="$TS" CX_TOPO="$TOPO" CX_TRANSPORT=nvlink +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export NCCL_CUMEM_ENABLE=1 +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_SQUASH_DIR +[ "$REQUIRE_ACCOUNT" = 0 ] || cx_require_vars CX_ACCOUNT +[ "$RUNNER" != b300 ] || cx_require_vars CX_STAGE_DIR + +cx_log "runner=$RUNNER ngpus=$NGPUS bench=$CX_BENCH" +[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; } +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +SQUASH_FILE="" +if [ "$LOCAL_IMPORT" = 1 ]; then + cx_set_failure_stage container-import + SQUASH_FILE="$(CX_ENROOT_LOCAL_IMPORT=1 \ + cx_ensure_squash "$CX_SQUASH_DIR" "$IMAGE")" + cx_set_failure_stage container-hash + cx_export_squash_identity "$SQUASH_FILE" +fi +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_prepare_runtime_marker "$MOUNT_SRC" +CONTAINER_MOUNTS="$MOUNT_SRC:/ix" +if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then + cx_set_failure_stage backend-setup + cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \ + || cx_die "cannot stage the pinned backend source" + export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources +fi +if [ "$CX_BENCH" = deepep-v2 ]; then + cx_prepare_backend_cache "$CX_SQUASH_DIR" \ + || cx_die "cannot prepare the isolated backend cache" + BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE" + CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$BACKEND_CACHE:/cx-cache" + export CX_BACKEND_CACHE_ROOT=/cx-cache +fi + +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found on this runner" +cx_require_single_node "$RUNNER" + 
+allocation=(--partition="$CX_PARTITION" --gres=gpu:"$NGPUS" --exclusive + --time="$TIME_MIN" --job-name="$RUNNER" "${ALLOC_EXTRA[@]}") +[ -z "${CX_ACCOUNT:-}" ] || allocation+=(--account="$CX_ACCOUNT") +[ -z "${CX_EXCLUDE_NODES:-}" ] || allocation+=(--exclude="$CX_EXCLUDE_NODES") +cx_salloc_jobid "${allocation[@]}" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +if [ "$LOCAL_IMPORT" = 0 ]; then + cx_set_failure_stage container-import + SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$CX_SQUASH_DIR" "$IMAGE")" + cx_set_failure_stage container-hash + cx_export_squash_identity "$SQUASH_FILE" +fi +cx_preflight_allocation "$JOB_ID" 1 "$MOUNT_SRC" "$SQUASH_FILE" "${CX_SHARD_FILE:-}" + +run_rc=0 +cx_set_failure_stage container-launch +runtime_log="$(cx_private_log_path runtime)" +srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" \ + --container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home \ + --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \ + "${SRUN_EXTRA[@]}" --export="$(cx_container_exports)" \ + bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \ + >"$runtime_log" 2>&1 || run_rc=$? +cx_adopt_runtime_stage "$MOUNT_SRC" +[ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? 
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +cx_log "done - result artifacts collected" +exit "$final_rc" diff --git a/experimental/CollectiveX/publisher.py b/experimental/CollectiveX/publisher.py new file mode 100644 index 0000000000..a90dc99970 --- /dev/null +++ b/experimental/CollectiveX/publisher.py @@ -0,0 +1,3167 @@ +#!/usr/bin/env python3 +"""Fail-closed filesystem publisher for CollectiveX EP v1 artifacts.""" +from __future__ import annotations + +import argparse +import contextlib +import datetime as dt +import fcntl +import hashlib +import json +import math +import os +from pathlib import Path, PurePosixPath +import re +import shutil +import stat +import statistics +import sys +import tempfile +from typing import Any, Iterator, Sequence +import zipfile + +import jsonschema + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) + +import artifact_safety # noqa: E402 +import capability # noqa: E402 +import contracts # noqa: E402 +import identity # noqa: E402 +import sweep_matrix # noqa: E402 + +FORMAT_BUNDLE = "collectivex.private.bundle.v1" +FORMAT_PUBLIC = "collectivex.public.v1" +FORMAT_CHANNEL = "collectivex.channel.v1" +POLICY = "collectivex-decision-grade-v1" +PUBLISHER_POLICY = "collectivex-publisher-v1" +OUTCOMES = ("success", "unsupported", "failed", "invalid", "diagnostic") +REQUIRED_ALLOCATIONS = 3 +REQUIRED_COHORT_KINDS = ("library", "chip", "system", "routing") +REQUIRED_PROMOTION_COHORT_COUNTS = {"library": 48, "system": 12, "routing": 76} +CANONICAL_FULL_V1_MATRIX_SHA256 = ( + "292e05f8faccaa4971eda527a327190a9943e99d4f71611987f7b95f57f253e8" +) +CANONICAL_FULL_V1_CASE_CATALOG_SHA256 = ( + "29a9e2d65777e0bf388d49bfe31f91e0ec6537dafdaa71ac91c6ed75f9e44b00" +) +P50_STABILITY_LIMIT = 1.10 +P99_STABILITY_LIMIT = 1.25 +MAX_ARCHIVE_MEMBERS = 20_000 +MAX_ARCHIVE_MEMBER_BYTES = 2 * 1024**3 +MAX_ARCHIVE_TOTAL_BYTES = 16 * 
1024**3 +MAX_PUBLIC_DATASET_BYTES = 32 * 1024**2 +HEX64 = re.compile(r"[0-9a-f]{64}") +SAFE_ID = re.compile(r"[a-z0-9][a-z0-9_.-]{0,127}") +REASON = re.compile(r"[a-z0-9][a-z0-9.-]{0,95}") +ARTIFACT_NAME = re.compile( + r"cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*" +) +CHANNEL_PATH = re.compile(r"datasets/([0-9a-f]{64})/dataset\.json") +SCHEMA_DIR = HERE / "schemas" +_SCHEMAS: dict[str, jsonschema.protocols.Validator] = {} + + +class PublisherError(ValueError): + """Input or stored state violates the publication contract.""" + + +strict_load = contracts.strict_load +_canonical = contracts.canonical_json_bytes + + +def _sha_bytes(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def _sha_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _latest_timestamp(values: Sequence[str]) -> str: + """Return the latest evidence timestamp without introducing publisher wall time.""" + if not values: + raise PublisherError("cannot derive a timestamp without evidence") + + def parsed(value: str) -> dt.datetime: + try: + timestamp = dt.datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError as exc: + raise PublisherError("evidence timestamp is not ISO-8601") from exc + if timestamp.tzinfo is None: + raise PublisherError("evidence timestamp must include a timezone") + return timestamp.astimezone(dt.timezone.utc) + + return max(values, key=lambda value: (parsed(value), value)) + + +def _schema(name: str, value: Any) -> None: + validator = _SCHEMAS.get(name) + if validator is None: + schema = strict_load(SCHEMA_DIR / name) + jsonschema.Draft202012Validator.check_schema(schema) + validator = jsonschema.Draft202012Validator( + schema, format_checker=jsonschema.FormatChecker() + ) + _SCHEMAS[name] = validator + errors = sorted(validator.iter_errors(value), key=lambda 
error: list(error.absolute_path)) + if errors: + error = errors[0] + location = ".".join(map(str, error.absolute_path)) or "$" + raise PublisherError(f"{name}:{location}: {error.message}") +def _exact(obj: Any, fields: set[str], path: str) -> dict[str, Any]: + if not isinstance(obj, dict): + raise PublisherError(f"{path} must be an object") + actual = set(obj) + if actual != fields: + raise PublisherError( + f"{path} fields differ: missing={sorted(fields - actual)}, " + f"extra={sorted(actual - fields)}" + ) + return obj +def _array(value: Any, path: str, *, nonempty: bool = False) -> list[Any]: + if not isinstance(value, list) or (nonempty and not value): + qualifier = "a nonempty" if nonempty else "an" + raise PublisherError(f"{path} must be {qualifier} array") + return value + + +def _integer(value: Any, path: str, *, minimum: int = 0) -> int: + if type(value) is not int or value < minimum: + raise PublisherError(f"{path} must be an integer >= {minimum}") + return value + + +def _unique(values: Sequence[Any], path: str) -> None: + serialized = [_canonical(value) for value in values] + if len(serialized) != len(set(serialized)): + raise PublisherError(f"{path} contains duplicates") + +def _eligibility(value: dict[str, Any], path: str) -> dict[str, Any]: + allocations = value["allocation_ids"] + p50 = value["p50_max_min_ratio"] + p99 = value["p99_max_min_ratio"] + gates = ( + len(allocations) >= REQUIRED_ALLOCATIONS, + value["complete"], value["correct"], value["measured_roundtrip_p99"], + value["stable_p50"], value["stable_p99"], value["stable_ordering"], + p50 is not None and p50 <= P50_STABILITY_LIMIT, + p99 is not None and p99 <= P99_STABILITY_LIMIT, + ) + if value["decision_grade"] != (all(gates) and not value["reasons"]): + raise PublisherError(f"{path}.decision_grade does not match promotion gates") + if value["decision_grade"] == bool(value["reasons"]): + raise PublisherError(f"{path}.reasons does not match decision status") + return value + + +def 
validate_channel(doc: Any, *, expected_channel: str | None = None) -> dict[str, Any]: + _schema("channel-v1.schema.json", doc) + if expected_channel and doc["channel"] != expected_channel: + raise PublisherError("channel name does not match its file") + target = doc["dataset"] + match = CHANNEL_PATH.fullmatch(target["path"]) if isinstance(target["path"], str) else None + if not match or match.group(1) != target["sha256"]: + raise PublisherError("channel dataset path and sha256 do not agree") + return doc + + +def _metric_value(series: dict[str, Any], metric: dict[str, Any]) -> tuple[str, float, str]: + point = next( + (point for point in series["points"] if point["tokens_per_rank"] == metric["tokens_per_rank"]), + None, + ) + if point is None or series["phase"] != metric["phase"]: + raise PublisherError("decision metric references an unavailable point") + component = point["components"]["roundtrip"] + if metric["measure"] == "latency_us": + value = component["latency_us"][metric["statistic"]] + unit = "us" + else: + rates = component["logical_payload_rate_gbps_at_latency_percentile"] + if rates is None: + raise PublisherError("logical bandwidth decision has no logical byte contract") + value = rates[metric["statistic"]] + unit = "GB/s" + return point["point_id"], value, unit + + +def _validate_metric(metric: dict[str, Any]) -> None: + expected = "min" if metric["measure"] == "latency_us" else "max" + if metric["objective"] != expected: + raise PublisherError(f"{metric['measure']} objective must be {expected}") + + +def _metric_label(measure: str, statistic: str) -> str: + return ( + f"{statistic} latency" + if measure == "latency_us" + else f"payload rate at {statistic} latency" + ) + + +def _routing_build_control(build: dict[str, Any]) -> dict[str, Any]: + return { + key: build[key] + for key in ( + "routing_control_sha256", "image_digest", "source_sha", "squash_sha256", + ) + } + + +def _routing_implementation_mismatch(members: Sequence[dict[str, Any]]) -> bool: 
+ off_eplb_hashes = { + member["build"]["implementation_contract_sha256"] + for member in members if not member["workload"]["eplb"] + } + return len(off_eplb_hashes) > 1 + + +def _public_case_factors(series: dict[str, Any]) -> dict[str, Any]: + workload = series["workload"] + system = series["system"] + measurement = series["measurement"] + platform = capability.PLATFORMS[system["sku"]] + ep_size = system["ep_size"] + return { + "case": { + "backend": series["backend"]["id"], + "canonical": True, + "eplb": workload["eplb"], + "ep": ep_size, + "experts": workload["experts"], + "gpus_per_node": platform["gpus_per_node"], + "hidden": workload["hidden"], + "ladder": " ".join(str(point["tokens_per_rank"]) for point in series["points"]), + "nodes": ep_size // platform["gpus_per_node"], + "phase": series["phase"], + "required_publication": series["publication_tier"], + "routing": workload["routing"], + "samples_per_point": measurement["samples_per_component"], + "scale_up_domain": platform["scale_up_domain"], + "suite": series["suite"], + "timing": ( + f"{measurement['iters']}:{measurement['trials']}:" + f"{measurement['warmups']}" + ), + "topk": workload["top_k"], + "warmup_semantics": sweep_matrix.ep_harness.WARMUP_SEMANTICS, + "workload": series["model"], + }, + "profile": identity.V1_CASE_PROFILE, + "sku": system["sku"], + } + + +def _public_series_config(series: dict[str, Any]) -> dict[str, Any]: + return { + "backend": { + "generation": series["backend"]["generation"], + "version": series["backend"]["version"], + }, + "resource": series["resource"], + "system": {"label": series["system"]["label"]}, + } + + +def _public_cohort_factors(kind: str, item: dict[str, Any]) -> tuple[Any, Any]: + workload = item["workload"] + build = item["build"] + shape = { + key: workload[key] + for key in ( + "hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype", + "activation_profile", + ) + } + common = { + "model": item["model"], "phase": item["phase"], "shape": shape, + 
"measurement": item["measurement"], "ep_size": item["system"]["ep_size"], + } + if kind == "library": + return ( + {**common, "system": item["system"], "workload": workload, + "resource_mode": item["resource"]["mode"], "source": build["source_sha"]}, + item["backend"]["id"], + ) + if kind == "chip": + return ( + {**common, "backend": item["backend"], "workload": workload, + "resource_mode": item["resource"]["mode"], "source": build["source_sha"]}, + item["system"], + ) + if kind == "system": + return {**common, "workload": workload, "source": build["source_sha"]}, [ + item["system"]["sku"], item["backend"]["id"], item["resource"]["profile"] + ] + if kind == "routing": + return ( + {**common, "backend": item["backend"], "system": item["system"], + "resource": item["resource"], "build": _routing_build_control(build)}, + [workload["routing"], workload["eplb"], + build["implementation_contract_sha256"]], + ) + raise PublisherError(f"unknown cohort kind {kind}") + + +def _case_disposition_catalog_sha256(coverage: Sequence[dict[str, Any]]) -> str: + catalog = [ + {"case_id": item["case_id"], "disposition": item["disposition"]} + for item in sorted(coverage, key=lambda item: item["case_id"]) + ] + return _sha_bytes(_canonical(catalog)) + + +def validate_public_dataset(doc: Any) -> dict[str, Any]: + _schema("public-dataset-v1.schema.json", doc) + if len(_canonical(doc)) + 1 > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("public dataset exceeds the serving size limit") + try: + artifact_safety.assert_publication_safe([doc]) + except artifact_safety.ArtifactSafetyError as exc: + raise PublisherError(str(exc)) from exc + if doc["source_bundle_ids"] != sorted(doc["source_bundle_ids"]): + raise PublisherError("source bundle IDs are not canonical") + for field, key in ( + ("coverage", "case_id"), ("attempts", "attempt_id"), + ("series", "series_id"), ("cohorts", "cohort_id"), + ("rankings", "ranking_id"), ("recommendations", "recommendation_id"), + ("sensitivities", 
"sensitivity_id"), + ): + if doc[field] != sorted(doc[field], key=lambda item: item[key]): + raise PublisherError(f"{field} are not in canonical identity order") + promotion = doc["promotion"] + quarantined = promotion["status"] == "quarantined" + if quarantined != (promotion["reason"] is not None) or quarantined != ( + promotion["matrix_id"] is None + ): + raise PublisherError("promotion reason/matrix identity differs from status") + attempts = {item["attempt_id"]: item for item in doc["attempts"]} + if len(attempts) != len(doc["attempts"]): + raise PublisherError("dataset has duplicate attempt IDs") + evidence = [ + value["evidence_id"] for item in doc["attempts"] for value in item["evidence"] + ] + _unique(evidence, "dataset attempt evidence") + series = {item["series_id"]: item for item in doc["series"]} + if len(series) != len(doc["series"]): + raise PublisherError("dataset has duplicate series IDs") + allocation_ids = set(promotion["allocation_ids"]) + case_ids = {item["case_id"] for item in doc["coverage"]} + if len(case_ids) != len(doc["coverage"]): + raise PublisherError("dataset has duplicate case coverage") + coverage_by_case = {item["case_id"]: item for item in doc["coverage"]} + for item in doc["attempts"]: + if item["case_id"] not in case_ids or item["allocation_id"] not in allocation_ids: + raise PublisherError("attempt references undeclared coverage or allocation") + if item["series_id"] is not None and item["series_id"] not in series: + raise PublisherError("attempt references unknown series") + if (item["outcome"] == "success") != (item["reason"] is None): + raise PublisherError("attempt reason must be null exactly for success") + if item["outcome"] == "success" and item["failure_mode"] is not None: + raise PublisherError("successful attempt cannot have a failure mode") + if (item["outcome"] == "success" and item["selected"]) != ( + item["series_id"] is not None + ): + raise PublisherError("attempt series must be present exactly for selected 
success") + if {item["allocation_id"] for item in doc["attempts"]} != allocation_ids: + raise PublisherError("promotion allocation catalog differs from attempts") + attempt_groups: dict[tuple[str, str], list[dict[str, Any]]] = {} + for item in doc["attempts"]: + attempt_groups.setdefault((item["case_id"], item["allocation_id"]), []).append(item) + for (case_id, allocation_id), group in attempt_groups.items(): + ordinals = sorted(item["attempt_index"] for item in group) + if ordinals != list(range(1, len(group) + 1)): + raise PublisherError("public retries must retain contiguous attempt indexes") + if any( + item["attempt_id"] != identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=item["attempt_index"] + ) + for item in group + ): + raise PublisherError("public retry identity differs from its case/allocation/index") + selected = [item for item in group if item["selected"]] + if len(selected) != 1 or selected[0]["attempt_index"] != ordinals[-1]: + raise PublisherError("publisher must select the latest retry per case/allocation") + selected_by_series: dict[str, list[dict[str, Any]]] = {} + for item in doc["attempts"]: + if item["selected"] and item["outcome"] == "success": + selected_by_series.setdefault(item["series_id"], []).append(item) + terminal = 0 + for item in doc["coverage"]: + listed = set(item["attempt_ids"]) + selected = item["selected_attempt_id"] + expected_attempts = { + attempt_id for attempt_id, attempt in attempts.items() + if attempt["case_id"] == item["case_id"] + } + if listed != expected_attempts: + raise PublisherError("coverage references attempts from another case") + if selected is not None: + terminal += 1 + if (selected not in listed or not attempts[selected]["selected"] + or any(attempts[selected][field] != item[field] + for field in ("outcome", "failure_mode", "reason"))): + raise PublisherError("coverage selected outcome differs") + selected_candidates = [attempts[value] for value in listed if 
attempts[value]["selected"]] + latest = max( + selected_candidates, + key=lambda attempt: ( + int(attempt["run_id"]), attempt["run_attempt"], + attempt["attempt_index"], attempt["attempt_id"] + ), + ) + if selected != latest["attempt_id"]: + raise PublisherError("coverage does not select the latest canonical allocation") + if promotion["requested_cases"] != len(doc["coverage"]) or promotion["terminal_cases"] != terminal: + raise PublisherError("promotion coverage counts differ") + selected_evidence: dict[tuple[str, str], set[str]] = {} + for attempt in doc["attempts"]: + if attempt["selected"] and attempt["series_id"] is not None: + for value in attempt["evidence"]: + selected_evidence.setdefault( + (attempt["series_id"], value["point_id"]), set() + ).add(value["evidence_id"]) + for item in doc["series"]: + eligibility = _eligibility(item["eligibility"], f"series {item['series_id']}") + workload = item["workload"] + model, hidden, top_k, experts = sweep_matrix.V1_WORKLOAD + suite_contract = sweep_matrix.V1_SUITE_CONTRACTS.get(item["suite"]) + coordinate = (item["phase"], workload["routing"], workload["eplb"]) + if ( + item["model"] != model + or (workload["hidden"], workload["top_k"], workload["experts"]) + != (hidden, top_k, experts) + or suite_contract is None + or coordinate not in suite_contract["coordinates"] + or item["publication_tier"] != suite_contract["publication"] + ): + raise PublisherError("series differs from the frozen v1 workload/suite profile") + backend_id = item["backend"]["id"] + expected_role = "reference" if backend_id == "nccl-ep" else "library" + if ( + backend_id not in capability.BACKENDS + or item["backend"]["label"] != BACKEND_LABELS[backend_id] + or item["backend"]["role"] != expected_role + or item["backend"]["version"] is None + ): + raise PublisherError("series backend projection differs from v1") + sku = item["system"]["sku"] + platform = capability.PLATFORMS.get(sku) + ep_size = item["system"]["ep_size"] + if platform is None or 
ep_size % platform["gpus_per_node"]: + raise PublisherError("series system projection differs from v1") + nodes = ep_size // platform["gpus_per_node"] + supported, _ = capability.resolve( + sku, backend_id, nodes=nodes, + routing=workload["routing"], eplb=workload["eplb"], + ) + if ( + not supported + or item["system"]["vendor"] != platform["vendor"] + or item["system"]["transport"] != platform["transport"] + or item["system"]["topology_class"] != platform["topology_class"] + or item["system"]["world_size"] != ep_size + or platform["product"] not in set( + re.findall(r"[a-z]+\d+[a-z]*", item["system"]["label"].lower()) + ) + ): + raise PublisherError("series system projection differs from v1") + if contracts.public_series_config_sha256(_public_series_config(item)) != item[ + "build" + ]["public_config_sha256"]: + raise PublisherError("public series configuration differs from its commitment") + covered = [coverage_by_case.get(case_id) for case_id in item["case_ids"]] + if not covered or any( + case is None + or (case["sku"], case["backend"], case["phase"]) + != (sku, backend_id, item["phase"]) + for case in covered + ): + raise PublisherError("series projection differs from its case coverage") + if ( + item["eplb"]["enabled"] != item["workload"]["eplb"] + or item["eplb"]["logical_experts"] != item["workload"]["experts"] + ): + raise PublisherError("series EPLB descriptor differs from its workload") + eplb = item["eplb"] + expected_physical = eplb["logical_experts"] + eplb["redundant_experts"] + nullable_eplb = ( + "planner", "mapping_sha256", "reference_tokens_per_rank", "max_replicas", + "imbalance_before", "imbalance_after", + ) + if eplb["enabled"]: + if ( + item["workload"]["routing"] != "zipf" + or any(eplb[field] is None for field in nullable_eplb) + or eplb["planner"] != "greedy-rank-major-v1" + or eplb["reference_tokens_per_rank"] != 2048 + or eplb["redundant_experts"] != 32 + or eplb["redundant_experts"] % ep_size != 0 + or eplb["physical_experts"] != 
expected_physical + or eplb["logical_experts"] % ep_size != 0 + or eplb["physical_experts"] % ep_size != 0 + or not 1 <= eplb["replicated_experts"] <= min( + eplb["logical_experts"], eplb["redundant_experts"] + ) + or not 2 <= eplb["max_replicas"] <= 1 + eplb["redundant_experts"] + or not 1 <= eplb["imbalance_after"] <= eplb["imbalance_before"] <= ep_size + ): + raise PublisherError("enabled EPLB descriptor is incomplete") + expected_plan = contracts._expected_eplb_plan( + workload["routing"], workload["top_k"], + eplb["logical_experts"], eplb["physical_experts"], ep_size, + identity.V1_CASE_PROFILE["seed"], + identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"], + ) + expected_eplb = { + "enabled": True, + "planner": identity.V1_CASE_PROFILE["eplb_planner"], + "mapping_sha256": contracts.eplb_contract.mapping_hash(expected_plan), + "logical_experts": eplb["logical_experts"], + "physical_experts": eplb["physical_experts"], + "redundant_experts": identity.V1_CASE_PROFILE["eplb_redundant_experts"], + "reference_tokens_per_rank": identity.V1_CASE_PROFILE[ + "eplb_reference_tokens_per_rank" + ], + "replicated_experts": expected_plan["replicated_experts"], + "max_replicas": expected_plan["max_replicas"], + "imbalance_before": expected_plan["imbalance_before"], + "imbalance_after": expected_plan["imbalance_after"], + } + if eplb != expected_eplb: + raise PublisherError("enabled EPLB descriptor differs from deterministic plan") + elif ( + any(eplb[field] is not None for field in nullable_eplb) + or eplb["physical_experts"] != expected_physical + or eplb["redundant_experts"] != 0 + or eplb["replicated_experts"] != 0 + ): + raise PublisherError("disabled EPLB descriptor claims a plan") + if item["backend"]["id"] == "nccl-ep": + expected_generation = ( + "nccl" if item["system"]["vendor"] == "nvidia" else "rccl" + ) + if item["backend"]["generation"] != expected_generation: + raise PublisherError("NCCL/RCCL reference generation differs from system vendor") + if 
(item["status"] == "decision-grade") != eligibility["decision_grade"]: + raise PublisherError("series status differs from eligibility") + if ( + set(eligibility["allocation_ids"]) != set(item["allocation_ids"]) + or eligibility["correct"] != all(point["correct"] for point in item["points"]) + ): + raise PublisherError("series eligibility differs from its evidence") + selected_attempts = selected_by_series.get(item["series_id"], []) + if ( + set(item["case_ids"]) != {attempt["case_id"] for attempt in selected_attempts} + or set(item["allocation_ids"]) + != {attempt["allocation_id"] for attempt in selected_attempts} + ): + raise PublisherError("series case/allocation catalog differs from selected attempts") + if item["eligibility"]["decision_grade"] and len( + {attempt["run_id"] for attempt in selected_attempts} + ) < REQUIRED_ALLOCATIONS: + raise PublisherError("decision-grade series lacks independent workflow runs") + tokens = [point["tokens_per_rank"] for point in item["points"]] + if tokens != sorted(set(tokens)): + raise PublisherError("series points are not in unique ascending token order") + if len(item["case_ids"]) != 1: + raise PublisherError("public series must represent exactly one v1 case") + case_id = item["case_ids"][0] + if identity.digest("case", _public_case_factors(item)) != case_id: + raise PublisherError("public series projection differs from its case identity") + build = item["build"] + expected_series_id = identity.series_id({ + "backend": backend_id, + "case_id": case_id, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build[ + "implementation_contract_sha256" + ], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": workload["workload_id"], + }) + if item["series_id"] != 
expected_series_id: + raise PublisherError("public series identity differs from its committed factors") + for point in item["points"]: + if point["point_id"] != identity.point_id(series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]): + raise PublisherError("point identity differs") + if point["global_tokens"] != point["tokens_per_rank"] * item["system"]["ep_size"]: + raise PublisherError("global_tokens must use EP size") + routing = point["routing"] + max_fanout = min(item["workload"]["top_k"], item["system"]["ep_size"]) + if ( + routing["routed_copies"] < point["global_tokens"] + or routing["routed_copies"] > point["global_tokens"] * max_fanout + or routing["recv_tokens_max"] > routing["routed_copies"] + or routing["recv_tokens_max"] * item["system"]["ep_size"] + < routing["routed_copies"] + or not math.isclose( + routing["fanout_mean"], + routing["routed_copies"] / point["global_tokens"], + rel_tol=1e-12, + ) + or routing["hotspot_ratio"] < 1 + or routing["empty_expert_count"] >= eplb["physical_experts"] + or routing["empty_rank_count"] >= item["system"]["ep_size"] + ): + raise PublisherError("point routing/load facts are internally inconsistent") + expected_evidence = selected_evidence.get( + (item["series_id"], point["point_id"]), set() + ) + if set(point["evidence_ids"]) != expected_evidence: + raise PublisherError("point evidence differs from selected series attempts") + components = point["components"] + if (components["dispatch"] is None) != (components["combine"] is None): + raise PublisherError("dispatch/combine availability differs") + for name, component in components.items(): + if component is None: + continue + expected_origin = "derived" if name == "isolated_sum" else "measured" + expected_samples = None if name == "isolated_sum" else 512 + if component["origin"] != expected_origin or component["sample_count"] != expected_samples: + raise PublisherError(f"{name} origin or sample count differs") + if name == "isolated_sum" and ( + 
component["logical_bytes"] is not None + or component["logical_payload_rate_gbps_at_latency_percentile"] is not None + ): + raise PublisherError("isolated_sum cannot publish logical bandwidth") + if name != "isolated_sum" and ( + component["logical_bytes"] is None + or component["logical_payload_rate_gbps_at_latency_percentile"] is None + ): + raise PublisherError(f"{name} measured logical bandwidth is missing") + latency = component["latency_us"] + if list(latency.values()) != sorted(latency.values()): + raise PublisherError("latency percentiles are not ordered") + if component["logical_payload_rate_gbps_at_latency_percentile"] is not None: + for statistic, rate in component["logical_payload_rate_gbps_at_latency_percentile"].items(): + expected = component["logical_bytes"] / (latency[statistic] * 1000.0) + if not math.isclose(rate, expected, rel_tol=1e-9, abs_tol=1e-12): + raise PublisherError("logical GB/s formula differs") + if components["roundtrip"] is None or components["roundtrip"]["origin"] != "measured": + raise PublisherError("roundtrip must be measured") + for statistic, throughput in point["roundtrip_token_rate_at_latency_percentile"].items(): + expected = point["global_tokens"] / ( + components["roundtrip"]["latency_us"][statistic] * 1e-6 + ) + if not math.isclose(throughput, expected, rel_tol=1e-9): + raise PublisherError("roundtrip token throughput formula differs") + if components["dispatch"] is not None: + derived = components["isolated_sum"] + if derived is None or any(not math.isclose( + derived["latency_us"][statistic], + components["dispatch"]["latency_us"][statistic] + + components["combine"]["latency_us"][statistic], rel_tol=1e-12 + ) for statistic in ("p50", "p90", "p95", "p99")): + raise PublisherError("isolated_sum is not the component percentile sum") + elif components["isolated_sum"] is not None: + raise PublisherError("isolated_sum requires measured dispatch/combine components") + cohorts = {item["cohort_id"]: item for item in 
doc["cohorts"]} + if len(cohorts) != len(doc["cohorts"]): + raise PublisherError("dataset has duplicate cohort IDs") + for item in doc["cohorts"]: + if not set(item["series_ids"]).issubset(series): + raise PublisherError("cohort references unknown series") + members = [series[series_id] for series_id in item["series_ids"]] + expected_tier = ( + "comparable-experimental" + if any(member["publication_tier"] == "comparable-experimental" for member in members) + else "official" + ) + if item["publication_tier"] != expected_tier: + raise PublisherError("cohort publication tier differs from its members") + roles = {member["backend"]["role"] for member in members} + if item["kind"] == "library" and roles != {"library"}: + raise PublisherError("library cohort contains non-library evidence") + if item["kind"] == "system" and roles != {"reference"}: + raise PublisherError("system cohort is not a portable reference comparison") + if item["kind"] in {"chip", "routing"} and len( + {_canonical(member["backend"]) for member in members} + ) != 1: + raise PublisherError(f"{item['kind']} cohort mixes backend implementations") + public_factors = [_public_cohort_factors(item["kind"], member) for member in members] + if len({_canonical(value[0]) for value in public_factors}) != 1: + raise PublisherError(f"{item['kind']} cohort does not control its public factors") + if len({_canonical(value[1]) for value in public_factors}) < 2: + raise PublisherError(f"{item['kind']} cohort does not vary its declared contrast") + if item["kind"] == "routing": + if item["publication_tier"] != "comparable-experimental": + raise PublisherError("routing cohort must be experimental") + has_baseline = sum( + member["workload"]["routing"] == "uniform" + and not member["workload"]["eplb"] + for member in members + ) == 1 + missing_reason = "missing-uniform-baseline" in item["eligibility"]["reasons"] + if has_baseline == missing_reason: + raise PublisherError("routing baseline and eligibility reason disagree") 
+ mismatch = _routing_implementation_mismatch(members) + mismatch_reason = "implementation-config-mismatch" in item["eligibility"]["reasons"] + if mismatch != mismatch_reason: + raise PublisherError("routing implementation control and eligibility disagree") + expected_id = _derived_id("cxcohort-v1-", { + "kind": item["kind"], "series_ids": item["series_ids"], + "controlled_factors": item["controlled_factors"], + "varying_factors": item["varying_factors"], + }) + if item["cohort_id"] != expected_id: + raise PublisherError("cohort ID differs from its public factors") + expected_factors = { + "library": ( + ["system", "workload", "phase", "measurement", "resource.mode", "source"], + ["backend", "resource"], + ), + "chip": ( + ["backend", "source", "workload", "phase", "measurement", "resource.mode"], + ["system", "resource"], + ), + "system": ( + ["workload", "phase", "measurement", "source"], + ["system", "backend", "resource"], + ), + "routing": ( + ["backend", "implementation-static-build", "system", "model-shape", "phase", "measurement", "resource"], + ["workload.routing", "workload.eplb", "implementation-config"], + ), + }[item["kind"]] + member_allocations = { + allocation for series_id in item["series_ids"] + for allocation in series[series_id]["allocation_ids"] + } + if ( + (item["controlled_factors"], item["varying_factors"]) != expected_factors + or set(item["eligibility"]["allocation_ids"]) != member_allocations + ): + raise PublisherError("cohort factors or allocations differ from its members") + _eligibility(item["eligibility"], f"cohort {item['cohort_id']}") + expected_ranking_keys: set[tuple[str, str, str, int]] = set() + for cohort in doc["cohorts"]: + if not cohort["eligibility"]["decision_grade"]: + continue + members = [series[series_id] for series_id in cohort["series_ids"]] + tokens = set.intersection(*( + {point["tokens_per_rank"] for point in member["points"]} + for member in members + )) + expected_ranking_keys.update( + (cohort["cohort_id"], 
measure, statistic, token) + for token in tokens + for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile") + for statistic in ("p50", "p99") + ) + ranking_top: dict[tuple[str, str, str, int], dict[str, Any]] = {} + ranking_ids: set[str] = set() + for ranking in doc["rankings"]: + cohort = cohorts.get(ranking["cohort_id"]) + if ( + cohort is None + or not cohort["eligibility"]["decision_grade"] + or ranking["eligibility"] != cohort["eligibility"] + or ranking["publication_tier"] != cohort["publication_tier"] + ): + raise PublisherError("ranking references an ineligible cohort") + entries = ranking["entries"] + _validate_metric(ranking["metric"]) + if cohort["kind"] == "library" and any( + series[series_id]["backend"]["role"] == "reference" + for series_id in cohort["series_ids"] + ): + raise PublisherError("reference evidence cannot drive a library ranking") + if {entry["series_id"] for entry in entries} != set(cohort["series_ids"]): + raise PublisherError("ranking does not cover its cohort") + for entry in entries: + point_id, value, unit = _metric_value(series[entry["series_id"]], ranking["metric"]) + if entry["point_id"] != point_id or entry["unit"] != unit or not math.isclose(entry["value"], value, rel_tol=1e-12): + raise PublisherError("ranking entry differs from series data") + reverse = ranking["metric"]["objective"] == "max" + expected = sorted(entries, key=lambda entry: (entry["value"], entry["series_id"]), reverse=reverse) + if entries != expected or [entry["rank"] for entry in entries] != list(range(1, len(entries) + 1)): + raise PublisherError("ranking order differs") + metric = ranking["metric"] + expected_id = _derived_id("cxranking-v1-", { + "cohort_id": ranking["cohort_id"], "metric": metric, + }) + if ranking["ranking_id"] != expected_id or expected_id in ranking_ids: + raise PublisherError("ranking ID is duplicate or differs") + ranking_ids.add(expected_id) + ranking_top[(ranking["cohort_id"], metric["measure"], 
metric["statistic"], metric["tokens_per_rank"])] = entries[0] + if set(ranking_top) != expected_ranking_keys: + raise PublisherError("rankings do not cover every eligible cohort metric") + objective = { + "min-p50-latency": ("latency_us", "p50"), "min-p99-latency": ("latency_us", "p99"), + "max-payload-rate-at-p50-latency": ( + "logical_payload_rate_gbps_at_latency_percentile", "p50" + ), + "max-payload-rate-at-p99-latency": ( + "logical_payload_rate_gbps_at_latency_percentile", "p99" + ), + } + recommendation_ids: set[str] = set() + for item in doc["recommendations"]: + measure, statistic = objective[item["objective"]] + candidates = [top for key, top in ranking_top.items() + if key[:3] == (item["cohort_id"], measure, statistic) and top["point_id"] == item["point_id"]] + if len(candidates) != 1 or any(item[field] != candidates[0][field] for field in ("series_id", "point_id", "value", "unit")): + raise PublisherError("recommendation is not a ranking winner") + matching_ranking = next( + ranking for ranking in doc["rankings"] + if ranking["cohort_id"] == item["cohort_id"] + and ranking["metric"]["measure"] == measure + and ranking["metric"]["statistic"] == statistic + and ranking["entries"][0]["point_id"] == item["point_id"] + ) + expected_id = _derived_id("cxrecommendation-v1-", { + "objective": item["objective"], "ranking_id": matching_ranking["ranking_id"], + }) + cohort = cohorts[item["cohort_id"]] + if (item["recommendation_id"] != expected_id or expected_id in recommendation_ids + or cohort["publication_tier"] != "official" + or item["publication_tier"] != "official" + or item["eligibility"] != cohort["eligibility"]): + raise PublisherError("recommendation ID/eligibility differs") + recommendation_ids.add(expected_id) + expected_recommendations = sum( + cohorts[ranking["cohort_id"]]["publication_tier"] == "official" + for ranking in doc["rankings"] + ) + if len(doc["recommendations"]) != expected_recommendations: + raise PublisherError("recommendations do not 
cover every actionable ranking") + sensitivity_ids: set[str] = set() + sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set() + for item in doc["sensitivities"]: + cohort = cohorts.get(item["cohort_id"]) + if ( + cohort is None + or cohort["kind"] != "routing" + or not cohort["eligibility"]["decision_grade"] + or item["publication_tier"] != cohort["publication_tier"] + or item["eligibility"] != cohort["eligibility"] + ): + raise PublisherError("sensitivity references a non-routing cohort") + if ( + item["baseline_series_id"] == item["candidate_series_id"] + or not {item["baseline_series_id"], item["candidate_series_id"]}.issubset(cohort["series_ids"]) + ): + raise PublisherError("sensitivity series differ from its routing cohort") + _validate_metric(item["metric"]) + baseline_series = series[item["baseline_series_id"]] + if ( + baseline_series["workload"]["routing"] != "uniform" + or baseline_series["workload"]["eplb"] + ): + raise PublisherError("sensitivity baseline is not uniform without EPLB") + _, baseline, _ = _metric_value(series[item["baseline_series_id"]], item["metric"]) + _, candidate, _ = _metric_value(series[item["candidate_series_id"]], item["metric"]) + if not math.isclose(item["signed_change_ratio"], (candidate - baseline) / baseline, rel_tol=1e-12): + raise PublisherError("sensitivity ratio differs") + expected_id = _derived_id("cxsensitivity-v1-", { + "baseline": item["baseline_series_id"], + "candidate": item["candidate_series_id"], + "cohort": item["cohort_id"], "metric": item["metric"], + }) + if item["sensitivity_id"] != expected_id or expected_id in sensitivity_ids: + raise PublisherError("sensitivity ID is duplicate or differs") + sensitivity_ids.add(expected_id) + sensitivity_keys.add(( + item["cohort_id"], item["baseline_series_id"], item["candidate_series_id"], + item["metric"]["measure"], item["metric"]["statistic"], + item["metric"]["tokens_per_rank"], + )) + expected_sensitivity_keys: set[tuple[str, str, str, str, str, 
int]] = set() + for cohort in doc["cohorts"]: + if cohort["kind"] != "routing" or not cohort["eligibility"]["decision_grade"]: + continue + members = [series[series_id] for series_id in cohort["series_ids"]] + baseline = next(( + member for member in members + if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"] + ), None) + if baseline is None: + continue + tokens = set.intersection(*( + {point["tokens_per_rank"] for point in member["points"]} + for member in members + )) + expected_sensitivity_keys.update( + (cohort["cohort_id"], baseline["series_id"], candidate["series_id"], + measure, statistic, token) + for candidate in members if candidate is not baseline + for token in tokens + for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile") + for statistic in ("p50", "p99") + ) + if sensitivity_keys != expected_sensitivity_keys: + raise PublisherError("sensitivities do not cover every routing contrast metric") + if promotion["status"] == "promoted": + run_ids = {item["run_id"] for item in doc["attempts"] if item["selected"]} + repeated_cases = all( + len({ + attempts[attempt_id]["run_id"] + for attempt_id in coverage["attempt_ids"] + if attempts[attempt_id]["selected"] + }) == REQUIRED_ALLOCATIONS + for coverage in doc["coverage"] + ) + if promotion["matrix_id"] != CANONICAL_FULL_V1_MATRIX_SHA256: + raise PublisherError("promotion requires the canonical full-v1 matrix") + if ( + _case_disposition_catalog_sha256(doc["coverage"]) + != CANONICAL_FULL_V1_CASE_CATALOG_SHA256 + ): + raise PublisherError("promotion requires the canonical case/disposition catalog") + if ( + terminal != len(doc["coverage"]) + or len(doc["source_bundle_ids"]) != REQUIRED_ALLOCATIONS + or len(run_ids) != REQUIRED_ALLOCATIONS + or not repeated_cases + ): + raise PublisherError("promoted dataset lacks complete coverage") + expected_outcomes = { + item["case_id"]: ( + "success" if item["disposition"] == "runnable" else "unsupported" + ) + for 
item in doc["coverage"] + } + if any( + item["selected"] + and item["outcome"] != expected_outcomes[item["case_id"]] + for item in doc["attempts"] + ): + raise PublisherError("promoted outcomes differ from requested dispositions") + runnable_cases = { + item["case_id"] for item in doc["coverage"] + if item["disposition"] == "runnable" + } + if any( + item["case_id"] in runnable_cases and item["outcome"] != "success" + for item in doc["attempts"] + ): + raise PublisherError( + "promotion rejects runnable cases with failed, invalid, or diagnostic retries" + ) + _require_promotion_series(doc["series"]) + _require_promotion_cohorts(doc["cohorts"], doc["series"]) + if not doc["rankings"] or not doc["recommendations"]: + raise PublisherError("promoted dataset lacks eligible decisions") + if promotion["status"] == "quarantined" and any(( + doc["source_bundle_ids"], promotion["allocation_ids"], doc["coverage"], + doc["attempts"], doc["series"], doc["cohorts"], doc["rankings"], + doc["recommendations"], doc["sensitivities"], + )): + raise PublisherError("quarantined dataset exposes unvalidated evidence") + return doc + + +def _file_record(value: Any, path: str) -> dict[str, Any]: + item = _exact(value, {"path", "sha256", "bytes"}, path) + if not isinstance(item["path"], str) or PurePosixPath(item["path"]).is_absolute() or ".." 
in PurePosixPath(item["path"]).parts: + raise PublisherError(f"{path}.path is unsafe") + if not isinstance(item["sha256"], str) or HEX64.fullmatch(item["sha256"]) is None: + raise PublisherError(f"{path}.sha256 is invalid") + _integer(item["bytes"], f"{path}.bytes", minimum=1) + return item + +def validate_bundle_manifest(doc: Any) -> dict[str, Any]: + _schema("private-bundle-v1.schema.json", doc) + attempts = {item["attempt_id"]: item for item in doc["attempts"]} + if len(attempts) != len(doc["attempts"]): + raise PublisherError("bundle has duplicate attempt IDs") + selections = doc["coverage"]["selections"] + if len({item["case_id"] for item in selections}) != len(selections): + raise PublisherError("bundle has duplicate selected cases") + counts = {name: 0 for name in OUTCOMES} + for selection in selections: + attempt = attempts.get(selection["selected_attempt_id"]) + if attempt is None or not attempt["selected"] or attempt["case_id"] != selection["case_id"] or attempt["outcome"] != selection["outcome"]: + raise PublisherError("bundle selection differs from retained attempt") + counts[selection["outcome"]] += 1 + coverage = doc["coverage"] + if coverage["terminal_cases"] != len(selections) or coverage["outcome_counts"] != counts: + raise PublisherError("bundle terminal counts differ") + if coverage["complete"] != (coverage["expected_cases"] == len(selections)): + raise PublisherError("bundle completeness differs from coverage") + fingerprints: dict[str, set[str]] = {} + for attempt in doc["attempts"]: + value = attempt["runtime_fingerprint_sha256"] + if value: + fingerprints.setdefault(attempt["allocation_id"], set()).add(value) + if any(len(values) != 1 for values in fingerprints.values()): + raise PublisherError("bundle runtime is heterogeneous within an allocation") + return doc + + +def _fsync_dir(path: Path) -> None: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)) + try: + os.fsync(descriptor) + finally: + os.close(descriptor) + + 
+def _write_bytes(path: Path, data: bytes, *, mode: int) -> None: + descriptor = os.open( + path, + os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0), + mode, + ) + try: + os.fchmod(descriptor, mode) + with os.fdopen(descriptor, "wb", closefd=False) as handle: + handle.write(data) + handle.flush() + os.fsync(handle.fileno()) + finally: + os.close(descriptor) + + +def _write_all(descriptor: int, data: bytes) -> None: + view = memoryview(data) + while view: + view = view[os.write(descriptor, view):] + + +def _write_json(path: Path, value: Any, *, mode: int) -> bytes: + data = _canonical(value) + b"\n" + _write_bytes(path, data, mode=mode) + return data + + +def _file_metadata(path: Path, relative_to: Path) -> dict[str, Any]: + return { + "path": path.relative_to(relative_to).as_posix(), + "sha256": _sha_file(path), + "bytes": path.stat().st_size, + } + + +def _tree_files(root: Path) -> list[Path]: + return sorted( + path for path in root.rglob("*") + if path.is_file() and not path.is_symlink() and path.name != "COMPLETE" + ) + + +def _verify_regular_file(path: Path, expected_mode: int) -> None: + _reject_symlinked_path(path.parent) + try: + metadata = os.lstat(path) + except FileNotFoundError as exc: + raise PublisherError(f"required file is missing: {path.name}") from exc + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != expected_mode + ): + raise PublisherError( + f"file is not an owned regular {expected_mode:o} object: {path.name}" + ) + + +def _verify_frozen_tree(root: Path, *, private: bool) -> None: + _reject_symlinked_path(root) + directory_mode = 0o500 if private else 0o555 + file_mode = 0o400 if private else 0o444 + try: + root_metadata = os.lstat(root) + except OSError as exc: + raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc + if not stat.S_ISDIR(root_metadata.st_mode): + raise PublisherError(f"immutable object is not a real directory: 
{root.name}") + try: + entries = [root, *root.rglob("*")] + except OSError as exc: + raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc + for path in entries: + metadata = os.lstat(path) + if metadata.st_uid != os.getuid(): + raise PublisherError(f"immutable object has the wrong owner: {path.name}") + if stat.S_ISDIR(metadata.st_mode): + expected = directory_mode + elif stat.S_ISREG(metadata.st_mode): + expected = file_mode + else: + raise PublisherError(f"immutable object contains a linked or special entry: {path.name}") + if stat.S_IMODE(metadata.st_mode) != expected: + raise PublisherError( + f"immutable object mode differs for {path.name}: expected {expected:o}" + ) + + +def _freeze_tree(root: Path, *, private: bool) -> None: + files: list[Path] = [] + directories = [root] + for path in root.rglob("*"): + metadata = os.lstat(path) + if stat.S_ISDIR(metadata.st_mode): + directories.append(path) + elif stat.S_ISREG(metadata.st_mode): + files.append(path) + else: + raise PublisherError(f"immutable object contains a linked or special entry: {path.name}") + for path in files: + os.chmod(path, 0o400 if private else 0o444) + for path in sorted(directories, key=lambda item: len(item.parts), reverse=True): + os.chmod(path, 0o500 if private else 0o555) + _fsync_dir(path) + _verify_frozen_tree(root, private=private) + + +def _reject_symlinked_path(path: Path) -> None: + current = Path(path.anchor) + for part in path.parts[1:]: + current /= part + try: + metadata = os.lstat(current) + except FileNotFoundError: + break + if stat.S_ISLNK(metadata.st_mode): + raise PublisherError("COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent") + if not stat.S_ISDIR(metadata.st_mode): + raise PublisherError(f"store path component is not a directory: {current}") + + +class Store: + """Atomic private/public directory operations on one operator filesystem.""" + + def __init__(self, root: str | os.PathLike[str]): + candidate = 
Path(os.path.abspath(os.path.expanduser(root))) + _reject_symlinked_path(candidate) + candidate.mkdir(parents=True, exist_ok=True, mode=0o750) + resolved = candidate.resolve() + if candidate != resolved: + raise PublisherError( + "COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent" + ) + root_metadata = candidate.stat() + if root_metadata.st_uid != os.getuid() or stat.S_IMODE(root_metadata.st_mode) & 0o022: + raise PublisherError( + "COLLECTIVEX_STORE_ROOT must be owned by this user and not group/world writable" + ) + os.chmod(candidate, 0o750) + if stat.S_IMODE(candidate.stat().st_mode) != 0o750: + raise PublisherError("COLLECTIVEX_STORE_ROOT mode must be 750") + self.root = resolved + raw = self.root + self.private = raw / "private" + self.incoming = self.private / "incoming" + self.bundles = self.private / "bundles" + self.quarantine = self.private / "quarantine" + self.public = raw / "public" + self.datasets = self.public / "datasets" + self.channels = self.public / "channels" + self.locks = raw / "locks" + for path, mode in ( + (self.private, 0o700), (self.incoming, 0o700), (self.bundles, 0o700), + (self.quarantine, 0o700), (self.public, 0o755), (self.datasets, 0o755), + (self.channels, 0o755), (self.locks, 0o700), + ): + path.mkdir(parents=True, exist_ok=True, mode=mode) + if path.is_symlink() or not path.is_dir(): + raise PublisherError(f"store path is not a real directory: {path}") + os.chmod(path, mode) + + @contextlib.contextmanager + def locked(self) -> Iterator[None]: + lock_path = self.locks / "publisher.lock" + descriptor = os.open( + lock_path, + os.O_RDWR | os.O_CREAT | getattr(os, "O_NOFOLLOW", 0), + 0o600, + ) + try: + os.fchmod(descriptor, 0o600) + metadata = os.fstat(descriptor) + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != 0o600 + ): + raise PublisherError("publisher lock is not an owned regular 600 file") + fcntl.flock(descriptor, fcntl.LOCK_EX) + yield + 
finally: + fcntl.flock(descriptor, fcntl.LOCK_UN) + os.close(descriptor) + + @contextlib.contextmanager + def staging(self, parent: Path, *, private: bool) -> Iterator[Path]: + stage = Path(tempfile.mkdtemp(prefix=".staging-", dir=parent)) + os.chmod(stage, 0o700 if private else 0o755) + try: + yield stage + finally: + if stage.exists(): + for path in stage.rglob("*"): + metadata = os.lstat(path) + if stat.S_ISDIR(metadata.st_mode): + os.chmod(path, 0o700) + elif stat.S_ISREG(metadata.st_mode): + os.chmod(path, 0o600) + os.chmod(stage, 0o700) + shutil.rmtree(stage, ignore_errors=True) + + @staticmethod + def complete(stage: Path, value: str, *, private: bool) -> None: + _write_bytes(stage / "COMPLETE", (value + "\n").encode(), mode=0o600 if private else 0o644) + _fsync_dir(stage) + + @staticmethod + def install(stage: Path, destination: Path, *, private: bool) -> None: + if destination.is_symlink(): + raise PublisherError(f"immutable destination is a symlink: {destination.name}") + if destination.exists(): + _verify_frozen_tree(destination, private=private) + marker = destination / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != destination.name: + raise PublisherError(f"immutable destination is incomplete: {destination.name}") + return + _freeze_tree(stage, private=private) + os.rename(stage, destination) + _fsync_dir(destination.parent) + _verify_frozen_tree(destination, private=private) + + def install_dataset(self, dataset: dict[str, Any]) -> tuple[str, int]: + validate_public_dataset(dataset) + payload = _canonical(dataset) + b"\n" + if len(payload) > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("public dataset exceeds the serving size limit") + digest = _sha_bytes(payload) + destination = self.datasets / digest + with self.staging(self.datasets, private=False) as stage: + _write_bytes(stage / "dataset.json", payload, mode=0o644) + self.complete(stage, digest, private=False) + self.install(stage, destination, private=False) + stored = 
destination / "dataset.json" + marker = destination / "COMPLETE" + if (not marker.is_file() or marker.read_text().strip() != digest + or _sha_file(stored) != digest or stored.stat().st_size != len(payload)): + raise PublisherError("stored dataset checksum differs after installation") + return digest, len(payload) + + def update_channel(self, channel: str, digest: str, size: int, generated_at: str) -> None: + if size > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("channel dataset exceeds the serving size limit") + _verify_frozen_tree(self.datasets / digest, private=False) + marker = self.datasets / digest / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != digest: + raise PublisherError("cannot advance a channel to an incomplete dataset") + dataset_path = self.datasets / digest / "dataset.json" + dataset = validate_public_dataset(strict_load(dataset_path)) + if ( + _sha_file(dataset_path) != digest + or dataset_path.stat().st_size != size + or dataset["generated_at"] != generated_at + ): + raise PublisherError("channel metadata differs from its stored dataset") + if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted": + raise PublisherError("dev-latest may only reference a promoted dataset") + pointer = { + "format": FORMAT_CHANNEL, + "channel": channel, + "dataset": { + "path": f"datasets/{digest}/dataset.json", + "sha256": digest, + "bytes": size, + }, + "generated_at": generated_at, + } + validate_channel(pointer, expected_channel=channel) + destination = self.channels / f"{channel}.json" + temporary = self.channels / f".{channel}.tmp-{os.getpid()}" + try: + data = _canonical(pointer) + b"\n" + _write_bytes(temporary, data, mode=0o644) + os.replace(temporary, destination) + _fsync_dir(self.channels) + finally: + temporary.unlink(missing_ok=True) + + def verify_channel(self, channel: str) -> dict[str, Any]: + channel_path = self.channels / f"{channel}.json" + _verify_regular_file(channel_path, 0o644) + pointer = 
validate_channel(strict_load(channel_path), expected_channel=channel) + target = self.public / pointer["dataset"]["path"] + _verify_frozen_tree(target.parent, private=False) + if target.stat().st_size != pointer["dataset"]["bytes"] or _sha_file(target) != pointer["dataset"]["sha256"]: + raise PublisherError(f"channel {channel} dataset checksum differs") + marker = target.parent / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != pointer["dataset"]["sha256"]: + raise PublisherError(f"channel {channel} dataset is incomplete") + dataset = validate_public_dataset(strict_load(target)) + if pointer["generated_at"] != dataset["generated_at"]: + raise PublisherError(f"channel {channel} metadata differs from its dataset") + if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted": + raise PublisherError("dev-latest points to a non-promoted dataset") + return pointer + + +def _copy_source(source: Path, destination: Path) -> None: + if source.is_symlink() or not source.is_file() or not stat.S_ISREG(source.stat().st_mode): + raise PublisherError(f"source must be a regular non-symlink file: {source}") + descriptor = os.open(source, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + output = os.open(destination, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + try: + while True: + chunk = os.read(descriptor, 1024 * 1024) + if not chunk: + break + _write_all(output, chunk) + os.fsync(output) + finally: + os.close(output) + finally: + os.close(descriptor) + + +def _archive_download_directory(source: Path, destination: Path) -> None: + if source.is_symlink() or not source.is_dir(): + raise PublisherError(f"artifact directory is invalid: {source}") + files: list[Path] = [] + for path in source.rglob("*"): + if path.is_symlink(): + raise PublisherError("artifact directory contains a symlink") + if path.is_dir(): + continue + if not path.is_file(): + raise PublisherError("artifact directory contains a non-regular entry") + files.append(path) + 
files.sort() + if not files or len(files) > MAX_ARCHIVE_MEMBERS: + raise PublisherError("artifact directory has an invalid file count") + total = 0 + with zipfile.ZipFile(destination, "x", compression=zipfile.ZIP_STORED) as archive: + for path in files: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + with os.fdopen(descriptor, "rb") as handle: + metadata = os.fstat(handle.fileno()) + if not stat.S_ISREG(metadata.st_mode): + raise PublisherError("artifact directory member changed type") + size = metadata.st_size + total += size + if size > MAX_ARCHIVE_MEMBER_BYTES or total > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact directory exceeds size limits") + relative = path.relative_to(source).as_posix() + _safe_member(relative) + info = zipfile.ZipInfo(relative, date_time=(1980, 1, 1, 0, 0, 0)) + info.compress_type = zipfile.ZIP_STORED + info.external_attr = (stat.S_IFREG | 0o600) << 16 + with archive.open(info, "w") as output: + written = 0 + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + output.write(chunk) + written += len(chunk) + if written != size: + raise PublisherError("artifact directory member changed size") + descriptor = os.open(destination, os.O_RDONLY) + try: + os.fsync(descriptor) + finally: + os.close(descriptor) + + +def _artifact_name(source: Path) -> str: + name = source.name if source.is_dir() else source.name.removesuffix(".zip") + if ( + not source.is_dir() and source.suffix != ".zip" + or ARTIFACT_NAME.fullmatch(name) is None + ): + raise PublisherError(f"artifact source has an invalid GHA name: {source.name}") + return name + + +def archive_incoming( + store: Store, + matrix: Path, + artifacts: Sequence[Path], + run: dict[str, Any], +) -> tuple[str, Path, list[dict[str, Any]]]: + """Copy exact delivery bytes into immutable incoming before any JSON/ZIP parse.""" + if not artifacts: + raise PublisherError("at least one GitHub artifact archive is required") + with store.staging(store.incoming, 
private=True) as stage: + sources = stage / "sources" + sources.mkdir(mode=0o700) + copied: list[dict[str, Any]] = [] + named_artifacts = sorted( + ((_artifact_name(path), path) for path in artifacts), key=lambda item: item[0] + ) + artifact_names = [name for name, _ in named_artifacts] + if len(artifact_names) != len(set(artifact_names)): + raise PublisherError("artifact delivery contains duplicate GHA names") + inputs = [("matrix.json", matrix, "matrix", None)] + [ + (f"artifact-{index:04d}.zip", path, "artifact", artifact_name) + for index, (artifact_name, path) in enumerate(named_artifacts) + ] + for name, source, kind, artifact_name in inputs: + destination = sources / name + if source.is_dir(): + _archive_download_directory(source, destination) + else: + if source != matrix and source.stat().st_size > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact archive exceeds the size limit") + _copy_source(source, destination) + copied.append({ + **_file_metadata(destination, stage), + "kind": kind, + "artifact_name": artifact_name, + }) + ingest_id = _sha_bytes(_canonical({"run": run, "sources": copied})) + incoming_manifest = { + "format": "collectivex.incoming.v1", + "schema_version": 1, + "ingest_id": ingest_id, + "run": run, + "sources": copied, + } + _write_json(stage / "incoming.json", incoming_manifest, mode=0o600) + store.complete(stage, ingest_id, private=True) + destination = store.incoming / ingest_id + store.install(stage, destination, private=True) + installed = store.incoming / ingest_id + if strict_load(installed / "incoming.json") != incoming_manifest: + raise PublisherError("existing incoming object differs from archived delivery") + for record in copied: + _resolve_bundle_file(installed, record) + return ingest_id, installed, copied + + +def _safe_member(name: str) -> PurePosixPath: + if "\\" in name or "\0" in name: + raise PublisherError("archive member has an unsafe separator") + path = PurePosixPath(name) + if path.is_absolute() or not 
path.parts or any(part in {"", ".", ".."} for part in path.parts): + raise PublisherError("archive member path escapes its artifact") + return path + + +def extract_archive(archive: Path, destination: Path) -> list[Path]: + """Extract a bounded regular-file ZIP without trusting member paths or links.""" + try: + handle = zipfile.ZipFile(archive) + except (OSError, zipfile.BadZipFile) as exc: + raise PublisherError("artifact is not a valid ZIP archive") from exc + extracted: list[Path] = [] + seen: set[str] = set() + total = 0 + with handle: + members = handle.infolist() + if not members or len(members) > MAX_ARCHIVE_MEMBERS: + raise PublisherError("artifact has an invalid member count") + for member in members: + path = _safe_member(member.filename.rstrip("/")) + key = path.as_posix() + if key in seen: + raise PublisherError("artifact contains duplicate member paths") + seen.add(key) + mode = member.external_attr >> 16 + if stat.S_ISLNK(mode) or (mode and not (stat.S_ISREG(mode) or stat.S_ISDIR(mode))): + raise PublisherError("artifact contains a non-regular member") + if member.flag_bits & 0x1: + raise PublisherError("encrypted artifact members are not accepted") + if member.file_size > MAX_ARCHIVE_MEMBER_BYTES: + raise PublisherError("artifact member exceeds the size limit") + total += member.file_size + if total > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact exceeds the expanded size limit") + target = destination.joinpath(*path.parts) + if member.is_dir(): + target.mkdir(parents=True, exist_ok=True, mode=0o700) + continue + target.parent.mkdir(parents=True, exist_ok=True, mode=0o700) + output = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + try: + with handle.open(member, "r") as source: + written = 0 + while True: + chunk = source.read(1024 * 1024) + if not chunk: + break + _write_all(output, chunk) + written += len(chunk) + if written != member.file_size: + raise PublisherError("artifact member size changed during extraction") + 
os.fsync(output) + finally: + os.close(output) + extracted.append(target) + return extracted + + +def validate_matrix(document: Any) -> list[dict[str, Any]]: + try: + artifact_safety.assert_publication_safe([document]) + matrix = sweep_matrix.validate_matrix_document(document) + except (SystemExit, ValueError, artifact_safety.ArtifactSafetyError) as exc: + raise PublisherError(f"requested matrix is invalid: {exc}") from exc + return [ + { + "sku": item["sku"], + **item["case"], + "_disposition": item["disposition"], + "_reason": item["reason"], + } + for item in matrix["requested_cases"] + ] + + +def _expected_deliveries( + matrix: dict[str, Any], cases: Sequence[dict[str, Any]], run: dict[str, Any] +) -> dict[str, tuple[str, str, str]]: + shard_by_case: dict[str, str] = {} + for shard in matrix["include"]: + for case_id in shard["case_ids"]: + if case_id in shard_by_case: + raise PublisherError("requested case appears in two runnable shards") + shard_by_case[case_id] = shard["id"] + suffix = f"{run['run_id']}-{run['run_attempt']}" + deliveries: dict[str, tuple[str, str, str]] = {} + for case in cases: + case_id = case["case_id"] + if case["_disposition"] == "unsupported": + deliveries[case_id] = ( + f"cxunsupported-{suffix}", "setup", + f"{run['run_id']}_{run['run_attempt']}_unsupported", + ) + continue + shard_id = shard_by_case.get(case_id) + if shard_id is None: + raise PublisherError("runnable case has no matrix shard") + deliveries[case_id] = ( + f"cxshard-{shard_id}-{suffix}", "sweep", + f"{run['run_id']}_{run['run_attempt']}_{shard_id}", + ) + return deliveries + + +def _document_git_run(document: dict[str, Any]) -> dict[str, Any] | None: + provenance = document.get("provenance") + if not isinstance(provenance, dict): + return None + value = provenance.get("git_run", provenance) + return value if isinstance(value, dict) else None + + +def _run_matches(document: dict[str, Any], run: dict[str, Any]) -> bool: + git_run = _document_git_run(document) + if 
git_run is None: + return False + return ( + str(git_run.get("run_id")) == run["run_id"] + and str(git_run.get("run_attempt")) == str(run["run_attempt"]) + and git_run.get("source_sha") == run["source_sha"] + and (git_run.get("repo") or git_run.get("repository")) == run["repository"] + ) + + +def _case_matches(document: dict[str, Any], expected: dict[str, Any]) -> bool: + scheduled = { + key: value for key, value in expected.items() + if key not in {"sku", "case_id"} and not key.startswith("_") + } + return document.get("identity", {}).get("case_factors") == { + "case": scheduled, + "profile": identity.V1_CASE_PROFILE, + "sku": expected["sku"], + } + + +def _outcome(document: dict[str, Any]) -> tuple[str, str | None]: + status = document["outcome"]["status"] + if status == "success": + return status, None + native = document["outcome"].get("reason") + reason = native if isinstance(native, str) and REASON.fullmatch(native) else { + "unsupported": "unsupported-capability", "failed": "execution-failed", + "invalid": "validation-failed", "diagnostic": "diagnostic-evidence", + }.get(status) + if reason is None: + raise PublisherError(f"unsupported native outcome {status!r}") + return status, reason + + +def _attempt_record( + document: dict[str, Any], path: Path, root: Path, *, selected: bool +) -> dict[str, Any]: + normalized = contracts.normalize_attempt(document) + runtime = normalized["runtime_fingerprint"] + runtime_sha = _sha_bytes(_canonical(runtime)) if runtime is not None else None + sample_record = None + evidence_ids: list[str] = [] + series_ids: list[str] = [] + if document["format"] == contracts.RAW_FORMAT: + sample_path = path.with_name(document["sample_artifact"]["path"]) + sample_record = _file_metadata(sample_path, root) + evidence_ids = [row["evidence_id"] for row in document["measurement"]["rows"]] + series_ids = [document["identity"]["series_id"]] + declared = document["identity"]["series_factors"]["runtime_fingerprint_sha256"] + if runtime_sha != 
declared: + raise PublisherError("runtime fingerprint checksum differs from series identity") + status, reason = _outcome(document) + return { + "attempt_id": normalized["attempt_id"], + "allocation_id": normalized["allocation_id"], + "case_id": normalized["case_id"], + "outcome": status, + "reason": reason, + "selected": selected, + "document": _file_metadata(path, root), + "samples": sample_record, + "runtime_fingerprint_sha256": runtime_sha, + "series_ids": series_ids, + "evidence_ids": evidence_ids, + } + + +def _validate_delivery_binding( + document: dict[str, Any], path: Path, raw_root: Path, + artifact_by_root: dict[str, str], expected_by_id: dict[str, dict[str, Any]], + expected_deliveries: dict[str, tuple[str, str, str]], run: dict[str, Any], +) -> str: + case_id = document["identity"]["case_id"] + if case_id not in expected_by_id: + raise PublisherError("artifact contains an extra case outcome") + expected = expected_by_id[case_id] + if not _case_matches(document, expected): + raise PublisherError("attempt case coordinates differ from the requested matrix") + unsupported = document["outcome"]["status"] == "unsupported" + if (expected["_disposition"] == "unsupported") != unsupported: + raise PublisherError("terminal outcome differs from requested capability disposition") + if unsupported and document["outcome"]["reason"] != expected["_reason"]: + raise PublisherError("unsupported outcome reason differs from requested matrix") + if not _run_matches(document, run): + raise PublisherError("attempt provenance differs from publisher run metadata") + relative = path.relative_to(raw_root) + if len(relative.parts) < 2: + raise PublisherError("attempt document is outside a delivered artifact") + delivered_name = artifact_by_root.get(relative.parts[0]) + expected_name, expected_job, expected_execution = expected_deliveries[case_id] + git_run = _document_git_run(document) + allocation = document["identity"]["allocation_factors"] + if ( + git_run is None + or 
delivered_name != expected_name + or git_run["artifact"] != delivered_name + or git_run["job"] != expected_job + or allocation["execution_id"] != expected_execution + ): + raise PublisherError("attempt provenance differs from its delivered GHA shard") + return case_id + + +def _parse_extracted(root: Path) -> tuple[list[tuple[Path, dict[str, Any]]], set[Path]]: + attempts: list[tuple[Path, dict[str, Any]]] = [] + consumed_samples: set[Path] = set() + json_paths = sorted(path for path in root.rglob("*.json") if path.is_file()) + for path in json_paths: + if path in consumed_samples: + continue + try: + document = contracts.strict_load(path) + artifact_safety.assert_publication_safe([document]) + format_name = document.get("format") if isinstance(document, dict) else None + if format_name == contracts.SAMPLES_FORMAT: + _schema("samples-v1.schema.json", document) + # It must be claimed by a raw document; orphan checking happens after the scan. + continue + if format_name == contracts.RAW_FORMAT: + _schema("raw-case-v1.schema.json", document) + sample_path = path.with_name(document["sample_artifact"]["path"]) + sample_document = contracts.strict_load(sample_path) + artifact_safety.assert_publication_safe([sample_document]) + _schema("samples-v1.schema.json", sample_document) + validated = contracts.load_raw_attempt(path) + consumed_samples.add(sample_path) + elif format_name == contracts.TERMINAL_FORMAT: + _schema("terminal-outcome-v1.schema.json", document) + validated = contracts.validate_terminal_document(document) + else: + raise PublisherError(f"artifact contains unknown JSON document {path.name}") + except ( + contracts.ContractError, artifact_safety.ArtifactSafetyError, + jsonschema.ValidationError, OSError, + ) as exc: + raise PublisherError(f"native contract rejected {path.name}: {exc}") from exc + attempts.append((path, validated)) + orphan_samples = [ + path for path in json_paths + if isinstance((doc := contracts.strict_load(path)), dict) + and 
doc.get("format") == contracts.SAMPLES_FORMAT + and path not in consumed_samples + ] + if orphan_samples: + raise PublisherError("artifact contains an orphan samples document") + if not attempts: + raise PublisherError("artifact contains zero native attempt documents") + return attempts, consumed_samples + + +def build_bundle( + store: Store, + incoming_id: str, + incoming_path: Path, + run: dict[str, Any], +) -> tuple[str, dict[str, Any], list[dict[str, Any]]]: + """Validate one exact workflow delivery and install its immutable private bundle.""" + incoming_manifest = strict_load(incoming_path / "incoming.json") + _exact( + incoming_manifest, + {"format", "schema_version", "ingest_id", "run", "sources"}, + "incoming", + ) + artifact_safety.assert_publication_safe([incoming_manifest]) + if ( + incoming_manifest["format"] != "collectivex.incoming.v1" + or incoming_manifest["schema_version"] != 1 + or incoming_manifest["ingest_id"] != incoming_id + or incoming_manifest["run"] != run + or _sha_bytes(_canonical({"run": run, "sources": incoming_manifest["sources"]})) + != incoming_id + ): + raise PublisherError("incoming manifest identity differs from archived delivery") + incoming_sources = _array(incoming_manifest["sources"], "incoming.sources", nonempty=True) + for index, record in enumerate(incoming_sources): + _exact( + record, + {"path", "sha256", "bytes", "kind", "artifact_name"}, + f"incoming.sources[{index}]", + ) + _resolve_bundle_file(incoming_path, record) + matrix_records = [record for record in incoming_sources if record["kind"] == "matrix"] + artifact_records = [record for record in incoming_sources if record["kind"] == "artifact"] + if ( + len(matrix_records) != 1 + or matrix_records[0]["artifact_name"] is not None + or not artifact_records + or any(ARTIFACT_NAME.fullmatch(record["artifact_name"] or "") is None + for record in artifact_records) + or len({record["artifact_name"] for record in artifact_records}) != len(artifact_records) + ): + raise 
PublisherError("incoming source catalog is invalid") + matrix_source = _resolve_bundle_file(incoming_path, matrix_records[0]) + matrix_document = strict_load(matrix_source) + expected_cases = validate_matrix(matrix_document) + expected_by_id = {case["case_id"]: case for case in expected_cases} + expected_deliveries = _expected_deliveries(matrix_document, expected_cases, run) + if {record["artifact_name"] for record in artifact_records} != { + delivery[0] for delivery in expected_deliveries.values() + }: + raise PublisherError("incoming artifact archive set differs from requested matrix shards") + with store.staging(store.bundles, private=True) as stage: + source_copy = stage / "source" + raw_root = stage / "raw" + source_copy.mkdir(mode=0o700) + raw_root.mkdir(mode=0o700) + matrix_path = stage / "matrix.json" + _copy_source(matrix_source, matrix_path) + source_records: list[dict[str, Any]] = [] + artifact_by_root: dict[str, str] = {} + for index, source_record in enumerate(artifact_records): + archive = _resolve_bundle_file(incoming_path, source_record) + copied = source_copy / f"artifact-{index:04d}.zip" + _copy_source(archive, copied) + source_records.append({ + **_file_metadata(copied, stage), + "artifact_name": source_record["artifact_name"], + }) + artifact_root = raw_root / f"artifact-{index:04d}" + artifact_root.mkdir(mode=0o700) + artifact_by_root[artifact_root.name] = source_record["artifact_name"] + extract_archive(copied, artifact_root) + parsed, consumed_samples = _parse_extracted(raw_root) + created_at = _latest_timestamp( + [document["generated_at"] for _, document in parsed] + ) + consumed_files = {path for path, _ in parsed} | consumed_samples + extracted_files = { + path for path in raw_root.rglob("*") + if path.is_file() and not path.is_symlink() + } + if consumed_files != extracted_files: + raise PublisherError("artifact contains an unconsumed non-native member") + by_case: dict[str, list[tuple[Path, dict[str, Any]]]] = {} + for path, document in 
parsed: + case_id = _validate_delivery_binding( + document, path, raw_root, artifact_by_root, expected_by_id, + expected_deliveries, run, + ) + by_case.setdefault(case_id, []).append((path, document)) + missing = set(expected_by_id) - set(by_case) + if missing: + raise PublisherError(f"artifact is missing {len(missing)} requested case outcomes") + attempt_records: list[dict[str, Any]] = [] + selections: list[dict[str, Any]] = [] + selected_documents: list[dict[str, Any]] = [] + runtime_hashes: set[str] = set() + outcome_counts = {name: 0 for name in OUTCOMES} + for case_id in sorted(expected_by_id): + case_attempts = by_case[case_id] + ordinals = [document["identity"]["attempt_ordinal"] for _, document in case_attempts] + allocations_for_case = { + document["identity"]["allocation_id"] for _, document in case_attempts + } + if len(allocations_for_case) != 1 or sorted(ordinals) != list( + range(1, len(ordinals) + 1) + ): + raise PublisherError( + "case retries must retain contiguous ordinals in one allocation" + ) + _, selected_document = max( + case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"] + ) + selected_id = selected_document["identity"]["attempt_id"] + selected_documents.append(selected_document) + selected_status, _ = _outcome(selected_document) + selections.append({ + "case_id": case_id, + "selected_attempt_id": selected_id, + "outcome": selected_status, + }) + outcome_counts[selected_status] += 1 + for path, document in sorted( + case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"] + ): + normalized = contracts.normalize_attempt(document) + if document["format"] == contracts.RAW_FORMAT: + sample_path = path.with_name(document["sample_artifact"]["path"]) + if sample_path not in consumed_samples: + raise PublisherError("validated raw attempt lost its samples document") + record = _attempt_record( + document, path, stage, + selected=normalized["attempt_id"] == selected_id, + ) + if record["runtime_fingerprint_sha256"]: + 
runtime_hashes.add(record["runtime_fingerprint_sha256"]) + attempt_records.append(record) + # Every extracted byte is covered; the bundle manifest anchors this checksum catalog. + payload_records = [_file_metadata(path, stage) for path in _tree_files(stage)] + checksum_document = { + "format": "collectivex.checksums.v1", + "files": payload_records, + } + checksum_path = stage / "checksums.json" + _write_json(checksum_path, checksum_document, mode=0o600) + bundle = { + "format": FORMAT_BUNDLE, + "schema_version": 1, + "created_at": created_at, + "ingest_id": incoming_id, + "run": run, + "matrix": _file_metadata(matrix_path, stage), + "sources": source_records, + "attempts": attempt_records, + "coverage": { + "expected_cases": len(expected_cases), + "terminal_cases": len(selections), + "complete": len(selections) == len(expected_cases), + "outcome_counts": outcome_counts, + "selections": selections, + }, + "runtime_fingerprints": sorted(runtime_hashes), + "checksums": _file_metadata(checksum_path, stage), + "validation": { + "policy": PUBLISHER_POLICY, + "passed": True, + "checks": [ + "archive-safety", "checksums", "exact-coverage", "identity", + "native-schema", "privacy", "runtime-homogeneity", "terminal-outcomes", + ], + }, + } + validate_bundle_manifest(bundle) + # Runtime homogeneity is scoped to a realized allocation, not across unlike SKUs. 
+ by_allocation: dict[str, set[str]] = {} + for attempt in attempt_records: + fingerprint = attempt["runtime_fingerprint_sha256"] + if fingerprint: + by_allocation.setdefault(attempt["allocation_id"], set()).add(fingerprint) + if any(len(values) != 1 for values in by_allocation.values()): + raise PublisherError("runtime fingerprint is heterogeneous within an allocation") + bundle_bytes = _canonical(bundle) + b"\n" + bundle_id = _sha_bytes(bundle_bytes) + _write_bytes(stage / "bundle.json", bundle_bytes, mode=0o600) + store.complete(stage, bundle_id, private=True) + store.install(stage, store.bundles / bundle_id, private=True) + installed = load_bundle(store, bundle_id) + if installed["manifest"] != bundle: + raise PublisherError("existing bundle differs from validated manifest") + return bundle_id, bundle, selected_documents + + +def _slug(value: Any, fallback: str = "unknown") -> str: + text = re.sub(r"[^a-z0-9_.-]+", "-", str(value or "").lower()).strip("-.") + return text[:128] if text and SAFE_ID.fullmatch(text[:128]) else fallback + + +def _derived_id(prefix: str, value: Any) -> str: + return f"{prefix}{_sha_bytes(_canonical(value))}" + + +def _git_run(document: dict[str, Any]) -> dict[str, Any]: + return _document_git_run(document) or {} + + +def _public_attempt(document: dict[str, Any], *, selected: bool = False) -> dict[str, Any]: + normalized = contracts.normalize_attempt(document) + run = _git_run(document) + evidence = ( + [{"evidence_id": row["evidence_id"], "point_id": row["point_id"]} + for row in document["measurement"]["rows"]] + if document["format"] == contracts.RAW_FORMAT else [] + ) + status, reason = _outcome(document) + failure_mode = document["outcome"].get("failure_mode") + if not isinstance(failure_mode, str) or REASON.fullmatch(failure_mode) is None: + failure_mode = None if status == "success" else reason + series_id = normalized["series_id"] if status == "success" and selected else None + return { + "attempt_id": 
normalized["attempt_id"], + "evidence": evidence, + "case_id": normalized["case_id"], + "allocation_id": normalized["allocation_id"], + "run_id": str(run["run_id"]), + "run_attempt": int(run["run_attempt"]), + "attempt_index": document["identity"]["attempt_ordinal"], + "selected": selected, + "outcome": status, + "failure_mode": failure_mode, + "reason": reason, + "series_id": series_id, + "completed_at": document["generated_at"], + } + + +def _ratio(values: Sequence[float]) -> float | None: + return max(values) / min(values) if len(values) >= REQUIRED_ALLOCATIONS and min(values) > 0 else None + + +def _eligibility_record( + allocations: Sequence[str], + *, + complete: bool, + correct: bool, + measured: bool, + stable_ordering: bool, + p50_ratio: float | None, + p99_ratio: float | None, + extra_reasons: Sequence[str] = (), +) -> dict[str, Any]: + ids = sorted(set(allocations)) + stable_p50 = p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT + stable_p99 = p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT + reasons = list(extra_reasons) + for condition, reason in ( + (len(ids) >= REQUIRED_ALLOCATIONS, "insufficient-allocations"), + (complete, "incomplete-repeat-coverage"), + (correct, "correctness-failed"), + (measured, "missing-measured-roundtrip-p99"), + (stable_p50, "unstable-p50"), + (stable_p99, "unstable-p99"), + (stable_ordering, "unstable-ordering"), + ): + if not condition: + reasons.append(reason) + reasons = sorted(set(reasons)) + decision = not reasons + return { + "decision_grade": decision, + "allocation_ids": ids, + "complete": complete, + "correct": correct, + "measured_roundtrip_p99": measured, + "stable_p50": stable_p50, + "stable_p99": stable_p99, + "stable_ordering": stable_ordering, + "p50_max_min_ratio": p50_ratio, + "p99_max_min_ratio": p99_ratio, + "reasons": reasons, + } + + +def _aggregate_percentiles(values: Sequence[dict[str, Any]]) -> dict[str, float]: + return { + name: float(statistics.median(float(value[name]) for 
value in values)) + for name in ("p50", "p90", "p95", "p99") + } + + +def _aggregate_component( + rows: Sequence[dict[str, Any]], name: str +) -> dict[str, Any] | None: + components = [row["components"][name] for row in rows] + if all(component["availability"] == "unavailable" for component in components): + return None + if any(component["availability"] == "unavailable" for component in components): + raise PublisherError("component availability differs across repeat allocations") + latency = _aggregate_percentiles([component["percentiles_us"] for component in components]) + if name == "isolated_sum": + return { + "origin": "derived", + "latency_us": latency, + "logical_bytes": None, + "logical_payload_rate_gbps_at_latency_percentile": None, + "sample_count": None, + } + byte_values = {row["logical_bytes"][name] for row in rows} + if len(byte_values) != 1: + raise PublisherError("logical byte accounting differs across repeat allocations") + logical_bytes = byte_values.pop() + rates = {statistic: logical_bytes / (latency[statistic] * 1000.0) for statistic in latency} + return { + "origin": "measured", + "latency_us": latency, + "logical_bytes": logical_bytes, + "logical_payload_rate_gbps_at_latency_percentile": rates, + "sample_count": 512, + } + + +def _exact_repeat_value(values: Sequence[Any], label: str) -> Any: + if not values or len({_canonical(value) for value in values}) != 1: + raise PublisherError(f"{label} differs across repeat allocations") + return values[0] + + +def _eplb_descriptor(document: dict[str, Any]) -> dict[str, Any]: + value = document["case"]["eplb"] + return { + "enabled": value["enabled"], + "planner": value["planner"], + "mapping_sha256": value["mapping_hash"], + "logical_experts": value["num_logical_experts"], + "physical_experts": value["num_physical_experts"], + "redundant_experts": value["num_redundant"], + "reference_tokens_per_rank": value["reference_tokens_per_rank"], + "replicated_experts": value["replicated_experts"], + 
"max_replicas": value["max_replicas"], + "imbalance_before": value["imbalance_before"], + "imbalance_after": value["imbalance_after"], + } + + +def _routing_facts(row: dict[str, Any]) -> dict[str, Any]: + routing = row["routing"] + return { + "fanout_mean": routing["fanout_mean"], + "recv_tokens_max": row["receive"]["max"], + "expert_load_cv": routing["expert_load_cv"], + "payload_rank_cv": routing["payload_rank_cv"], + "hotspot_ratio": routing["hotspot_ratio"], + "empty_expert_count": routing["empty_expert_count"], + "empty_rank_count": routing["empty_rank_count"], + "routed_copies": routing["routed_copies"], + } + + +def _series_extra_reasons(documents: Sequence[dict[str, Any]]) -> list[str]: + reasons: set[str] = set() + for document in documents: + validity = document["outcome"]["validity"] + rows = document["measurement"]["rows"] + if validity.get("provenance_complete") is not True: + reasons.add("incomplete-provenance") + if validity.get("workload_source") != "canonical-serialized": + reasons.add("noncanonical-workload") + if validity.get("anomaly_free") is not True or any(row["anomalies"] for row in rows): + reasons.add("unresolved-anomaly") + if validity.get("semantic_correctness") != "pass": + reasons.add("semantic-correctness-failed") + if validity.get("measurement_conformance") != "conformant" or validity.get("sampling_conformance") != "conformant": + reasons.add("measurement-nonconformant") + scopes = {row["correctness"].get("scope") for row in rows} + if scopes != {"dispatch-metadata-and-transformed-combine"}: + reasons.add("expert-oracle-incomplete") + return sorted(reasons) + + +BACKEND_LABELS = { + "deepep": "DeepEP V1", + "deepep-v2": "DeepEP V2", + "deepep-hybrid": "DeepEP Hybrid", + "uccl": "UCCL", + "mori": "MoRI", + "nccl-ep": "NCCL/RCCL reference", +} + + +def _build_series( + series_id: str, + documents: Sequence[dict[str, Any]], + expected_repeats: int, +) -> tuple[dict[str, Any], dict[str, Any]]: + if not documents: + raise 
PublisherError("cannot aggregate an empty series") + first = documents[0] + if any(document["identity"]["series_id"] != series_id for document in documents): + raise PublisherError("series aggregation mixed identities") + allocations = [document["identity"]["allocation_id"] for document in documents] + if len(allocations) != len(set(allocations)): + raise PublisherError("series repeats reuse an allocation identity") + row_maps = [ + {row["tokens_per_rank"]: row for row in document["measurement"]["rows"]} + for document in documents + ] + token_sets = {tuple(sorted(rows)) for rows in row_maps} + if len(token_sets) != 1: + raise PublisherError("series token coverage differs across allocations") + tokens = list(next(iter(token_sets))) + p50_ratios = [ + _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p50"] for rows in row_maps]) + for token in tokens + ] + p99_ratios = [ + _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p99"] for rows in row_maps]) + for token in tokens + ] + p50_ratio = max((value for value in p50_ratios if value is not None), default=None) + p99_ratio = max((value for value in p99_ratios if value is not None), default=None) + correct = all( + row["correctness"]["passed"] + for document in documents for row in document["measurement"]["rows"] + ) + measured = all( + row["components"]["roundtrip"]["availability"] == "measured" + and row["components"]["roundtrip"]["percentiles_us"].get("p99") is not None + for document in documents for row in document["measurement"]["rows"] + ) + eligibility = _eligibility_record( + allocations, + complete=len(documents) == expected_repeats, + correct=correct, + measured=measured, + # Ordering is defined only across alternatives in a controlled cohort. 
+ stable_ordering=True, + p50_ratio=p50_ratio, + p99_ratio=p99_ratio, + extra_reasons=_series_extra_reasons(documents), + ) + case = first["case"] + shape = case["shape"] + topology = first["topology"] + runtime = first["runtime_fingerprint"] + workload_id = first["workload"]["workload_id"] + if not identity.is_typed_id(workload_id, "workload"): + raise PublisherError("raw workload is not canonical") + backend_id = case["backend"] + resource_raw = first["implementation"]["resource_profile"] + public_config = contracts.public_series_config( + kernel_generation=first["implementation"]["kernel_generation"], + provenance=first["implementation"]["provenance"], + resource_profile=resource_raw, + resource_mode=case["resource_mode"], + device_product=topology["device_product"], + ) + resource_profile = public_config["resource"]["profile"] + configured_units = public_config["resource"]["configured_units"] + units_kind = public_config["resource"]["comm_units_kind"] + resource_label = ( + f"{configured_units} {str(units_kind).upper()}" + if configured_units is not None and units_kind + else resource_profile + ) + eplb = _exact_repeat_value( + [_eplb_descriptor(document) for document in documents], "EPLB descriptor" + ) + points: list[dict[str, Any]] = [] + run_metrics: dict[str, dict[int, dict[str, float]]] = {} + for document, rows in zip(documents, row_maps, strict=True): + run_id = str(_git_run(document)["run_id"]) + if run_id in run_metrics: + raise PublisherError("series has two allocations from one workflow run") + run_metrics[run_id] = {} + for token in tokens: + latency = rows[token]["components"]["roundtrip"]["percentiles_us"] + logical_bytes = rows[token]["logical_bytes"]["roundtrip"] + run_metrics[run_id][token] = { + "latency_us": {statistic: latency[statistic] for statistic in ("p50", "p99")}, + "logical_payload_rate_gbps_at_latency_percentile": { + statistic: logical_bytes / (latency[statistic] * 1000.0) + for statistic in ("p50", "p99") + }, + } + for token in 
tokens: + rows = [row_map[token] for row_map in row_maps] + routing = _exact_repeat_value( + [_routing_facts(row) for row in rows], "routing/load facts" + ) + components = { + name: _aggregate_component(rows, name) + for name in ("dispatch", "combine", "roundtrip") + } + if components["dispatch"] is None: + components["isolated_sum"] = None + else: + latency = { + statistic: components["dispatch"]["latency_us"][statistic] + + components["combine"]["latency_us"][statistic] + for statistic in ("p50", "p90", "p95", "p99") + } + components["isolated_sum"] = { + "origin": "derived", "latency_us": latency, "logical_bytes": None, + "logical_payload_rate_gbps_at_latency_percentile": None, "sample_count": None, + } + points.append({ + "point_id": rows[0]["point_id"], + "tokens_per_rank": token, + "global_tokens": token * case["ep_size"], + "correct": all(row["correctness"]["passed"] for row in rows), + "routing": routing, + "components": components, + "roundtrip_token_rate_at_latency_percentile": { + statistic: (token * case["ep_size"]) + / (components["roundtrip"]["latency_us"][statistic] * 1e-6) + for statistic in ("p50", "p90", "p95", "p99") + }, + "evidence_ids": [row["evidence_id"] for row in rows], + }) + series = { + "series_id": series_id, + "label": ( + f"{case['runner'].upper()} / {BACKEND_LABELS.get(backend_id, backend_id)} / " + f"EP{case['ep_size']} / {case['phase']} / {shape['routing']}" + f"{' + EPLB' if case['eplb']['enabled'] else ''} / {resource_label}" + ), + "status": "decision-grade" if eligibility["decision_grade"] else "diagnostic", + "case_ids": sorted({document["identity"]["case_id"] for document in documents}), + "allocation_ids": sorted(allocations), + "model": _slug(case["workload_name"]), + "suite": _slug(case["suite"]), + "phase": case["phase"], + "publication_tier": case["required_publication"], + "backend": { + "id": _slug(backend_id), + "label": BACKEND_LABELS.get(backend_id, backend_id), + "role": "reference" if backend_id == "nccl-ep" else 
"library", + **public_config["backend"], + }, + "build": { + "implementation_contract_sha256": first["identity"]["series_factors"][ + "implementation_contract_sha256" + ], + "public_config_sha256": first["identity"]["series_factors"][ + "public_config_sha256" + ], + "routing_control_sha256": first["identity"]["series_factors"][ + "routing_control_sha256" + ], + "runtime_fingerprint_sha256": first["identity"]["series_factors"][ + "runtime_fingerprint_sha256" + ], + "image_digest": first["identity"]["series_factors"]["image_digest"], + "source_sha": first["identity"]["series_factors"]["source_sha"], + "squash_sha256": first["identity"]["series_factors"]["squash_sha256"], + }, + "system": { + "sku": _slug(case["runner"]), + "label": public_config["system"]["label"], + "vendor": runtime["vendor"], + "topology_class": _slug(topology["topology_class"]), + "transport": _slug(topology["transport"]), + "world_size": topology["world_size"], + "ep_size": case["ep_size"], + "placement": topology["placement"], + }, + "workload": { + "workload_id": workload_id, + "hidden": shape["hidden"], + "top_k": shape["topk"], + "experts": case["eplb"]["num_logical_experts"], + "routing": shape["routing"], + "eplb": case["eplb"]["enabled"], + "dispatch_dtype": shape["dispatch_dtype"], + "combine_dtype": shape["quant"]["combine_output_dtype"], + "activation_profile": shape["activation_profile"], + }, + "eplb": eplb, + "resource": public_config["resource"], + "measurement": { + "contract": first["measurement"]["contract"], + "sampling_contract": first["measurement"]["sampling"]["contract"], + "iters": first["measurement"]["sampling"]["iterations_per_trial"], + "trials": first["measurement"]["sampling"]["trials"], + "warmups": first["measurement"]["sampling"]["warmup_iterations"], + "samples_per_component": first["measurement"]["sampling"]["samples_per_component"], + "headline_component": "roundtrip", + "headline_percentile": "p99", + }, + "points": points, + "eligibility": eligibility, + } + 
def _resolve_bundle_file(root: Path, record: dict[str, Any]) -> Path:
    """Locate and verify the file a manifest record names inside ``root``.

    ``record["path"]`` is read as a POSIX path relative to ``root``; the file
    must match the record's ``bytes`` and ``sha256`` entries.

    Raises ``PublisherError`` when the path leaves ``root``, is not already in
    resolved form, is a symlink or not a regular file, or fails the size or
    digest comparison.
    """
    candidate = root.joinpath(*PurePosixPath(record["path"]).parts)
    try:
        candidate.relative_to(root)
    except ValueError as exc:
        raise PublisherError("bundle record escapes its directory") from exc

    plain_file = (
        candidate.resolve() == candidate
        and not candidate.is_symlink()
        and candidate.is_file()
    )
    if not plain_file:
        raise PublisherError("bundle record points to a missing or linked file")

    # The digest is computed only once the size already matches.
    intact = (
        candidate.stat().st_size == record["bytes"]
        and _sha_file(candidate) == record["sha256"]
    )
    if not intact:
        raise PublisherError("bundle file checksum differs from its manifest")
    return candidate
"checksums.files[].path") + for record in records: + _resolve_bundle_file(root, record) + expected_paths = { + path.relative_to(root).as_posix() for path in _tree_files(root) + if path.name not in {"bundle.json", "checksums.json"} + } + if {record["path"] for record in records} != expected_paths: + raise PublisherError("bundle checksum catalog does not cover its payload exactly") + artifact_by_root: dict[str, str] = {} + for index, source in enumerate(manifest["sources"]): + _resolve_bundle_file(root, source) + archive_key = f"artifact-{index:04d}" + if source["path"] != f"source/{archive_key}.zip": + raise PublisherError("bundle source catalog order/path differs") + artifact_by_root[archive_key] = source["artifact_name"] + if len(set(artifact_by_root.values())) != len(artifact_by_root): + raise PublisherError("bundle source catalog repeats an artifact name") + matrix_path = _resolve_bundle_file(root, manifest["matrix"]) + matrix_document = strict_load(matrix_path) + cases = validate_matrix(matrix_document) + expected_by_id = {case["case_id"]: case for case in cases} + expected_deliveries = _expected_deliveries( + matrix_document, cases, manifest["run"] + ) + if {item["case_id"] for item in manifest["coverage"]["selections"]} != set(expected_by_id): + raise PublisherError("bundle selected coverage differs from requested matrix") + documents: dict[str, dict[str, Any]] = {} + runtime_fingerprints: set[str] = set() + for attempt in manifest["attempts"]: + document_path = _resolve_bundle_file(root, attempt["document"]) + document = contracts.strict_load(document_path) + artifact_safety.assert_publication_safe([document]) + if document.get("format") == contracts.RAW_FORMAT: + _schema("raw-case-v1.schema.json", document) + sample_path = document_path.with_name(document["sample_artifact"]["path"]) + if attempt["samples"] is None: + raise PublisherError("raw attempt is missing its sample manifest record") + manifest_sample_path = _resolve_bundle_file(root, 
attempt["samples"]) + if manifest_sample_path != sample_path: + raise PublisherError("sample manifest record points to the wrong raw evidence") + sample_document = contracts.strict_load(sample_path) + artifact_safety.assert_publication_safe([sample_document]) + _schema("samples-v1.schema.json", sample_document) + document = contracts.load_raw_attempt(document_path) + else: + if attempt["samples"] is not None: + raise PublisherError("terminal attempt unexpectedly names a sample artifact") + _schema("terminal-outcome-v1.schema.json", document) + document = contracts.validate_terminal_document(document) + _validate_delivery_binding( + document, document_path, root / "raw", artifact_by_root, + expected_by_id, expected_deliveries, manifest["run"], + ) + expected_record = _attempt_record( + document, document_path, root, selected=attempt["selected"] + ) + if expected_record != attempt: + raise PublisherError("bundle attempt record differs from native document") + if attempt["runtime_fingerprint_sha256"]: + runtime_fingerprints.add(attempt["runtime_fingerprint_sha256"]) + documents[attempt["attempt_id"]] = document + if sorted(runtime_fingerprints) != manifest["runtime_fingerprints"]: + raise PublisherError("bundle runtime fingerprint catalog differs from attempts") + selected = { + selection["case_id"]: documents[selection["selected_attempt_id"]] + for selection in manifest["coverage"]["selections"] + } + return { + "id": bundle_id, + "root": root, + "manifest": manifest, + "cases": cases, + "documents": documents, + "selected": selected, + } + + +def _cohort_control( + kind: str, series: dict[str, Any], internal: dict[str, Any] +) -> tuple[dict[str, Any], list[str], list[str], Any]: + binary_build = series["build"] + source = binary_build["source_sha"] + workload = series["workload"] + shape = { + key: workload[key] + for key in ("hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype", "activation_profile") + } + common = { + "model": series["model"], "phase": 
series["phase"], "shape": shape, + "measurement": series["measurement"], "ep_size": series["system"]["ep_size"], + } + if kind == "library": + control = {**common, "system": series["system"], "workload": workload, + "resource_mode": series["resource"]["mode"], "source": source} + return control, ["system", "workload", "phase", "measurement", "resource.mode", "source"], ["backend", "resource"], series["backend"]["id"] + if kind == "chip": + control = {**common, "backend": series["backend"], "source": source, + "workload": workload, "resource_mode": series["resource"]["mode"]} + return control, ["backend", "source", "workload", "phase", "measurement", "resource.mode"], ["system", "resource"], series["system"] + if kind == "system": + control = {**common, "workload": workload, "source": source} + varying = [series["system"]["sku"], series["backend"]["id"], series["resource"]["profile"]] + return control, ["workload", "phase", "measurement", "source"], ["system", "backend", "resource"], varying + if kind == "routing": + control = { + **common, + "backend": series["backend"], + "system": series["system"], + "resource": series["resource"], + "build": _routing_build_control(binary_build), + } + varying = [ + workload["routing"], workload["eplb"], + binary_build["implementation_contract_sha256"], + ] + return ( + control, + ["backend", "implementation-static-build", "system", "model-shape", "phase", "measurement", "resource"], + ["workload.routing", "workload.eplb", "implementation-config"], + varying, + ) + raise PublisherError(f"unknown cohort kind {kind}") + + +def _cohort_ordering( + members: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]], tokens: Sequence[int] +) -> tuple[bool, int]: + run_ids = set.intersection(*( + set(internals[member["series_id"]]["run_metrics"]) for member in members + )) + if len(run_ids) < REQUIRED_ALLOCATIONS: + return False, len(run_ids) + orders: list[tuple[str, str, int, str, tuple[str, ...]]] = [] + for run_id in 
sorted(run_ids): + for token in tokens: + for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile"): + for statistic in ("p50", "p99"): + ordered = tuple( + member["series_id"] + for member in sorted( + members, + key=lambda item: ( + internals[item["series_id"]]["run_metrics"][run_id][token][measure][statistic], + item["series_id"], + ), + reverse=measure == "logical_payload_rate_gbps_at_latency_percentile", + ) + ) + orders.append((measure, statistic, token, run_id, ordered)) + for token in tokens: + for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile"): + for statistic in ("p50", "p99"): + observed = { + entry[4] + for entry in orders + if entry[0] == measure and entry[1] == statistic and entry[2] == token + } + if len(observed) != 1: + return False, len(run_ids) + return True, len(run_ids) + + +def build_decisions( + series: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]] +) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]: + cohorts: list[dict[str, Any]] = [] + for kind in ("library", "chip", "system", "routing"): + groups: dict[bytes, list[tuple[dict[str, Any], Any, list[str], list[str]]]] = {} + for item in series: + if kind == "library" and item["backend"]["role"] != "library": + continue + if kind == "system" and item["backend"]["role"] != "reference": + continue + control, controlled, varying, variant = _cohort_control(kind, item, internals[item["series_id"]]) + groups.setdefault(_canonical(control), []).append((item, variant, controlled, varying)) + for entries in groups.values(): + variants = {_canonical(entry[1]) for entry in entries} + if len(entries) < 2 or len(variants) < 2: + continue + members = sorted((entry[0] for entry in entries), key=lambda item: item["series_id"]) + token_sets = [set(point["tokens_per_rank"] for point in member["points"]) for member in members] + tokens = sorted(set.intersection(*token_sets)) + same_points = 
len({tuple(sorted(values)) for values in token_sets}) == 1 + ordering, aligned_runs = _cohort_ordering(members, internals, tokens) if tokens else (False, 0) + allocations = sorted({value for member in members for value in member["allocation_ids"]}) + p50_ratio = max( + (member["eligibility"]["p50_max_min_ratio"] for member in members + if member["eligibility"]["p50_max_min_ratio"] is not None), default=None + ) + p99_ratio = max( + (member["eligibility"]["p99_max_min_ratio"] for member in members + if member["eligibility"]["p99_max_min_ratio"] is not None), default=None + ) + extra = { + reason for member in members for reason in member["eligibility"]["reasons"] + if reason not in {"unstable-ordering"} + } + if aligned_runs < REQUIRED_ALLOCATIONS: + extra.add("incomplete-aligned-repeats") + if kind == "routing" and sum( + member["workload"]["routing"] == "uniform" + and not member["workload"]["eplb"] + for member in members + ) != 1: + extra.add("missing-uniform-baseline") + if kind == "routing" and { + (member["workload"]["routing"], member["workload"]["eplb"]) + for member in members + } != {("uniform", False), ("zipf", False), ("zipf", True)}: + extra.add("incomplete-routing-anchors") + if kind == "routing" and _routing_implementation_mismatch(members): + extra.add("implementation-config-mismatch") + if not tokens or (kind != "routing" and not same_points): + extra.add("unmatched-token-coverage") + eligibility = _eligibility_record( + allocations, + complete=all(member["eligibility"]["complete"] for member in members) + and bool(tokens) and (kind == "routing" or same_points), + correct=all(member["eligibility"]["correct"] for member in members), + measured=all(member["eligibility"]["measured_roundtrip_p99"] for member in members), + stable_ordering=ordering, + p50_ratio=p50_ratio, + p99_ratio=p99_ratio, + extra_reasons=sorted(extra), + ) + member_ids = [member["series_id"] for member in members] + publication_tier = ( + "comparable-experimental" + if 
any(member["publication_tier"] == "comparable-experimental" for member in members) + else "official" + ) + controlled, varying = entries[0][2], entries[0][3] + cohort_id = _derived_id("cxcohort-v1-", { + "kind": kind, "series_ids": member_ids, + "controlled_factors": controlled, "varying_factors": varying, + }) + kind_label = "Platform" if kind == "chip" else kind.title() + first = members[0] + routing_label = first["workload"]["routing"] + ( + "+EPLB" if first["workload"]["eplb"] else "" + ) + context = { + "library": ( + f"{first['system']['sku'].upper()} EP{first['system']['ep_size']} / " + f"{first['phase']} / {routing_label}" + ), + "chip": ( + f"{first['backend']['label']} EP{first['system']['ep_size']} / " + f"{first['phase']} / {routing_label}" + ), + "system": ( + f"Reference EP{first['system']['ep_size']} / {first['phase']} / " + f"{routing_label}" + ), + "routing": ( + f"{first['system']['sku'].upper()} / {first['backend']['label']} / " + f"EP{first['system']['ep_size']} / {first['phase']}" + ), + }[kind] + cohorts.append({ + "cohort_id": cohort_id, + "kind": kind, + "label": f"{context} / {kind_label} contrast ({len(members)} series)", + "description": ( + "Publisher-controlled NCCL/RCCL system comparison" + if kind == "system" + else f"Publisher-controlled {kind_label.lower()} comparison" + ), + "series_ids": member_ids, + "controlled_factors": controlled, + "varying_factors": varying, + "publication_tier": publication_tier, + "eligibility": eligibility, + }) + cohorts.sort(key=lambda item: item["cohort_id"]) + series_by_id = {item["series_id"]: item for item in series} + rankings: list[dict[str, Any]] = [] + recommendations: list[dict[str, Any]] = [] + sensitivities: list[dict[str, Any]] = [] + for cohort in cohorts: + if not cohort["eligibility"]["decision_grade"]: + continue + members = [series_by_id[series_id] for series_id in cohort["series_ids"]] + tokens = sorted(set.intersection(*( + {point["tokens_per_rank"] for point in member["points"]} for 
member in members + ))) + for token in tokens: + for measure, objective, unit in ( + ("latency_us", "min", "us"), ("logical_payload_rate_gbps_at_latency_percentile", "max", "GB/s") + ): + for statistic in ("p50", "p99"): + metric = { + "operation": "roundtrip", "statistic": statistic, + "measure": measure, "objective": objective, + "tokens_per_rank": token, "phase": members[0]["phase"], + } + entries = [] + for member in members: + point_id, value, observed_unit = _metric_value(member, metric) + if observed_unit != unit: + raise PublisherError("publisher metric unit differs") + entries.append({ + "rank": 0, "series_id": member["series_id"], "point_id": point_id, + "value": value, "unit": unit, + }) + entries.sort(key=lambda item: (item["value"], item["series_id"]), reverse=objective == "max") + for rank, entry in enumerate(entries, 1): + entry["rank"] = rank + ranking_id = _derived_id("cxranking-v1-", { + "cohort_id": cohort["cohort_id"], "metric": metric, + }) + metric_label = _metric_label(measure, statistic) + rankings.append({ + "ranking_id": ranking_id, "cohort_id": cohort["cohort_id"], + "label": f"{cohort['kind'].title()} {metric_label} T={token}", + "metric": metric, "entries": entries, + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + if cohort["publication_tier"] != "official": + continue + objective_name = ( + f"min-{statistic}-latency" + if measure == "latency_us" + else f"max-payload-rate-at-{statistic}-latency" + ) + top = entries[0] + recommendation_id = _derived_id("cxrecommendation-v1-", { + "objective": objective_name, "ranking_id": ranking_id, + }) + recommendations.append({ + "recommendation_id": recommendation_id, + "cohort_id": cohort["cohort_id"], + "label": f"Best {metric_label} at T={token}", + "objective": objective_name, + "series_id": top["series_id"], "point_id": top["point_id"], + "value": top["value"], "unit": top["unit"], + "rationale": "Top stable measured roundtrip result in a 
controlled cohort", + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + if cohort["kind"] == "routing": + baseline = next( + (member for member in members + if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]), + None, + ) + if baseline: + for candidate in members: + if candidate is baseline: + continue + for token in tokens: + for measure, objective in (("latency_us", "min"), ("logical_payload_rate_gbps_at_latency_percentile", "max")): + for statistic in ("p50", "p99"): + metric = { + "operation": "roundtrip", "statistic": statistic, + "measure": measure, "objective": objective, + "tokens_per_rank": token, "phase": baseline["phase"], + } + _, base_value, _ = _metric_value(baseline, metric) + _, candidate_value, _ = _metric_value(candidate, metric) + sensitivity_id = _derived_id("cxsensitivity-v1-", { + "baseline": baseline["series_id"], "candidate": candidate["series_id"], + "cohort": cohort["cohort_id"], "metric": metric, + }) + sensitivities.append({ + "sensitivity_id": sensitivity_id, + "cohort_id": cohort["cohort_id"], + "label": ( + f"Routing sensitivity: " + f"{_metric_label(measure, statistic)} T={token}" + ), + "baseline_series_id": baseline["series_id"], + "candidate_series_id": candidate["series_id"], + "metric": metric, + "signed_change_ratio": (candidate_value - base_value) / base_value, + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + rankings.sort(key=lambda item: item["ranking_id"]) + recommendations.sort(key=lambda item: item["recommendation_id"]) + sensitivities.sort(key=lambda item: item["sensitivity_id"]) + return cohorts, rankings, recommendations, sensitivities + + +def _require_runnable_promotion_success( + bundles: Sequence[dict[str, Any]], cases: dict[str, dict[str, Any]] +) -> None: + for bundle in bundles: + for case_id, case in cases.items(): + if case["_disposition"] != "runnable": + continue + status, _ = 
_outcome(bundle["selected"][case_id]) + if status != "success": + raise PublisherError( + "promotion requires every runnable matrix case to succeed " + "in every selected bundle" + ) + prior_statuses = { + _outcome(document)[0] + for document in bundle["documents"].values() + if document["identity"]["case_id"] == case_id + } + if prior_statuses != {"success"}: + raise PublisherError( + "promotion rejects runnable cases with failed, invalid, or diagnostic retries" + ) + + +def _expected_chip_cohort_count(series: Sequence[dict[str, Any]]) -> int: + groups: dict[bytes, set[bytes]] = {} + for item in series: + control, variant = _public_cohort_factors("chip", item) + groups.setdefault(_canonical(control), set()).add(_canonical(variant)) + return sum(len(variants) >= 2 for variants in groups.values()) + + +def _require_promotion_cohorts( + cohorts: Sequence[dict[str, Any]], series: Sequence[dict[str, Any]] +) -> None: + eligible_kinds = { + cohort["kind"] + for cohort in cohorts + if cohort["eligibility"]["decision_grade"] + } + missing = [kind for kind in REQUIRED_COHORT_KINDS if kind not in eligible_kinds] + if missing: + raise PublisherError( + "promotion lacks decision-grade cohort kinds: " + ", ".join(missing) + ) + for kind, expected in REQUIRED_PROMOTION_COHORT_COUNTS.items(): + members = [cohort for cohort in cohorts if cohort["kind"] == kind] + if len(members) != expected or any( + not cohort["eligibility"]["decision_grade"] for cohort in members + ): + raise PublisherError( + f"promotion requires exactly {expected} decision-grade {kind} cohorts" + ) + + chip_cohorts = [cohort for cohort in cohorts if cohort["kind"] == "chip"] + expected_chips = _expected_chip_cohort_count(series) + if len(chip_cohorts) != expected_chips or any( + not cohort["eligibility"]["decision_grade"] for cohort in chip_cohorts + ): + raise PublisherError( + f"promotion requires all {expected_chips} derived chip cohorts to be decision-grade" + ) + + by_id = {item["series_id"]: item for 
item in series} + anchors = {("uniform", False), ("zipf", False), ("zipf", True)} + for cohort in ( + item for item in cohorts + if item["kind"] == "routing" and item["eligibility"]["decision_grade"] + ): + observed = { + (by_id[series_id]["workload"]["routing"], by_id[series_id]["workload"]["eplb"]): + by_id[series_id] + for series_id in cohort["series_ids"] + } + if len(cohort["series_ids"]) != len(anchors) or set(observed) != anchors: + raise PublisherError( + "promotion routing cohorts require exact uniform, zipf, and zipf+EPLB anchors" + ) + if ( + observed[("uniform", False)]["build"]["implementation_contract_sha256"] + != observed[("zipf", False)]["build"]["implementation_contract_sha256"] + ): + raise PublisherError( + "promotion routing cohorts require identical off-EPLB generated implementation" + ) + + +def _require_promotion_series(series: Sequence[dict[str, Any]]) -> None: + if not series or any(item["status"] != "decision-grade" for item in series): + raise PublisherError("promotion has unstable or incomplete required series") + + +def build_dataset( + store: Store, + bundle_ids: Sequence[str], + *, + promote: bool, +) -> dict[str, Any]: + if not bundle_ids or len(bundle_ids) != len(set(bundle_ids)): + raise PublisherError("dataset requires unique explicit bundle IDs") + loaded = [load_bundle(store, bundle_id) for bundle_id in bundle_ids] + loaded.sort(key=lambda bundle: ( + int(bundle["manifest"]["run"]["run_id"]), + bundle["manifest"]["run"]["run_attempt"], + bundle["id"], + )) + matrix_ids = {bundle["manifest"]["matrix"]["sha256"] for bundle in loaded} + case_sets = [{case["case_id"] for case in bundle["cases"]} for bundle in loaded] + if len(matrix_ids) != 1 or len({tuple(sorted(values)) for values in case_sets}) != 1: + raise PublisherError("dataset bundles do not share one exact requested matrix") + run_ids = [bundle["manifest"]["run"]["run_id"] for bundle in loaded] + if promote and ( + len(loaded) != REQUIRED_ALLOCATIONS + or len(run_ids) != 
len(set(run_ids)) + ): + raise PublisherError("promotion requires three independent complete workflow runs") + if promote and matrix_ids != {CANONICAL_FULL_V1_MATRIX_SHA256}: + raise PublisherError("promotion requires the canonical full-v1 matrix") + cases = {case["case_id"]: case for case in loaded[0]["cases"]} + if promote: + _require_runnable_promotion_success(loaded, cases) + all_documents = [ + document for bundle in loaded for document in bundle["documents"].values() + ] + selected_ids = { + selection["selected_attempt_id"] + for bundle in loaded for selection in bundle["manifest"]["coverage"]["selections"] + } + public_attempts = [ + _public_attempt( + document, selected=document["identity"]["attempt_id"] in selected_ids + ) + for document in all_documents + ] + _unique([attempt["attempt_id"] for attempt in public_attempts], "dataset attempts") + selected_by_case: dict[str, list[dict[str, Any]]] = { + case_id: [bundle["selected"][case_id] for bundle in loaded] + for case_id in sorted(cases) + } + coverage: list[dict[str, Any]] = [] + for case_id, case in sorted(cases.items()): + attempts = sorted( + (attempt for attempt in public_attempts if attempt["case_id"] == case_id), + key=lambda attempt: ( + int(attempt["run_id"]), attempt["run_attempt"], + attempt["attempt_index"], attempt["attempt_id"], + ), + ) + selected = _public_attempt(selected_by_case[case_id][-1], selected=True) + coverage.append({ + "case_id": case_id, + "label": f"{case['sku'].upper()} / {case['backend']} / EP{case['ep']} / {case['phase']} / {case['routing']}", + "required": True, + "sku": _slug(case["sku"]), + "backend": _slug(case["backend"]), + "phase": case["phase"], + "disposition": case["_disposition"], + "selected_attempt_id": selected["attempt_id"], + "outcome": selected["outcome"], + "failure_mode": selected["failure_mode"], + "reason": case["_reason"] if case["_disposition"] == "unsupported" else selected["reason"], + "attempt_ids": [attempt["attempt_id"] for attempt in attempts], 
def _quarantine_dataset(reason: str, generated_at: str) -> dict[str, Any]:
    """Return a validated, empty public dataset marked as quarantined.

    The document carries no bundles, coverage, attempts, or decisions; only
    the promotion record is populated, with status ``"quarantined"`` and the
    supplied machine-readable *reason*. The result is passed through
    ``validate_public_dataset`` before it is returned.
    """
    promotion: dict[str, Any] = {
        "status": "quarantined",
        "reason": reason,
        "matrix_id": None,
        "allocation_ids": [],
        "required_allocations": REQUIRED_ALLOCATIONS,
        "requested_cases": 0,
        "terminal_cases": 0,
        "policy": POLICY,
    }
    document: dict[str, Any] = {
        "format": FORMAT_PUBLIC,
        "schema_version": 1,
        "generated_at": generated_at,
        "source_bundle_ids": [],
        "promotion": promotion,
    }
    # Every result section is present but empty; each gets its own list object.
    for section in (
        "coverage",
        "attempts",
        "series",
        "cohorts",
        "rankings",
        "recommendations",
        "sensitivities",
    ):
        document[section] = []
    validate_public_dataset(document)
    return document
def _run_metadata(args: argparse.Namespace) -> dict[str, Any]:
    """Check the operator-supplied description of a finished GHA run.

    No network access happens here: the caller is expected to have confirmed
    the workflow identity and conclusion against GitHub already, and the
    artifact-internal provenance is later required to match these values.

    Returns a dict with the keys ``repository``, ``run_id``, ``run_attempt``
    and ``source_sha`` (in that order). Raises ``PublisherError`` naming the
    offending flag when a value is malformed.
    """
    repository = args.repository
    run_id = args.run_id
    run_attempt = args.run_attempt
    source_sha = args.source_sha

    # Checks run before any filesystem mutation, in flag order.
    if re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", repository or "") is None:
        raise PublisherError("--repository must be owner/name")
    if re.fullmatch(r"[1-9][0-9]*", run_id or "") is None:
        raise PublisherError("--run-id must be a positive decimal string")
    # Exact type test on purpose: bool is an int subclass and must be rejected.
    attempt_is_positive_int = type(run_attempt) is int and run_attempt >= 1
    if not attempt_is_positive_int:
        raise PublisherError("--run-attempt must be positive")
    if re.fullmatch(r"[0-9a-f]{40}", source_sha or "") is None:
        raise PublisherError("--source-sha must be a 40-character lowercase Git SHA")

    return {
        "repository": repository,
        "run_id": run_id,
        "run_attempt": run_attempt,
        "source_sha": source_sha,
    }
def promote_command(args: argparse.Namespace) -> dict[str, Any]:
    """Publish a promoted dataset built from the explicitly named bundles.

    The bundle IDs are validated with ``promote=True``, the dataset is built
    and installed while the store lock is held, and the ``dev-latest``
    channel is updated and then verified. Returns a summary dict describing
    the promotion.
    """
    channel = "dev-latest"
    requested = _bundle_ids(args.bundle, promote=True)
    store = _store_from_args(args)
    with store.locked():
        promoted = build_dataset(store, requested, promote=True)
        dataset_sha256, byte_count = store.install_dataset(promoted)
        store.update_channel(
            channel, dataset_sha256, byte_count, promoted["generated_at"]
        )
        store.verify_channel(channel)
        summary = {
            "status": "promoted",
            "bundle_ids": requested,
            "dataset_sha256": dataset_sha256,
            "channel": channel,
        }
    return summary
def _parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the publisher.

    A global ``--store-root`` option is followed by one required subcommand
    (stored as ``command``): ``ingest``, ``promote`` or ``verify``.
    """
    root = argparse.ArgumentParser(
        description="CollectiveX isolated filesystem publisher"
    )
    root.add_argument("--store-root", help="defaults to COLLECTIVEX_STORE_ROOT")
    commands = root.add_subparsers(dest="command", required=True)

    # ingest: every flag is mandatory; only the extras differ per flag.
    ingest = commands.add_parser(
        "ingest", help="archive and validate one complete GHA run"
    )
    ingest_flags = (
        ("--matrix", {}),
        ("--artifact", {"action": "append"}),
        ("--repository", {}),
        ("--run-id", {}),
        ("--run-attempt", {"type": int}),
        ("--source-sha", {}),
    )
    for flag, extra in ingest_flags:
        ingest.add_argument(flag, required=True, **extra)

    # promote: one or more explicit bundle IDs.
    promote = commands.add_parser(
        "promote", help="publish explicit independent bundles"
    )
    promote.add_argument("--bundle", action="append", required=True)

    # verify: both options are optional and repeatable.
    verify = commands.add_parser(
        "verify", help="verify immutable targets and pointers"
    )
    verify.add_argument(
        "--channel", action="append", choices=["latest-attempt", "dev-latest"]
    )
    verify.add_argument("--bundle", action="append", default=[])
    return root
+ return 2 + print(json.dumps(result, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt new file mode 100644 index 0000000000..f68f97d83d --- /dev/null +++ b/experimental/CollectiveX/requirements.txt @@ -0,0 +1,8 @@ +# Host-side matrix generation. GPU libraries are supplied by benchmark images. +PyYAML==6.0.2 + +# Canonical workload serialization. +numpy>=1.26,<3 + +# Host-only strict artifact publisher schemas (never imported by GPU execution). +jsonschema==4.25.1 diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh new file mode 100644 index 0000000000..3720afcf07 --- /dev/null +++ b/experimental/CollectiveX/runtime/common.sh @@ -0,0 +1,1686 @@ +# shellcheck shell=bash +# CollectiveX — shared launcher helpers (sourced, not executed). +# +# Cluster-generic scaffolding only (Slurm/container/build/staging); no +# model-serving. Logging goes to stderr so functions can `echo` a single +# result on stdout. + +_CX_COMMON_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CX_SQUASH_FORMAT_VERSION="repro-v1" +CX_SQUASH_SOURCE_DATE_EPOCH=1 +CX_DEEPEP_V2_COMMIT="fa8a9b16898204afd347c663b89e65ef87dc6ce6" # pragma: allowlist secret +CX_DEEPEP_V2_TREE="29809e75c5874e6609dac4804e7b651d5226959f" # pragma: allowlist secret +CX_DEEPEP_V2_FMT_COMMIT="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" # pragma: allowlist secret +CX_DEEPEP_HYBRID_COMMIT="e0a5b1d9848ab3e7b4a67842bf06f067bfac67f8" # pragma: allowlist secret +CX_DEEPEP_HYBRID_TREE="d77aeab7f1bb52b615666fe178d26ced41fae08e" # pragma: allowlist secret +unset COLLECTIVEX_OPERATOR_CONFIG_LOADED COLLECTIVEX_EPHEMERAL_CONFIG_PATH + +cx_log() { printf '[collectivex] %s\n' "$*" >&2; } +cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; } + +# Public failure telemetry is a closed vocabulary. 
# Record the launcher failure stage in CX_FAILSAFE_MODE.
# Only the fixed stage names listed below are accepted; any other value is
# fatal (cx_die), so nothing outside this vocabulary is ever exported.
cx_set_failure_stage() {
  local requested="$1"
  case "$requested" in
    setup | repository-stage | registry-verification | scheduler-allocation \
      | container-import | container-hash | container-launch | backend-setup \
      | execution | artifact-collection)
      export CX_FAILSAFE_MODE="$requested"
      ;;
    *)
      cx_die "invalid launcher failure stage"
      ;;
  esac
}
'nvshmem is unavailable|build-tool installation failed' "$log_path"; then + diagnostic="backend-dependency" + elif grep -aEqi 'revision fetch failed|submodule fetch failed|package installation failed|staged source is invalid|source (pin resolution|seed validation|seed copy|checkout creation|publication validation|existing source validation) failed' "$log_path"; then + diagnostic="backend-source" + elif grep -aEqi 'failed to mount|squashfs|enroot|pyxis|mount.*invalid argument|invalid argument.*mount' "$log_path"; then + diagnostic="container-runtime" + elif grep -aEqi 'backend preparation failed|build (failed|is incomplete)|cache (mount identity )?validation failed|import failed' "$log_path"; then + diagnostic="backend-build" + elif grep -aEqi 'command not found|not found on this runner|git lookup failed' "$log_path"; then + diagnostic="missing-runtime" + elif grep -aEqi 'too many requests|rate.?limit' "$log_path"; then + diagnostic="registry-rate-limit" + elif grep -aEqi 'timed out|operation timeout|wait timeout after|watchdog.*timeout|timeout: sending signal|connection reset|could not resolve|TLS|certificate' "$log_path"; then + diagnostic="network-or-timeout" + elif grep -aEqi 'salloc:|srun:.*(unable to create step|step creation|invalid partition|invalid account)|unable to create step|job allocation' "$log_path"; then + diagnostic="scheduler" + elif grep -aEqi 'SHARD done: [0-9]+/[0-9]+ case\(s\) failed|WARN: .* run failed rc=|completed with invalid semantic evidence' "$log_path"; then + diagnostic="benchmark-case-failure" + elif [ -s "$log_path" ]; then + diagnostic="unclassified" + else + diagnostic="empty-log" + fi + fi + cx_log "ERROR: failure-class=$stage diagnostic=$diagnostic" + return 1 +} + +# Runner-local deployment settings are strict JSON kept outside the checkout. +# Only the selected runner's allowlisted values are exported; the document is +# never sourced or evaluated as shell. 
+cx_load_operator_config() { + [ -n "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" ] \ + && [ "$COLLECTIVEX_OPERATOR_CONFIG_LOADED" = "$$" ] && return 0 + local config_path generated=0 parsed_path config_log key value + unset CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR CX_ENROOT_CACHE_PATH + unset ENROOT_CACHE_PATH + unset CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR CX_MASTER_PORT + config_path="${COLLECTIVEX_OPERATOR_CONFIG:-${XDG_CONFIG_HOME:-${HOME}/.config}/inferencex/collectivex.json}" + if [ -n "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT:-}" ]; then + umask 077 + if [[ "${CX_JOB_ROOT:-}" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + && [ -d "$CX_JOB_ROOT" ] && [ ! -L "$CX_JOB_ROOT" ] \ + && [ "$(stat -c '%u:%a' "$CX_JOB_ROOT" 2>/dev/null)" = "$(id -u):700" ]; then + config_path="$CX_JOB_ROOT/operator-config.json" + (set -C; : > "$config_path") 2>/dev/null \ + || cx_die "cannot create ephemeral runner configuration" + else + config_path="$(mktemp /tmp/inferencex-collectivex-config.XXXXXX)" \ + || cx_die "cannot create ephemeral runner configuration" + fi + COLLECTIVEX_EPHEMERAL_CONFIG_PATH="$config_path" + generated=1 + if ! printf '%s' "$COLLECTIVEX_OPERATOR_CONFIG_CONTENT" > "$config_path"; then + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT + rm -f -- "$config_path" + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + cx_die "cannot materialize runner configuration" + fi + elif [ "${COLLECTIVEX_OPERATOR_CONFIG_REQUIRED:-0}" = 1 ]; then + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT + cx_die "runner configuration is unavailable" + fi + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT COLLECTIVEX_OPERATOR_CONFIG_REQUIRED + if [ ! -e "$config_path" ]; then + COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$" + return 0 + fi + umask 077 + parsed_path="$(mktemp /tmp/inferencex-collectivex-parsed.XXXXXX)" || { + [ "$generated" = 0 ] || rm -f -- "$config_path" + cx_die "cannot parse runner configuration" + } + config_log="$(cx_private_log_path operator-config)" + if ! 
python3 - "$config_path" "${CX_RUNNER:-${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}}" \ + > "$parsed_path" 2> "$config_log" <<'PY' +import json +import os +import posixpath +import re +import stat +import sys + +RUNNERS = { + "h100-dgxc", "h200-dgxc", "b200-dgxc", "b300", + "gb200", "gb300", "mi325x", "mi355x", +} +FIELDS = { + "partition": "CX_PARTITION", + "account": "CX_ACCOUNT", + "squash_dir": "CX_SQUASH_DIR", + "stage_dir": "CX_STAGE_DIR", + "enroot_cache_path": "CX_ENROOT_CACHE_PATH", + "exclude_nodes": "CX_EXCLUDE_NODES", + "nodelist": "CX_NODELIST", + "lock_dir": "CX_LOCK_DIR", +} +REQUIRED = { + "h100-dgxc": {"partition", "account", "squash_dir"}, + "h200-dgxc": {"partition", "squash_dir"}, + "b200-dgxc": {"partition", "account", "squash_dir"}, + "b300": {"partition", "account", "squash_dir", "stage_dir"}, + "gb200": {"partition", "account", "storage_roots"}, + "gb300": {"partition", "account", "squash_dir", "stage_dir", "enroot_cache_path"}, + "mi325x": {"partition", "squash_dir"}, + "mi355x": {"partition", "squash_dir"}, +} +ALLOWED = { + "h100-dgxc": REQUIRED["h100-dgxc"] | {"exclude_nodes", "stage_dir"}, + "h200-dgxc": REQUIRED["h200-dgxc"] | {"account", "exclude_nodes", "stage_dir"}, + "b200-dgxc": REQUIRED["b200-dgxc"] | {"exclude_nodes", "stage_dir"}, + "b300": REQUIRED["b300"] | {"exclude_nodes"}, + "gb200": REQUIRED["gb200"], + "gb300": REQUIRED["gb300"], + "mi325x": REQUIRED["mi325x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"}, + "mi355x": REQUIRED["mi355x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"}, +} +TOKEN = re.compile(r"^[A-Za-z0-9_.\[\],-]+$") +PATH = re.compile(r"^/[A-Za-z0-9._/+\-]+$") +IPV4 = re.compile(r"(? 
65536 + ): + raise ValueError + flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0) + descriptor = os.open(path, flags) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (metadata.st_dev, metadata.st_ino): + raise ValueError + payload = b"" + while len(payload) <= 65536: + chunk = os.read(descriptor, 65537 - len(payload)) + if not chunk: + break + payload += chunk + document = json.loads( + payload.decode("utf-8"), + object_pairs_hook=pairs, + parse_constant=lambda _: (_ for _ in ()).throw(ValueError()), + ) + finally: + os.close(descriptor) + if ( + set(document) != {"schema_version", "runners"} + or type(document["schema_version"]) is not int + or document["schema_version"] != 1 + ): + raise ValueError + runners = document["runners"] + if ( + not isinstance(runners, dict) or not runners or set(runners) - RUNNERS + or runner not in runners + ): + raise ValueError + selected = None + for name, config in runners.items(): + if not isinstance(config, dict) or not REQUIRED[name].issubset(config): + raise ValueError + if set(config) - ALLOWED[name]: + raise ValueError + for field, value in config.items(): + if field == "storage_roots": + if ( + not isinstance(value, list) or not 1 <= len(value) <= 16 + or len(value) != len(set(value)) or not all(valid_path(item) for item in value) + ): + raise ValueError + elif field.endswith(("_dir", "_path")): + if not valid_path(value): + raise ValueError + elif ( + not isinstance(value, str) or not value or len(value) > 512 + or not TOKEN.fullmatch(value) or IPV4.search(value) + ): + raise ValueError + if name == runner: + selected = dict(config) + if selected is None: + raise ValueError + roots = selected.pop("storage_roots", None) + if roots is not None: + for root in roots: + squash = posixpath.join(root, "collectivex", "containers") + stage = posixpath.join(root, "collectivex", "stage") + probes = [] + try: + for directory in (squash, stage): + os.makedirs(directory, mode=0o700, exist_ok=True) + probe 
# Create an empty private log file and print its path (no trailing newline).
#
#   $1  label; the file is /tmp/inferencex-collectivex-<uid>/<tag>/<label>.log
#       where <tag> is COLLECTIVEX_EXECUTION_ID, or manual_<shell pid> if unset.
#
# The embedded Python does the filesystem work so it can use fd-relative calls:
#   - tag and label must match [A-Za-z0-9][A-Za-z0-9._-]*;
#   - the per-UID root and the per-tag directory are created with mode 0700 and
#     must be owned by the current UID with exactly mode 0700, else it fails;
#   - sibling directories other than <tag> whose mtime is older than 86400
#     seconds are removed on a best-effort basis (OSError is ignored);
#   - the log is opened with O_CREAT|O_EXCL and mode 0600, so a second request
#     for the same tag/label fails rather than reusing the file.
# Python's stderr is discarded; any failure ends in cx_die.
cx_private_log_path() {
  local label="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}" path
  path="$(python3 - "$tag" "$label" <<'PY' 2>/dev/null
import os
import re
import shutil
import stat
import sys
import time

tag, label = sys.argv[1:]
if not all(re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", value) for value in (tag, label)):
    raise SystemExit(1)
root = f"/tmp/inferencex-collectivex-{os.getuid()}"
old_umask = os.umask(0o077)
flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
try:
    try:
        os.mkdir(root, 0o700)
    except FileExistsError:
        pass
    root_fd = os.open(root, flags)
    try:
        metadata = os.fstat(root_fd)
        if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
            raise OSError("unsafe root")
        cutoff = time.time() - 86400
        for entry in os.scandir(root):
            try:
                if (
                    entry.name != tag and entry.is_dir(follow_symlinks=False)
                    and entry.stat(follow_symlinks=False).st_mtime < cutoff
                ):
                    shutil.rmtree(entry.path)
            except OSError:
                pass
        try:
            os.mkdir(tag, 0o700, dir_fd=root_fd)
        except FileExistsError:
            pass
        directory_fd = os.open(tag, flags, dir_fd=root_fd)
        try:
            metadata = os.fstat(directory_fd)
            if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
                raise OSError("unsafe directory")
            log_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
            log_fd = os.open(f"{label}.log", log_flags, 0o600, dir_fd=directory_fd)
            os.close(log_fd)
        finally:
            os.close(directory_fd)
    finally:
        os.close(root_fd)
finally:
    os.umask(old_umask)
print(f"{root}/{tag}/{label}.log", end="")
PY
)" || cx_die "cannot create private runtime log"
  # Emit the path only after the file exists, without a trailing newline.
  printf '%s' "$path"
}
# Remove this execution's private log directory after a successful run.
#
#   $1  exit status of the run; any value other than 0 leaves the logs alone.
#
# The directory is /tmp/inferencex-collectivex-<uid>/<tag>, with <tag> taken
# from COLLECTIVEX_EXECUTION_ID (or manual_<shell pid>). Removal is best
# effort: all output is discarded and the function always returns 0.
cx_cleanup_private_logs() {
  local rc="$1"
  local tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}"
  if [ "$rc" != 0 ]; then
    return 0
  fi
  python3 - "$tag" <<'PY' >/dev/null 2>&1 || true
import os
import re
import shutil
import stat
import sys

execution_tag = sys.argv[1]
if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", execution_tag) is None:
    sys.exit(1)

uid = os.getuid()
log_root = f"/tmp/inferencex-collectivex-{uid}"
open_flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)

# Only touch a root that is ours and private (mode exactly 0700).
handle = os.open(log_root, open_flags)
try:
    info = os.fstat(handle)
    trusted = info.st_uid == uid and stat.S_IMODE(info.st_mode) == 0o700
finally:
    os.close(handle)
if not trusted:
    sys.exit(1)

target = os.path.join(log_root, execution_tag)
if not os.path.islink(target) and os.path.isdir(target):
    shutil.rmtree(target)
PY
}
# Write the current runtime stage into the marker file named by
# CX_RUNTIME_MARKER (a path relative to the current directory).
#
#   $1  stage: backend-setup or execution.
#
# Returns 0 without writing when no marker is configured, and 1 when the
# marker path does not have the expected .shards/runtime-stage-<tag>.txt shape
# or the stage is not one of the two accepted values.
cx_write_runtime_stage() {
  local stage="$1"
  local marker="${CX_RUNTIME_MARKER:-}"
  local marker_pattern='^\.shards/runtime-stage-[A-Za-z0-9][A-Za-z0-9._-]*\.txt$'

  if [ -z "$marker" ]; then
    return 0
  fi
  if ! [[ "$marker" =~ $marker_pattern ]]; then
    return 1
  fi
  if [ "$stage" != backend-setup ] && [ "$stage" != execution ]; then
    return 1
  fi
  printf '%s\n' "$stage" > "$marker"
}
# Allocate via salloc's stable grant message and assign JOB_ID in this shell.
# Raw scheduler output remains in the bounded private execution log.
#
#   $@  arguments forwarded to salloc (--no-shell is appended).
#
# Sets CX_ALLOCATION_REQUESTED=1 before calling salloc. JOB_ID is assigned
# whenever a job ID can be parsed from the output, even if salloc itself
# exited non-zero, so the caller can still cancel a half-granted allocation.
# Returns 0 only when salloc succeeded AND an ID was parsed from THIS call;
# otherwise the failure is classified through cx_fail_stage and 1 is returned.
cx_salloc_jobid() {
  local log job_id salloc_rc=0
  log="$(cx_private_log_path scheduler-allocation)"
  CX_ALLOCATION_REQUESTED=1
  # salloc has no portable --parsable option. Parse the stable grant message
  # used by the production launchers, while also accepting a bare ID from
  # site wrappers.
  salloc "$@" --no-shell > "$log" 2>&1 || salloc_rc=$?
  job_id="$(sed -nE \
    -e 's/^([0-9]+)(;[^[:space:]]+)?$/\1/p' \
    -e 's/.*Granted job allocation ([0-9]+).*/\1/p' \
    "$log" | head -n1)"
  if [ -n "$job_id" ]; then
    [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
    JOB_ID="$job_id"
  fi
  # Judge success by the ID parsed from this invocation, not by the global
  # JOB_ID: a value left over from the caller's environment must not make a
  # grant-less salloc look successful, and an unset JOB_ID must not trip
  # `set -u` before the failure is classified.
  if [ "$salloc_rc" != 0 ] || [ -z "$job_id" ]; then
    cx_fail_stage scheduler-allocation "$log"
    return 1
  fi
}
# Print the default container image tag for a runner name.
#
#   $1  runner name; matched by prefix (mi325x*, mi355x*, and the NVIDIA
#       b200/gb200/b300/gb300/h100/h200 families).
#
# Unknown prefixes are fatal (cx_die).
cx_default_image() {
  local runner="$1" image
  case "$runner" in
    mi325x*)
      image="$CX_IMAGE_AMD_MORI_MI325"
      ;;
    mi355x*)
      image="$CX_IMAGE_AMD_MORI"
      ;;
    b200* | gb200* | b300* | gb300* | h100* | h200*)
      image="$CX_IMAGE_MULTIARCH"
      ;;
    *)
      cx_die "no default image for runner prefix: $runner"
      ;;
  esac
  echo "$image"
}
+ [ "$repository" != "$image" ] || { repository="$image"; reference=latest; } + [ -n "$repository" ] && [ -n "$reference" ] \ + || cx_die "configured image reference is malformed" + [[ "$repository" == */* ]] || repository="library/$repository" + token="$(curl -fsSLG --connect-timeout 10 --max-time 30 --retry 2 \ + --retry-delay 1 --retry-all-errors 'https://auth.docker.io/token' \ + --data-urlencode 'service=registry.docker.io' \ + --data-urlencode "scope=repository:${repository}:pull" \ + | python3 -c 'import json,sys; print(json.load(sys.stdin)["token"])')" \ + || cx_die "cannot authenticate to the image registry" + digest="$(curl -fsSI --connect-timeout 10 --max-time 30 --retry 2 \ + --retry-delay 1 --retry-all-errors \ + -H "Authorization: Bearer $token" \ + -H 'Accept: application/vnd.oci.image.index.v1+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.docker.distribution.manifest.v2+json' \ + "https://registry-1.docker.io/v2/${repository}/manifests/${reference}" \ + | tr -d '\r' | awk 'tolower($1)=="docker-content-digest:" {print $2; exit}')" \ + || cx_die "cannot resolve the configured image digest" + [[ "$digest" =~ ^sha256:[0-9a-f]{64}$ ]] \ + || cx_die "registry returned an invalid image digest" + printf '%s' "$digest" +} + +cx_verify_registry_image() { + local image="$1" expected actual + expected="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$image")}" + [[ "$expected" =~ ^sha256:[0-9a-f]{64}$ ]] \ + || cx_die "a pinned digest is required for the configured image" + actual="$(cx_resolve_registry_digest "$image")" + [ "$actual" = "$expected" ] \ + || cx_die "configured image tag no longer matches its pinned digest" + export COLLECTIVEX_IMAGE="$image" COLLECTIVEX_IMAGE_DIGEST="$actual" + export COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1 +} + +cx_default_image_digest() { + case "$1" in + "$CX_IMAGE_MULTIARCH") printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST" ;; + "$CX_IMAGE_AMD_MORI") printf 
'%s' "$CX_IMAGE_AMD_MORI_DIGEST" ;; + "$CX_IMAGE_AMD_MORI_MI325") printf '%s' "$CX_IMAGE_AMD_MORI_MI325_DIGEST" ;; + esac +} + +# Canonical workflow runs must not inherit benchmark controls from a persistent +# self-hosted runner service. Manual/SSH diagnostics retain their explicit +# overrides by leaving COLLECTIVEX_CANONICAL_GHA unset. +cx_gha_workspace_stage_root() { + local workspace="${GITHUB_WORKSPACE:-}" + python3 - "$workspace" <<'PY' +import os +import stat +import sys + +workspace = sys.argv[1] +try: + if ( + not os.path.isabs(workspace) + or os.path.realpath(workspace) != workspace + or not os.path.isdir(workspace) + ): + raise OSError + metadata = os.stat(workspace, follow_symlinks=False) + # GitHub runner workspaces are runner-owned but commonly writable by the + # trusted runner-service group. Keep the child mode 0700 and reject world write. + if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) & stat.S_IWOTH: + raise OSError +except OSError: + raise SystemExit(1) +print(workspace, end="") +PY +} + +# Create a per-UID cache under validated cluster-local storage. Only the fixed +# /cx-cache mount enters the container; the operator host path does not. 
# cx_prepare_backend_cache STAGE_PARENT
# Create (or reuse) a mode-0700 cache directory named
# ".collectivex-backend-cache-v<generation>-<uid>" under STAGE_PARENT and make
# sure it holds a 32-byte random sentinel file. On success exports
# CX_PREPARED_BACKEND_CACHE (the cache path) and CX_BACKEND_CACHE_SENTINEL_SHA256
# (SHA-256 of the sentinel payload); returns 1 on any validation failure.
#   * A throw-away probe directory is created first to learn which uid newly
#     created entries actually receive; the cache and sentinel must match it.
#   * Generation 3 is tried first; an OSError while using it falls back to
#     generation 4, and an OSError there is fatal.
#   * The sentinel is written to a temporary name and published with os.link, so
#     an existing sentinel is kept (FileExistsError is ignored) and then re-read.
cx_prepare_backend_cache() {
  local stage_parent="$1" cache info sentinel_sha256
  unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_CACHE_SENTINEL_SHA256
  info="$(python3 - "$stage_parent" <<'PY'
import hashlib
import os
import secrets
import stat
import sys

configured_parent = sys.argv[1]
try:
    if (
        not os.path.isabs(configured_parent)
        or "\n" in configured_parent
        or "\r" in configured_parent
    ):
        raise OSError
    parent = os.path.realpath(configured_parent)
    if not os.path.isdir(parent):
        raise OSError
    flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
    parent_fd = os.open(parent, flags)
    try:
        probe_name = f".collectivex-owner-probe-{os.getpid()}-{secrets.token_hex(8)}"
        os.mkdir(probe_name, 0o700, dir_fd=parent_fd)
        try:
            probe_fd = os.open(probe_name, flags, dir_fd=parent_fd)
            try:
                probe = os.fstat(probe_fd)
                if stat.S_IMODE(probe.st_mode) & 0o777 != 0o700:
                    raise OSError
                realized_owner = probe.st_uid
            finally:
                os.close(probe_fd)
        finally:
            os.rmdir(probe_name, dir_fd=parent_fd)
        for generation in (3, 4):
            name = f".collectivex-backend-cache-v{generation}-{os.getuid()}"
            try:
                os.mkdir(name, 0o700, dir_fd=parent_fd)
            except FileExistsError:
                pass
            try:
                cache_fd = os.open(name, flags, dir_fd=parent_fd)
                try:
                    metadata = os.fstat(cache_fd)
                    if (
                        metadata.st_uid != realized_owner
                        or stat.S_IMODE(metadata.st_mode) & 0o777 != 0o700
                    ):
                        raise OSError
                    sentinel_name = ".collectivex-mount-sentinel-v1"
                    temporary_name = (
                        f"{sentinel_name}.tmp.{os.getpid()}.{secrets.token_hex(8)}"
                    )
                    create_flags = (
                        os.O_WRONLY | os.O_CREAT | os.O_EXCL
                        | getattr(os, "O_NOFOLLOW", 0)
                    )
                    payload = secrets.token_bytes(32)
                    temporary_fd = os.open(
                        temporary_name, create_flags, 0o600, dir_fd=cache_fd
                    )
                    try:
                        try:
                            view = memoryview(payload)
                            try:
                                while view:
                                    written = os.write(temporary_fd, view)
                                    if written <= 0:
                                        raise OSError
                                    view = view[written:]
                                os.fsync(temporary_fd)
                            finally:
                                view.release()
                        finally:
                            os.close(temporary_fd)
                        try:
                            os.link(
                                temporary_name,
                                sentinel_name,
                                src_dir_fd=cache_fd,
                                dst_dir_fd=cache_fd,
                                follow_symlinks=False,
                            )
                        except FileExistsError:
                            pass
                    finally:
                        try:
                            os.unlink(temporary_name, dir_fd=cache_fd)
                        except FileNotFoundError:
                            pass
                    sentinel_fd = os.open(
                        sentinel_name,
                        os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
                        dir_fd=cache_fd,
                    )
                    try:
                        sentinel = os.fstat(sentinel_fd)
                        payload = os.read(sentinel_fd, 33)
                        if (
                            not stat.S_ISREG(sentinel.st_mode)
                            or sentinel.st_uid != realized_owner
                            or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
                            or sentinel.st_size != 32
                            or len(payload) != 32
                        ):
                            raise OSError
                        sentinel_sha256 = hashlib.sha256(payload).hexdigest()
                    finally:
                        os.close(sentinel_fd)
                finally:
                    os.close(cache_fd)
            except OSError:
                if generation == 3:
                    continue
                raise
            break
    finally:
        os.close(parent_fd)
except OSError:
    raise SystemExit(1)
print(sentinel_sha256, os.path.join(parent, name), end="")
PY
)" || return 1
  # The helper prints "<sha256> <path>"; split on the first space only, since
  # the path itself may contain spaces.
  sentinel_sha256="${info%% *}"
  cache="${info#* }"
  [ "$cache" != "$info" ] && [[ "$sentinel_sha256" =~ ^[0-9a-f]{64}$ ]] \
    && [[ "$cache" = /* ]] || return 1
  export CX_PREPARED_BACKEND_CACHE="$cache"
  export CX_BACKEND_CACHE_SENTINEL_SHA256="$sentinel_sha256"
}

# cx_verify_backend_cache_mount
# Check that CX_BACKEND_CACHE_ROOT is an absolute, canonical, mode-0700
# directory whose ".collectivex-mount-sentinel-v1" file is a 32-byte, mode-0600
# regular file owned by the directory's owner and hashing to
# CX_BACKEND_CACHE_SENTINEL_SHA256. Exits non-zero on any mismatch.
cx_verify_backend_cache_mount() {
  python3 - "${CX_BACKEND_CACHE_ROOT:-}" \
    "${CX_BACKEND_CACHE_SENTINEL_SHA256:-}" <<'PY'
import hashlib
import os
import re
import stat
import sys

root, expected = sys.argv[1:]
try:
    if (
        not os.path.isabs(root)
        or os.path.realpath(root) != root
        or re.fullmatch(r"[0-9a-f]{64}", expected) is None
    ):
        raise OSError
    flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
    root_fd = os.open(root, flags)
    try:
        root_item = os.fstat(root_fd)
        if (
            not stat.S_ISDIR(root_item.st_mode)
            or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
        ):
            raise OSError
        sentinel_fd = os.open(
            ".collectivex-mount-sentinel-v1",
            os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
            dir_fd=root_fd,
        )
        try:
            sentinel = os.fstat(sentinel_fd)
            payload = os.read(sentinel_fd, 33)
            if (
                not stat.S_ISREG(sentinel.st_mode)
                or sentinel.st_uid != root_item.st_uid
                or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
                or sentinel.st_size != 32
                or len(payload) != 32
                or hashlib.sha256(payload).hexdigest() != expected
            ):
                raise OSError
        finally:
            os.close(sentinel_fd)
    finally:
        os.close(root_fd)
except OSError:
    raise SystemExit(1)
PY
}

# cx_git ARGS... -> run git with system and global configuration disabled,
# terminal prompts off, and no credential helper.
cx_git() {
  GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null GIT_TERMINAL_PROMPT=0 \
    git -c credential.helper= "$@"
}

# cx_git_in_tree DIRECTORY ARGS... -> run cx_git inside DIRECTORY.
# DIRECTORY must be an absolute path to a real (non-symlink) directory without
# '*', newline or carriage return; its physical path is marked safe.directory
# for this one invocation. Returns 1 if the directory is rejected.
cx_git_in_tree() {
  local directory="$1" canonical
  shift
  [[ "$directory" = /* ]] && [ -d "$directory" ] && [ ! -L "$directory" ] \
    || return 1
  [[ "$directory" != *'*'* && "$directory" != *$'\n'* && "$directory" != *$'\r'* ]] \
    || return 1
  canonical="$(cd -P -- "$directory" && pwd -P)" || return 1
  cx_git -c "safe.directory=$canonical" -C "$canonical" "$@"
}

# cx_fetch_revision REPOSITORY REVISION DESTINATION
# Shallow-fetch exactly REVISION into a fresh DESTINATION and check it out
# detached, confirming HEAD equals REVISION. DESTINATION is removed before each
# of up to three attempts, with 5 s then 10 s pauses between them.
cx_fetch_revision() {
  local repository="$1" revision="$2" destination="$3" attempt
  for attempt in 1 2 3; do
    rm -rf -- "$destination"
    if cx_git init -q "$destination" \
      && cx_git_in_tree "$destination" remote add origin "$repository" \
      && cx_git_in_tree "$destination" fetch -q --no-tags --depth 1 origin "$revision" \
      && cx_git_in_tree "$destination" -c advice.detachedHead=false \
        checkout -q --detach FETCH_HEAD \
      && [ "$(cx_git_in_tree "$destination" rev-parse HEAD)" = "$revision" ]; then
      return 0
    fi
    [ "$attempt" = 3 ] || sleep $((attempt * 5))
  done
  return 1
}

# cx_backend_source_pin BACKEND -> prints "commit|tree|fmt-commit" for a backend
# built from source (the fmt field is empty for deepep-hybrid). Returns 1 for
# any other backend. The CX_DEEPEP_* pins are defined outside this section.
cx_backend_source_pin() {
  case "$1" in
    deepep-v2)
      printf '%s|%s|%s' \
        "$CX_DEEPEP_V2_COMMIT" "$CX_DEEPEP_V2_TREE" "$CX_DEEPEP_V2_FMT_COMMIT"
      ;;
    deepep-hybrid)
      printf '%s|%s|' "$CX_DEEPEP_HYBRID_COMMIT" "$CX_DEEPEP_HYBRID_TREE"
      ;;
    *) return 1 ;;
  esac
}

# cx_backend_source_path ROOT BACKEND -> prints "ROOT/BACKEND-<pinned commit>".
cx_backend_source_path() {
  local root="$1" backend="$2" revision tree fmt pin
  pin="$(cx_backend_source_pin "$backend")" || return 1
  IFS='|' read -r revision tree fmt <<< "$pin"
  printf '%s/%s-%s' "$root" "$backend" "$revision"
}

# cx_backend_source_is_valid BACKEND SOURCE
# Succeed only if SOURCE is a real directory whose HEAD commit and tree match
# the pin, whose working tree has no modified, untracked or ignored files, and
# (when a fmt pin exists) whose third-party/fmt checkout is at the pinned commit.
cx_backend_source_is_valid() {
  local backend="$1" source="$2" revision tree fmt pin status ignored
  pin="$(cx_backend_source_pin "$backend")" || return 1
  IFS='|' read -r revision tree fmt <<< "$pin"
  [ -d "$source" ] && [ ! -L "$source" ] \
    && [ "$(cx_git_in_tree "$source" rev-parse HEAD 2>/dev/null)" = "$revision" ] \
    && [ "$(cx_git_in_tree "$source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
    || return 1
  status="$(cx_git_in_tree "$source" status --porcelain --untracked-files=all \
    --ignore-submodules=none 2>/dev/null)" || return 1
  [ -z "$status" ] || return 1
  ignored="$(cx_git_in_tree "$source" ls-files --others --ignored --exclude-standard \
    2>/dev/null)" || return 1
  [ -z "$ignored" ] || return 1
  [ -z "$fmt" ] \
    || [ "$(cx_git_in_tree "$source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt" ]
}

# cx_extension_pair_sha256 ROOT PATTERN_A PATTERN_B -> prints one SHA-256 that
# covers, for each glob pattern in order, the matched file's name, size and
# content hash. Each pattern must match exactly one regular, non-symlink file
# under ROOT; anything else exits 1.
cx_extension_pair_sha256() {
  python3 - "$1" "$2" "$3" <<'PY'
import hashlib
import os
from pathlib import Path
import stat
import sys

root = Path(sys.argv[1])
digest = hashlib.sha256()
try:
    if root.is_symlink() or not root.is_dir():
        raise OSError
    for pattern in sys.argv[2:]:
        matches = list(root.glob(pattern))
        if len(matches) != 1 or matches[0].is_symlink():
            raise OSError
        path = matches[0]
        descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
        try:
            metadata = os.fstat(descriptor)
            if not stat.S_ISREG(metadata.st_mode):
                raise OSError
            file_digest = hashlib.sha256()
            with os.fdopen(descriptor, "rb", closefd=False) as stream:
                for chunk in iter(lambda: stream.read(1024 * 1024), b""):
                    file_digest.update(chunk)
            digest.update(path.name.encode("utf-8") + b"\0")
            digest.update(str(metadata.st_size).encode("ascii") + b"\0")
            digest.update(file_digest.digest())
        finally:
            os.close(descriptor)
except (OSError, UnicodeError):
    raise SystemExit(1)
print(digest.hexdigest(), end="")
PY
}

# Acquire source before compute allocation, preferring the verified same-run GHA seed.
# _cx_prepare_backend_source MOUNT_SRC BACKEND
# Order of preference: an already-published checkout under
# MOUNT_SRC/experimental/CollectiveX/.cx_sources (validated, never refetched),
# then a copy of the seed under CX_BACKEND_SOURCE_SEED_ROOT, then a network
# fetch — which is refused when COLLECTIVEX_CANONICAL_GHA=1.
# CX_BACKEND_SOURCE_STEP is updated before each stage so the wrapper can report
# which stage failed. New checkouts are built in a temporary sibling directory
# and published with mv only after validation.
_cx_prepare_backend_source() {
  local mount_src="$1" backend="$2" root source temporary revision tree fmt pin
  local root_mode stage_mode root_owner stage_owner
  local seed_root="${CX_BACKEND_SOURCE_SEED_ROOT:-}" seed seed_mode
  root="$mount_src/experimental/CollectiveX/.cx_sources"
  CX_BACKEND_SOURCE_STEP="source mount creation"
  if [ ! -e "$root" ] && [ ! -L "$root" ]; then
    mkdir -m 700 -- "$root" || return 1
  fi
  CX_BACKEND_SOURCE_STEP="source mount ownership validation"
  [ -d "$mount_src" ] && [ ! -L "$mount_src" ] \
    && [ -d "$root" ] && [ ! -L "$root" ] || return 1
  stage_owner="$(stat -c '%u' "$mount_src" 2>/dev/null)" || return 1
  root_owner="$(stat -c '%u' "$root" 2>/dev/null)" || return 1
  [ "$root_owner" = "$stage_owner" ] || return 1
  stage_mode="$(stat -c '%a' "$mount_src" 2>/dev/null)" || return 1
  # Accept 0700 with or without a leading special-bits digit.
  case "$stage_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
  # Shared stage parents may retain harmless special bits despite mkdir -m.
  CX_BACKEND_SOURCE_STEP="source mount permission inspection"
  root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
  case "$root_mode" in
    700|[1-7]700) ;;
    *)
      CX_BACKEND_SOURCE_STEP="source mount permission normalization"
      chmod 700 "$root" || return 1
      CX_BACKEND_SOURCE_STEP="source mount permission validation"
      root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
      case "$root_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
      ;;
  esac
  CX_BACKEND_SOURCE_STEP="git lookup"
  command -v git >/dev/null || return 1
  CX_BACKEND_SOURCE_STEP="source pin resolution"
  source="$(cx_backend_source_path "$root" "$backend")" || return 1
  if [ -e "$source" ] || [ -L "$source" ]; then
    # Anything already at the published path must validate as-is; the bare
    # "return" propagates the validator's status.
    CX_BACKEND_SOURCE_STEP="existing source validation"
    cx_backend_source_is_valid "$backend" "$source"
    return
  fi
  if [ -n "$seed_root" ]; then
    CX_BACKEND_SOURCE_STEP="source seed validation"
    [[ "$seed_root" = /* ]] && [ -d "$seed_root" ] && [ ! -L "$seed_root" ] \
      || return 1
    seed_mode="$(stat -c '%a' "$seed_root" 2>/dev/null)" || return 1
    case "$seed_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
    seed="$(cx_backend_source_path "$seed_root" "$backend")" || return 1
    cx_backend_source_is_valid "$backend" "$seed" || return 1
    CX_BACKEND_SOURCE_STEP="source seed copy"
    temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
    if ! cp -R -- "$seed/." "$temporary/" \
      || ! cx_backend_source_is_valid "$backend" "$temporary" \
      || ! mv -- "$temporary" "$source"; then
      rm -rf -- "$temporary"
      return 1
    fi
    return
  fi
  if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ]; then
    # Canonical runs must be seeded; never fall through to a network fetch.
    CX_BACKEND_SOURCE_STEP="source seed validation"
    return 1
  fi
  CX_BACKEND_SOURCE_STEP="source checkout creation"
  temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
  CX_BACKEND_SOURCE_STEP="source pin resolution"
  pin="$(cx_backend_source_pin "$backend")" || {
    rm -rf -- "$temporary"
    return 1
  }
  IFS='|' read -r revision tree fmt <<< "$pin"
  CX_BACKEND_SOURCE_STEP="revision fetch"
  if ! cx_fetch_revision \
    https://github.com/deepseek-ai/DeepEP "$revision" "$temporary"; then
    rm -rf -- "$temporary"
    return 1
  fi
  CX_BACKEND_SOURCE_STEP="submodule fetch"
  if [ -n "$fmt" ] && ! cx_git_in_tree "$temporary" \
    -c "safe.directory=$temporary/third-party/fmt" \
    submodule update -q --init --depth 1 third-party/fmt; then
    rm -rf -- "$temporary"
    return 1
  fi
  CX_BACKEND_SOURCE_STEP="source publication validation"
  if ! cx_backend_source_is_valid "$backend" "$temporary" \
    || ! mv -- "$temporary" "$source"; then
    rm -rf -- "$temporary"
    return 1
  fi
}

# cx_prepare_backend_source MOUNT_SRC BACKEND
# Run _cx_prepare_backend_source with all output captured in a private log. The
# helper runs in this shell (a redirected function call is not a subshell), so
# its updates to the local CX_BACKEND_SOURCE_STEP are visible here on failure.
cx_prepare_backend_source() {
  local log backend="$2" CX_BACKEND_SOURCE_STEP="initialization"
  log="$(cx_private_log_path "backend-source-$backend")" || return 1
  if _cx_prepare_backend_source "$@" > "$log" 2>&1; then
    return 0
  fi
  printf '%s failed\n' "$CX_BACKEND_SOURCE_STEP" >> "$log"
  cx_log "ERROR: backend-source-step=${CX_BACKEND_SOURCE_STEP// /-}"
  cx_fail_stage backend-setup "$log"
}

# cx_materialize_backend_source BACKEND DESTINATION
# Copy the validated checkout for BACKEND from CX_BACKEND_SOURCE_ROOT to
# DESTINATION, replacing whatever is there. The copy is validated in a temporary
# sibling directory before the swap and again afterwards; DESTINATION is removed
# if the final validation fails.
cx_materialize_backend_source() {
  local backend="$1" destination="$2" source parent temporary
  [ -n "${CX_BACKEND_SOURCE_ROOT:-}" ] || return 1
  source="$(cx_backend_source_path "$CX_BACKEND_SOURCE_ROOT" "$backend")" || return 1
  cx_backend_source_is_valid "$backend" "$source" || return 1
  parent="${destination%/*}"
  # DESTINATION must contain a '/' and its parent must be a real directory.
  [ "$parent" != "$destination" ] && [ -d "$parent" ] && [ ! -L "$parent" ] \
    || return 1
  temporary="$(mktemp -d "$parent/.collectivex-source.XXXXXX")" || return 1
  if ! cp -R -- "$source/." "$temporary/" \
    || ! cx_backend_source_is_valid "$backend" "$temporary"; then
    rm -rf -- "$temporary"
    return 1
  fi
  if ! rm -rf -- "$destination" || ! mv -- "$temporary" "$destination"; then
    rm -rf -- "$temporary"
    return 1
  fi
  if ! cx_backend_source_is_valid "$backend" "$destination"; then
    rm -rf -- "$destination"
    return 1
  fi
  return 0
}

# cx_lock_canonical_gha_env RUNNER
# No-op unless COLLECTIVEX_CANONICAL_GHA=1. For canonical runs it requires a
# complete GitHub Actions identity and a shard whose SKU equals RUNNER, discards
# inherited tuning variables, and re-derives image, digest, placement, seed and
# timeout from the fixed per-runner table below. Dies via cx_die on any mismatch.
cx_lock_canonical_gha_env() {
  local runner="$1" expected_nodes expected_gpn expected_world trusted_lock_dir=""
  [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || return 0
  [ "${GITHUB_ACTIONS:-}" = true ] \
    || cx_die "canonical CollectiveX execution requires GitHub Actions"
  [ -n "${CX_SHARD_FILE:-}" ] && [ "${CX_SHARD_SKU:-}" = "$runner" ] \
    || cx_die "canonical CollectiveX execution requires a matched shard"
  [[ "${GITHUB_RUN_ID:-}" =~ ^[1-9][0-9]*$ \
    && "${GITHUB_RUN_ATTEMPT:-}" =~ ^[1-9][0-9]*$ \
    && "${COLLECTIVEX_SOURCE_SHA:-}" =~ ^[0-9a-f]{40,64}$ ]] \
    || cx_die "canonical CollectiveX workflow identity is incomplete"

  # cx_load_operator_config clears inherited values before setting this process marker.
  # Preserve only its validated AMD lock path; direct runner-service values stay untrusted.
  [ "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" != "$$" ] \
    || trusted_lock_dir="${CX_LOCK_DIR:-}"
  unset CX_NCCL_HOME CX_MASTER_PORT CX_MORI_KERNEL_TYPE CX_LOCK_DIR
  unset MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
  unset MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
  unset NCCL_CUMEM_ENABLE NCCL_MNNVL_ENABLE MC_FORCE_MNNVL
  unset CX_BACKEND_CACHE_ROOT CX_BACKEND_CACHE_SENTINEL_SHA256
  unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_SOURCE_ROOT

  [ -n "${CX_SQUASH_DIR:-}" ] \
    || cx_die "canonical CollectiveX execution requires shared container storage"

  case "$runner" in
    h100-dgxc|h200-dgxc|b200-dgxc|b300)
      expected_nodes=1; expected_gpn=8
      CX_IMAGE="$CX_IMAGE_MULTIARCH"
      CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
      CX_NCCL_HOME=/usr
      ;;
    gb200|gb300)
      # The tray count comes from the shard, but only 1 or 2 is accepted.
      expected_nodes="${CX_NODES:-}"; expected_gpn=4
      [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \
        || cx_die "canonical GB execution requires one or two trays"
      CX_IMAGE="$CX_IMAGE_MULTIARCH"
      CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
      CX_NCCL_HOME=/usr
      CX_MASTER_PORT=29551
      ;;
    mi325x)
      expected_nodes=1; expected_gpn=8
      CX_STAGE_DIR="$(cx_gha_workspace_stage_root)" \
        || cx_die "canonical AMD staging workspace is unsafe"
      CX_IMAGE="$CX_IMAGE_AMD_MORI_MI325"
      CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_MI325_DIGEST"
      CX_MORI_KERNEL_TYPE=asyncll
      MORI_COMMIT="$CX_MORI_COMMIT_MI325"
      MORI_DISABLE_AUTO_XGMI=0
      MORI_ENABLE_SDMA=1
      MORI_APP_LOG_LEVEL=info
      MORI_SHMEM_LOG_LEVEL=info
      MORI_IO_LOG_LEVEL=info
      ;;
    mi355x)
      expected_nodes=1; expected_gpn=8
      CX_STAGE_DIR="$(cx_gha_workspace_stage_root)" \
        || cx_die "canonical AMD staging workspace is unsafe"
      CX_IMAGE="$CX_IMAGE_AMD_MORI"
      CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_DIGEST"
      CX_MORI_KERNEL_TYPE=intranode
      MORI_COMMIT="$CX_MORI_COMMIT_MI355"
      ;;
    *) cx_die "canonical CollectiveX runner is not registered" ;;
  esac
  # Only AMD runners get the operator-validated lock directory back.
  case "$runner:$trusted_lock_dir" in
    mi325x:?*|mi355x:?*) export CX_LOCK_DIR="$trusted_lock_dir" ;;
  esac
  CX_STAGE_DIR="${CX_STAGE_DIR:-$CX_SQUASH_DIR/.stage}"
  export CX_STAGE_DIR
  [ "${CX_NODES:-}" = "$expected_nodes" ] \
    && [ "${CX_GPUS_PER_NODE:-}" = "$expected_gpn" ] \
    || cx_die "canonical CollectiveX placement differs from the shard"
  expected_world=$((expected_nodes * expected_gpn))
  CX_NGPUS="$expected_world"
  CX_SEED=67
  case "$runner" in mi325x|mi355x) CX_RUN_TIMEOUT=1800 ;; *) CX_RUN_TIMEOUT=900 ;; esac
  unset CX_PUBLIC_RUNNER CX_GB_PRODUCT CX_DRYRUN CX_TIMING CX_ALLOW_MNNVL
  unset CX_ENROOT_LOCAL_IMPORT COLLECTIVEX_IMAGE COLLECTIVEX_IMAGE_DIGEST
  unset COLLECTIVEX_IMAGE_DIGEST_VERIFIED COLLECTIVEX_SQUASH_SHA256
  export CX_IMAGE CX_IMAGE_DIGEST CX_NGPUS CX_SEED CX_RUN_TIMEOUT
  case "$runner" in
    h100-dgxc|h200-dgxc|b200-dgxc|b300) export CX_NCCL_HOME ;;
    gb200|gb300) export CX_NCCL_HOME CX_MASTER_PORT ;;
    mi325x)
      export CX_MORI_KERNEL_TYPE MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
      export MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
      ;;
    mi355x) export CX_MORI_KERNEL_TYPE MORI_COMMIT ;;
  esac
}

# cx_reverify_registry_image IMAGE
# Re-resolve IMAGE and require the same digest that was verified earlier
# (COLLECTIVEX_IMAGE_DIGEST with COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1). Returns 1
# if no verified digest is recorded, resolution fails, or the tag has moved.
cx_reverify_registry_image() {
  local image="$1" actual
  [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
    && [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" = 1 ] || return 1
  actual="$(cx_resolve_registry_digest "$image")" || return 1
  [ "$actual" = "$COLLECTIVEX_IMAGE_DIGEST" ] || {
    cx_log "ERROR: configured image tag changed during container import"
    return 1
  }
}

# cx_export_squash_identity SQUASH_FILE
# Hash the squash file (named "image" locally) with sha256sum and export the
# result as COLLECTIVEX_SQUASH_SHA256; reports a container-hash stage failure
# if no valid hash is produced.
cx_export_squash_identity() {
  local image="$1" digest log
  log="$(cx_private_log_path container-hash)"
  digest="$(sha256sum "$image" 2>> "$log" | awk '{print $1}')"
  [[ "$digest" =~ ^[0-9a-f]{64}$ ]] \
    || { cx_fail_stage container-hash "$log"; return 1; }
  export COLLECTIVEX_SQUASH_SHA256="$digest"
}

# cx_squash_path SQUASH_DIR IMAGE -> prints the squash file path for IMAGE.
# The file name combines CX_SQUASH_FORMAT_VERSION, a platform suffix (empty for
# linux/amd64, "_linux_arm64" for linux/arm64), the verified digest and the
# sanitized image reference. Returns 1 without a verified digest or for any
# other CX_IMAGE_PLATFORM.
cx_squash_path() {
  local squash_dir="$1" image="$2" key platform
  [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
    || return 1
  case "${CX_IMAGE_PLATFORM:-}" in
    linux/amd64) platform="" ;;
    linux/arm64) platform="_linux_arm64" ;;
    *) return 1 ;;
  esac
  key="${CX_SQUASH_FORMAT_VERSION}${platform}_${COLLECTIVEX_IMAGE_DIGEST#sha256:}_$(
    printf '%s' "$image" | sed 's#[/:@#]#_#g'
  )"
  printf '%s' "$squash_dir/${key}.sqsh"
}

# cx_ensure_squash SQUASH_DIR IMAGE -> echoes the squash file path.
# Imports via Enroot only if a valid squash is not already present, under a lock.
# The host architecture must match CX_IMAGE_PLATFORM, and the registry tag is
# re-verified after the import before the path is printed.
# NOTE(review): the import branch below is incomplete as received — the line
# beginning "# >" and the "else" that follows have lost the opening "if" and the
# local-import command that preceded them (text appears to have been stripped).
# Restore this span from the upstream file before relying on it.
cx_ensure_squash() {
  local squash_dir="$1" image="$2" key sq locks lock_fd log
  local enroot_local="" import_rc=0 machine
  log="$(cx_private_log_path container-import)"
  machine="$(uname -m)"
  case "${CX_IMAGE_PLATFORM:-}:$machine" in
    linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
    *) cx_fail_stage container-import "$log"; return 1 ;;
  esac
  mkdir -p "$squash_dir" 2>> "$log" \
    || { cx_fail_stage container-import "$log"; return 1; }
  sq="$(cx_squash_path "$squash_dir" "$image")" \
    || { cx_fail_stage container-import "$log"; return 1; }
  key="${sq##*/}"
  key="${key%.sqsh}"
  locks="$squash_dir/.locks"
  mkdir -p "$locks" 2>> "$log" \
    || { cx_fail_stage container-import "$log"; return 1; }
  # One lock file per squash key; wait up to 900 s for a concurrent importer.
  { exec {lock_fd}>"$locks/${key}.lock"; } 2>> "$log" \
    || { cx_fail_stage container-import "$log"; return 1; }
  flock -w 900 "$lock_fd" 2>> "$log" \
    || { cx_fail_stage container-import "$log"; return 1; }
  if unsquashfs -l "$sq" >/dev/null 2>&1; then
    cx_log "container squash ready"
  else
    cx_log "importing configured container image"
    rm -f "$sq" 2>> "$log" \
      || { cx_fail_stage container-import "$log"; return 1; }
    # > "$log" 2>&1 || import_rc=$?
      rm -rf -- "$enroot_local" >/dev/null 2>&1 || true
      [ "$import_rc" = 0 ] \
        || { cx_fail_stage container-import "$log"; return 1; }
    else
      SOURCE_DATE_EPOCH="$CX_SQUASH_SOURCE_DATE_EPOCH" \
        enroot import -o "$sq" "docker://$image" > "$log" 2>&1 \
        || { cx_fail_stage container-import "$log"; return 1; }
    fi
    unsquashfs -l "$sq" >> "$log" 2>&1 \
      || { cx_fail_stage container-import "$log"; return 1; }
  fi
  if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
    flock -u "$lock_fd" >/dev/null 2>&1 || true
    exec {lock_fd}>&-
    cx_fail_stage container-import "$log"
    return 1
  fi
  flock -u "$lock_fd"
  exec {lock_fd}>&-
  echo "$sq"
}

# Import on an allocated compute node so multiarch tags resolve for the target
# architecture. The squash directory must be shared with the submit host.
# cx_ensure_squash_on_job JOB_ID SQUASH_DIR IMAGE [LOCK_DIR] -> prints the squash path.
# The remote script exits 13 on an architecture mismatch and runs Enroot with
# HOME and all ENROOT_* paths inside a throw-away /tmp directory.
# NOTE(review): the remote "enroot import" line carries a bare "/dev/null 2>&1"
# argument; its redirection operators appear to have been stripped from the
# text as received — confirm against the upstream file.
cx_ensure_squash_on_job() {
  local job_id="$1" squash_dir="$2" image="$3" lock_dir="${4:-}" sq key lock log
  [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
  sq="$(cx_squash_path "$squash_dir" "$image")" || return 1
  key="${sq##*/}"
  key="${key%.sqsh}"
  [ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"
  lock="$lock_dir/${key}.lock"
  log="$(cx_private_log_path container-import)"
  if ! srun --jobid="$job_id" --nodes=1 --ntasks=1 --chdir=/tmp \
    --export="$(cx_host_exports)" \
    bash -s -- "$sq" "$lock" "$image" "$CX_SQUASH_SOURCE_DATE_EPOCH" \
    "$CX_IMAGE_PLATFORM" \
    > "$log" 2>&1 <<'BASH'
set -euo pipefail
sq="$1"; lock="$2"; image="$3"; source_date_epoch="$4"; platform="$5"
machine="$(uname -m)"
case "$platform:$machine" in
  linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
  *) exit 13 ;;
esac
compute_home="$(mktemp -d /tmp/inferencex-collectivex-home.XXXXXX)"
trap 'rm -rf -- "$compute_home"' EXIT
export HOME="$compute_home" XDG_CACHE_HOME="$compute_home/.cache"
export ENROOT_TEMP_PATH="$compute_home/enroot-tmp"
export ENROOT_CACHE_PATH="$compute_home/enroot-cache"
export ENROOT_DATA_PATH="$compute_home/enroot-data"
export ENROOT_RUNTIME_PATH="$compute_home/enroot-run"
mkdir -p "$(dirname "$sq")" "$(dirname "$lock")" \
  "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH"
exec 9>"$lock"
flock -w 900 9
if unsquashfs -l "$sq" >/dev/null 2>&1; then
  echo 'container squash ready'
else
  rm -f -- "$sq"
  SOURCE_DATE_EPOCH="$source_date_epoch" \
    enroot import -o "$sq" "docker://$image" /dev/null 2>&1
fi
BASH
  then
    cx_fail_stage container-import "$log"
    return 1
  fi
  if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
    cx_fail_stage container-import "$log"
    return 1
  fi
  printf '%s' "$sq"
}

# cx_preflight_allocation JOB_ID NODES MOUNT_SRC SQUASH [SHARD]
# Run one task per allocated node to confirm the architecture matches
# CX_IMAGE_PLATFORM and that the runtime script, the optional shard file and the
# squash image are readable there. Remote exit codes map to failure stages:
# 10/11 -> repository-stage, 12 -> container-hash, anything else (including the
# architecture code 13) -> container-launch.
cx_preflight_allocation() {
  local job_id="$1" nodes="$2" mount_src="$3" squash="$4" shard="${5:-}"
  local log rc=0 runtime shard_path=""
  runtime="$mount_src/experimental/CollectiveX/runtime/run_in_container.sh"
  [ -z "$shard" ] || shard_path="$mount_src/experimental/CollectiveX/$shard"
  log="$(cx_private_log_path allocation-preflight)"
  srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \
    --chdir=/tmp \
    --export="$(cx_host_exports)" bash -s -- "$runtime" "$shard_path" "$squash" \
    "$CX_IMAGE_PLATFORM" \
    > "$log" 2>&1 <<'BASH' || rc=$?
set -euo pipefail
machine="$(uname -m)"
case "$4:$machine" in
  linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
  *) exit 13 ;;
esac
test -r "$1" || exit 10
[ -z "$2" ] || test -r "$2" || exit 11
test -r "$3" || exit 12
unsquashfs -s "$3" >/dev/null 2>&1 || exit 12
BASH
  [ "$rc" = 0 ] && return 0
  case "$rc" in
    10|11) cx_fail_stage repository-stage "$log" ;;
    12) cx_fail_stage container-hash "$log" ;;
    *) cx_fail_stage container-launch "$log" ;;
  esac
  return 1
}

# cx_stage_repo REPO_ROOT [STAGE_DIR] -> echoes the mount-source root.
# Stage only the public benchmark tree onto compute-visible storage. Canonical
# GHA requires an operator-configured base; manual diagnostics use an isolated
# directory under the already-required squash storage so ignored private notes
# are never mounted into a compute container.
cx_stage_repo() {
  local repo_root="$1" stage_dir="${2:-}" log tag safe_tag
  cx_validate_shard_control "$repo_root/experimental/CollectiveX"
  if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] \
    && { [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; }; then
    cx_die "canonical CollectiveX execution requires compute-visible staging"
  fi
  # Concurrency isolation. Under GHA the per-config concurrency fan-out runs many
  # same-SKU dispatches at once, all staging into the SAME shared base dir; a
  # shared dir + `rsync --delete` lets one job unlink/replace a file a peer is
  # mid-read of -> "error reading input file: Stale file handle" on the next
  # `srun ... run_in_container.sh`. Give each EXECUTING job its own subdir keyed on
  # a workflow-provided execution id. Manual runs use the launcher PID.
  tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
  # Replace every character outside [A-Za-z0-9._-] so the tag is a safe path component.
  safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
    [ -n "${CX_SQUASH_DIR:-}" ] \
      || cx_die "manual CollectiveX staging requires CX_SQUASH_DIR"
    stage_dir="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag"
  else
    stage_dir="${stage_dir%/}/job_$safe_tag"
  fi
  mkdir -p "${stage_dir%/*}" 2>/dev/null \
    || cx_die "cannot create the configured stage base"
  if [ -e "$stage_dir" ] || [ -L "$stage_dir" ]; then
    cx_die "refusing to reuse a pre-existing execution stage"
  fi
  mkdir -m 700 "$stage_dir" 2>/dev/null \
    || cx_die "cannot create the configured stage directory"
  mkdir -m 700 "$stage_dir/experimental" 2>/dev/null \
    || cx_die "cannot create the configured stage directory"
  cx_log "staging CollectiveX on compute-visible storage"
  log="$(cx_private_log_path repository-stage)"
  if ! rsync -a --delete --delete-excluded \
    --exclude='__pycache__/' --exclude='results/' --exclude='.cx_workloads/' \
    --exclude='.cx_backend/' --exclude='.cx_sources/' \
    --exclude='configs/platforms.yaml' --exclude='private-infra.md' \
    --exclude='goal.md' --exclude='notes.md' \
    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" > "$log" 2>&1; then
    rm -rf -- "$stage_dir" >/dev/null 2>&1 \
      || cx_log "ERROR: cannot remove the incomplete execution stage"
    cx_fail_stage repository-stage "$log" || true
    return 1
  fi
  echo "$stage_dir"
}

# cx_collect_results MOUNT_SRC REPO_ROOT
# When the run used a staged (compute-visible) mount, copy result JSONs back to
# the original checkout's results/ so the workflow's upload-artifact (which reads
# the checkout, not the stage dir) finds them. No-op when no staging was used.
# Returns 1 if the staged run left no *.json results or the copy fails.
cx_collect_results() {
  local mount_src="$1" repo_root="$2" dst log
  local -a files
  [ "$mount_src" = "$repo_root" ] && return 0
  log="$(cx_private_log_path "artifact-collection-$$-${RANDOM}")"
  dst="$repo_root/experimental/CollectiveX/results"
  mkdir -p "$dst" 2>> "$log" \
    || { cx_log "ERROR: cannot create checkout result directory"; return 1; }
  # nullglob makes an unmatched pattern expand to an empty array instead of itself.
  shopt -s nullglob
  files=("$mount_src/experimental/CollectiveX/results/"*.json)
  shopt -u nullglob
  [ "${#files[@]}" -gt 0 ] || { cx_log "ERROR: staged run produced no result JSON"; return 1; }
  cp -- "${files[@]}" "$dst/" >> "$log" 2>&1 \
    || { cx_log "ERROR: staged result collection failed"; return 1; }
  cx_log "collected staged results for artifact validation"
}

# cx_cleanup_stage MOUNT_SRC REPO_ROOT
# Remove the per-execution stage directory, but only when MOUNT_SRC equals the
# path cx_stage_repo would have generated for this execution tag; "/" and the
# stage base itself are always refused. No-op when no staging was used.
cx_cleanup_stage() {
  local mount_src="$1" repo_root="$2" base="${CX_STAGE_DIR:-}" tag safe_tag expected
  tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
  safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
  [ "$mount_src" != "$repo_root" ] || return 0
  if [ -n "$base" ] && [ "$base" != "$repo_root" ]; then
    expected="${base%/}/job_$safe_tag"
  else
    [ -n "${CX_SQUASH_DIR:-}" ] \
      || { cx_log "ERROR: cannot identify the generated stage directory"; return 1; }
    expected="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag"
  fi
  if [ "$mount_src" != "$expected" ] || [ "$mount_src" = / ] \
    || { [ -n "$base" ] && [ "$mount_src" = "$base" ]; }; then
    cx_log "ERROR: refusing to remove an unrecognized stage directory"
    return 1
  fi
  rm -rf -- "$mount_src" >/dev/null 2>&1 || {
    cx_log "ERROR: cannot remove generated stage directory"
    return 1
  }
  cx_log "removed generated per-execution stage directory"
}

# Return success only when a benchmark output is a complete JSON result object.
# Callers use this before synthesizing a terminal outcome so an emitted invalid result
# is not shadowed by a second record for the same attempt.
# cx_has_result_doc PATH -> status of "contracts.py probe PATH" (output discarded).
cx_has_result_doc() {
  local path="$1"
  python3 "$_CX_COMMON_ROOT/contracts.py" probe "$path" >/dev/null 2>&1
}

# cx_result_doc_is PATH STATUS -> status of "contracts.py probe PATH --status STATUS".
cx_result_doc_is() {
  local path="$1" expected="$2"
  python3 "$_CX_COMMON_ROOT/contracts.py" probe "$path" --status "$expected" \
    >/dev/null 2>&1
}

# A rank-zero result can be written before another rank or backend teardown fails. Preserve its
# measurements, but make the distributed command's nonzero terminal status authoritative.
# cx_demote_result_doc PATH RETURN_CODE -> runs "contracts.py demote".
cx_demote_result_doc() {
  local path="$1" rc="$2"
  python3 "$_CX_COMMON_ROOT/contracts.py" demote "$path" --return-code "$rc"
}

# cx_quarantine_result_doc PATH -> runs "contracts.py quarantine-invalid PATH".
cx_quarantine_result_doc() {
  python3 "$_CX_COMMON_ROOT/contracts.py" quarantine-invalid "$1"
}

# cx_emit_ep_failed_case OUT BACKEND PHASE RETURN_CODE
# Preserve failures from rack launchers that invoke run_ep.py directly and therefore cannot use
# run_in_container.sh's emitter. Case identity is read from the exported CX_* variables.
# CX_FAILURE_MODE, when set, is forwarded as --failure-mode.
cx_emit_ep_failed_case() {
  local out="$1" backend="$2" phase="$3" rc="$4"
  local -a args=(emit-terminal --out "$out" --backend "$backend" --phase "$phase"
    --return-code "$rc")
  [ -z "${CX_FAILURE_MODE:-}" ] || args+=(--failure-mode "$CX_FAILURE_MODE")
  if ! python3 "$_CX_COMMON_ROOT/contracts.py" "${args[@]}"
  then
    cx_log "ERROR: could not preserve terminal outcome"
    return 1
  fi
}

# cx_case_attempt_exists OUT_DIR CASE_ID
# Exit 0 if OUT_DIR holds a valid raw or terminal result document whose
# identity.case_id equals CASE_ID, 1 otherwise. As a side effect, documents that
# fail to load or validate — and sample files no valid raw result references —
# are renamed with a ".quarantine" suffix. Documents with an unrecognized
# format are left untouched.
cx_case_attempt_exists() {
  local out_dir="$1" case_id="$2"
  python3 - "$_CX_COMMON_ROOT" "$out_dir" "$case_id" <<'PY'
import pathlib, sys

sys.path.insert(0, sys.argv[1])
import contracts

sample_paths = set()
referenced_samples = set()
found = False

def quarantine(path, document):
    sample = document.get("sample_artifact") if isinstance(document, dict) else None
    if (
        isinstance(sample, dict)
        and isinstance(sample.get("path"), str)
        and pathlib.Path(sample["path"]).name == sample["path"]
    ):
        sample_path = path.with_name(sample["path"])
        if sample_path.is_file():
            sample_path.replace(sample_path.with_name(sample_path.name + ".quarantine"))
    if path.is_file():
        path.replace(path.with_name(path.name + ".quarantine"))

for path in pathlib.Path(sys.argv[2]).glob("*.json"):
    document = None
    try:
        document = contracts.strict_load(path)
        if not isinstance(document, dict):
            continue
        if document.get("format") == contracts.RAW_FORMAT:
            document = contracts.load_raw_attempt(path)
            referenced_samples.add(path.with_name(document["sample_artifact"]["path"]))
        elif document.get("format") == contracts.TERMINAL_FORMAT:
            document = contracts.validate_terminal_document(document)
        elif document.get("format") == contracts.SAMPLES_FORMAT:
            contracts.validate_samples_document(document)
            sample_paths.add(path)
            continue
        else:
            continue
    except (contracts.ContractError, OSError, ValueError):
        quarantine(path, document)
        continue
    if document["identity"]["case_id"] == sys.argv[3]:
        found = True
for orphan in sample_paths - referenced_samples:
    quarantine(orphan, {})
raise SystemExit(0 if found else 1)
PY
}

# Emit one setup-failure record per requested case. Rack launchers call this when
# backend preparation fails before rank processes can start.
+cx_emit_setup_failures() { + local root="$1" out_dir="$2" backend="$3" rc="$4" shard="${CX_SHARD_FILE:-}" path + local phase case_id suite workload required routing eplb ep hidden topk experts nodes + local gpn domain ladder canonical timing + local cases_file expected emitted=0 covered=0 + mkdir -p "$out_dir" || return 1 + export CX_FAILURE_MODE="${CX_FAILSAFE_MODE:-setup}" CX_ATTEMPT_ID=1 + if [ -z "$shard" ]; then + local phases="${CX_PHASE:-decode}" + [ "$phases" = both ] && phases="decode prefill" + for phase in $phases; do + if [ -n "${CX_CASE_ID:-}" ] && cx_case_attempt_exists "$out_dir" "$CX_CASE_ID"; then + continue + fi + cx_emit_ep_failed_case "$out_dir/failed_${backend}_${phase}_${CX_TS:-setup}-a01.json" \ + "$backend" "$phase" "$rc" || return 1 + done + unset CX_FAILURE_MODE + return 0 + fi + path="$shard" + [ -f "$path" ] || path="${root%/}/$shard" + [ -f "$path" ] || { + unset CX_FAILURE_MODE + cx_log "ERROR: cannot emit setup failures without shard control" + return 1 + } + export COLLECTIVEX_CONTROL_SHA256 + COLLECTIVEX_CONTROL_SHA256="$(sha256sum "$path" | awk '{print $1}')" + [[ "$COLLECTIVEX_CONTROL_SHA256" =~ ^[0-9a-f]{64}$ ]] || { + unset CX_FAILURE_MODE COLLECTIVEX_CONTROL_SHA256 + cx_log "ERROR: cannot hash shard for setup-failure records" + return 1 + } + cases_file="$(mktemp)" || return 1 + if ! 
python3 - "$path" > "$cases_file" <<'PY' +import json, sys + +with open(sys.argv[1]) as handle: + cases = json.load(handle)["cases"] +for case in cases: + fields = ( + case["phase"], case["case_id"], case["suite"], case["workload"], + case["required_publication"], case["routing"], "1" if case["eplb"] else "", + case["ep"], case["hidden"], case["topk"], case["experts"], case["nodes"], + case["gpus_per_node"], case["scale_up_domain"], case["ladder"], + "1" if case["canonical"] else "", case["timing"], + ) + print("|".join(map(str, fields))) +PY + then + rm -f "$cases_file" + unset CX_FAILURE_MODE + return 1 + fi + expected="$(wc -l < "$cases_file" | tr -d ' ')" + [ "$expected" -gt 0 ] || { rm -f "$cases_file"; unset CX_FAILURE_MODE; return 1; } + while IFS='|' read -r phase case_id suite workload required routing eplb ep hidden topk experts \ + nodes gpn domain ladder canonical timing; do + export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload" + export CX_REQUIRED_PUBLICATION="$required" CX_ROUTING="$routing" CX_EPLB="$eplb" + export CX_EP="$ep" CX_NGPUS="$ep" CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts" + export CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn" CX_SCALE_UP_DOMAIN="$domain" + export CX_TOKENS_LADDER="$ladder" CX_CANONICAL="$canonical" + IFS=: read -r CX_ITERS CX_TRIALS CX_WARMUP <<< "$timing" + export CX_ITERS CX_TRIALS CX_WARMUP CX_SAMPLES_PER_POINT="$((CX_ITERS * CX_TRIALS))" + if cx_case_attempt_exists "$out_dir" "$case_id"; then + covered=$((covered + 1)) + continue + fi + cx_emit_ep_failed_case "$out_dir/failed_${case_id}-a01.json" "$backend" "$phase" "$rc" || return 1 + emitted=$((emitted + 1)) + done < "$cases_file" + rm -f "$cases_file" + unset CX_FAILURE_MODE + [ "$((emitted + covered))" -eq "$expected" ] || { + cx_log "ERROR: covered $((emitted + covered))/$expected terminal cases" + return 1 + } +} + +cx_launcher_cleanup() { + local rc="$1" source_root="${MOUNT_SRC:-${REPO_ROOT:-}}" out_dir allocation_stopped=1 + 
trap - EXIT + if [ -n "${COLLECTIVEX_EPHEMERAL_CONFIG_PATH:-}" ]; then + rm -f -- "$COLLECTIVEX_EPHEMERAL_CONFIG_PATH" >/dev/null 2>&1 || true + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + fi + if [ -n "${JOB_ID:-}" ]; then + if ! cx_cancel_job "$JOB_ID"; then + allocation_stopped=0 + [ "$rc" != 0 ] || rc=1 + fi + elif [ "${CX_ALLOCATION_REQUESTED:-0}" = 1 ]; then + allocation_stopped=0 + [ "$rc" != 0 ] || rc=1 + fi + if [ "$allocation_stopped" = 1 ]; then + cx_write_cleanup_guard safe || true + else + cx_write_cleanup_guard unsafe || true + fi + [ "$allocation_stopped" = 1 ] || source_root="${REPO_ROOT:-$source_root}" + if [ "$rc" != 0 ] && [ -n "${REPO_ROOT:-}" ] && [ -n "${CX_BENCH:-}" ]; then + cx_log "ERROR: terminal-failure-class=${CX_FAILSAFE_MODE:-setup}" + [ -d "$source_root/experimental/CollectiveX" ] || source_root="$REPO_ROOT" + out_dir="$source_root/experimental/CollectiveX/results" + cx_emit_setup_failures \ + "$source_root/experimental/CollectiveX" "$out_dir" "$CX_BENCH" "$rc" || true + [ "$source_root" = "$REPO_ROOT" ] \ + || cx_collect_results "$source_root" "$REPO_ROOT" || true + fi + if [ "$allocation_stopped" = 1 ] && [ -n "${REPO_ROOT:-}" ] \ + && [ "$source_root" != "$REPO_ROOT" ]; then + if ! cx_cleanup_stage "$source_root" "$REPO_ROOT"; then + [ "$rc" != 0 ] || rc=1 + fi + fi + [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || cx_cleanup_private_logs "$rc" + exit "$rc" +} + +cx_install_launcher_fail_safe() { + trap 'cx_launcher_cleanup "$?"' EXIT +} diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh new file mode 100644 index 0000000000..119efa7ffc --- /dev/null +++ b/experimental/CollectiveX/runtime/run_in_container.sh @@ -0,0 +1,1002 @@ +#!/usr/bin/env bash +# CollectiveX — generic in-container benchmark dispatcher (single-node). +# +# Runs INSIDE the container under `srun` for single-node shards. The GB EP8 launcher invokes +# run_ep.py directly across nodes. 
The SKU adapter handles allocation/container/transport-env; +# this script selects one EP backend from CX_BENCH and writes result JSON under results/. +# +# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO +# Selector: CX_BENCH = deepep | deepep-v2 | mori | uccl | nccl-ep | deepep-hybrid +# EP knobs passed to tests/run_ep.py: +# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep +# CX_TOKENS_LADDER (space/comma sep; blank = phase default) +# CX_HIDDEN CX_TOPK CX_EXPERTS CX_ROUTING CX_SEED CX_ITERS +set -euo pipefail + +cd /ix/experimental/CollectiveX +# shellcheck source=../runtime/common.sh +source runtime/common.sh +mkdir -p results +cx_write_runtime_stage backend-setup || cx_die "cannot record runtime stage" + +: "${CX_RUNNER:?CX_RUNNER not set}" +: "${CX_NGPUS:?CX_NGPUS not set}" +: "${CX_TS:?CX_TS not set}" +: "${CX_TOPO:?CX_TOPO not set}" +CX_BENCH="${CX_BENCH:-deepep}" +CX_TRANSPORT="${CX_TRANSPORT:-}" + +cx_apply_timing_profile + +cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO" + +# Blank ladders use the phase default in tests/run_ep.py. +cx_ep_ladder() { + printf '%s' "${CX_TOKENS_LADDER:-}" +} + +# Canonical workload staging. Every SKU/backend generates identical canonical array bytes and +# content IDs in-container; the NPZ container bytes themselves are not an identity boundary. When CX_CANONICAL=1 +# (and CX_WORKLOAD_DIR not already provided) we generate routing traces for the run's ladder +# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and +# point run_ep at it. Raw attempts remain diagnostic until the publisher validates full coverage. +cx_stage_canonical() { + [ "${CX_CANONICAL:-0}" = "1" ] || return 0 + [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0 + local dir="$PWD/.cx_workloads" + local ladder; ladder="$(cx_ep_ladder)" + # cover both phase ladders when none is given, so either phase finds its files. 
+ [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096" + cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')" + python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \ + --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \ + --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \ + || { cx_log "ERROR: canonical workload staging failed"; return 1; } + export CX_WORKLOAD_DIR="$dir" + cx_log "canonical workloads staged at $dir" +} + +# run_ep_suite +# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and +# combine are timed separately inside it. One JSON per (backend, phase). +# Preserve a failed case with its full scheduled identity instead of letting it vanish. +emit_failed_case() { # backend phase rc + cx_emit_ep_failed_case \ + "results/failed_${CX_RUNNER}_${1}_${2}_${CX_TS}.json" "$1" "$2" "$3" || true +} + +run_ep_suite() { + local backend="$1" phase phases ladder failure_kind rc=0 rc_run + ladder="$(cx_ep_ladder)" + phases="${CX_PHASE:-decode}" + [ "$phases" = "both" ] && phases="decode prefill" + if ! 
cx_stage_canonical; then + for phase in $phases; do + emit_failed_case "$backend" "$phase" 2 + done + return 1 + fi + for phase in $phases; do + cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'" + local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json" + local -a EPARGS=(--backend "$backend" --phase "$phase" --tokens-ladder "$ladder" + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" + --routing "${CX_ROUTING:-uniform}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-8}" + --trials "${CX_TRIALS:-64}" --warmup "${CX_WARMUP:-32}" + --gpus-per-node "${CX_GPUS_PER_NODE:-0}" --scale-up-domain "${CX_SCALE_UP_DOMAIN:-0}" + --case-id "${CX_CASE_ID:-}" --suite "${CX_SUITE:-}" --workload-name "${CX_WORKLOAD_NAME:-}" + --required-publication "${CX_REQUIRED_PUBLICATION:-}" + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" + --out "$out") + [ -n "${CX_EPLB:-}" ] && EPARGS+=(--eplb) + [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR") + cx_write_runtime_stage execution || cx_die "cannot record runtime stage" + if timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \ + torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}"; then + rc_run=0 + else + rc_run=$? + fi + if [ "$rc_run" = 0 ] && cx_result_doc_is "$out" invalid; then + cx_log "WARN: $backend $phase completed with invalid semantic evidence" + rc=1 + continue + fi + if [ "$rc_run" = 0 ] && ! 
cx_result_doc_is "$out" success; then + rc_run=1 + fi + if [ "$rc_run" != 0 ]; then + failure_kind=failed + [ "$rc_run" != 124 ] && [ "$rc_run" != 137 ] || failure_kind="timed out" + if [ "$failure_kind" = "timed out" ]; then + cx_log "WARN: $backend $phase run timed out rc=$rc_run (limit=${CX_RUN_TIMEOUT:-900}s)" + else + cx_log "WARN: $backend $phase run failed rc=$rc_run" + fi + if cx_has_result_doc "$out"; then + cx_demote_result_doc "$out" "$rc_run" \ + || { cx_quarantine_result_doc "$out"; emit_failed_case "$backend" "$phase" "$rc_run"; } + cx_log "preserved benchmark output as a failed attempt" + else + cx_quarantine_result_doc "$out" + emit_failed_case "$backend" "$phase" "$rc_run" + fi + rc=1 + fi + done + return "$rc" +} + +# Resolve and verify the actual CUDA target before compiling source kernels. +cx_cuda_arch() { + local expected detected + case "$CX_RUNNER" in + h100*|h200*) expected="9.0" ;; + b200*|gb200*) expected="10.0" ;; + b300*|gb300*) expected="10.3" ;; + *) cx_log "ERROR: no CUDA target registered for $CX_RUNNER"; return 1 ;; + esac + detected="$(python3 - <<'PY' +import torch + +major, minor = torch.cuda.get_device_capability() +print(f"{major}.{minor}") +PY +)" || return 1 + [ "$detected" = "$expected" ] || { + cx_log "ERROR: $CX_RUNNER expected CUDA target $expected, detected $detected" + return 1 + } + printf '%s' "$detected" +} + +cx_nvidia_package_root() { + local package="$1" component="$2" + python3 - "$package" "$component" <<'PY' +from importlib import metadata +from pathlib import Path, PurePosixPath +import sys + +package, component = sys.argv[1:] +try: + distribution = metadata.distribution(package) + prefix = f"nvidia/{component}/" + entries = [str(entry).replace("\\", "/") for entry in distribution.files or ()] + if not any(entry.startswith(prefix) for entry in entries): + raise ValueError + root = Path(distribution.locate_file(PurePosixPath("nvidia") / component)).resolve() + if not root.is_dir(): + raise ValueError +except 
(metadata.PackageNotFoundError, OSError, TypeError, ValueError): + raise SystemExit(1) +print(root, end="") +PY +} + +cx_prepare_cuda_cccl() { + local cccl="" candidate cuda_home nvcc + nvcc="$(command -v nvcc)" \ + || { cx_log "ERROR: CUDA nvcc is unavailable"; return 1; } + nvcc="$(readlink -f -- "$nvcc")" \ + || { cx_log "ERROR: CUDA nvcc cannot be resolved"; return 1; } + case "$nvcc" in + */bin/nvcc) cuda_home="${nvcc%/bin/nvcc}" ;; + *) cx_log "ERROR: CUDA nvcc has an unexpected path"; return 1 ;; + esac + [ -x "$cuda_home/bin/nvcc" ] && [ -d "$cuda_home/include" ] \ + && [ -d "$cuda_home/lib64" ] \ + || { cx_log "ERROR: CUDA toolkit root is incomplete"; return 1; } + for candidate in "$cuda_home"/targets/*/include/cccl; do + if [ -d "$candidate" ]; then + cccl="$candidate" + break + fi + done + [ -n "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; } + export CUDA_HOME="$cuda_home" CX_CUDA_CCCL="$cccl" + export CPATH="$cccl:${CPATH:-}" + export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}" +} + +cx_prepare_deepep_toolchain() { + local packaged overlay path root temporary + packaged="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \ + || { cx_log "ERROR: nvidia.nvshmem is unavailable"; return 1; } + root="$(cx_deepep_v2_root)" || return 1 + overlay="$root/nvshmem-overlay" + if ! ( + umask 077 + exec 8>"$root/nvshmem-overlay.lock" || exit 1 + flock 8 || exit 1 + if [ ! -d "$overlay" ]; then + temporary="$root/.nvshmem-overlay.$$" + rm -rf "$temporary" || exit 1 + mkdir -p "$temporary/lib" || exit 1 + ln -s "$packaged/include" "$temporary/include" || exit 1 + for path in "$packaged"/lib/*; do + ln -s "$path" "$temporary/lib/${path##*/}" || exit 1 + done + [ ! -e "$packaged/lib/libnvshmem_host.so.3" ] \ + || ln -sf "$packaged/lib/libnvshmem_host.so.3" \ + "$temporary/lib/libnvshmem_host.so" || exit 1 + mv "$temporary" "$overlay" || exit 1 + fi + [ ! 
-L "$overlay" ] \ + && [ "$(readlink -f "$overlay/include")" = "$(readlink -f "$packaged/include")" ] \ + && [ -e "$overlay/lib/libnvshmem_host.so" ] \ + && [ -e "$overlay/lib/libnvshmem_device.a" ] + ); then + cx_log "ERROR: DeepEP V2 NVSHMEM overlay is invalid" + return 1 + fi + NVSHMEM_DIR="$overlay" + export NVSHMEM_DIR + cx_prepare_cuda_cccl || return 1 + export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib:${LD_LIBRARY_PATH:-}" +} + +cx_probe_deepep() { + local expected_record_sha256 expected_version expected_wheel_sha256 + if [ "${COLLECTIVEX_IMAGE:-}" != "$CX_IMAGE_MULTIARCH" ] \ + || [ "${COLLECTIVEX_IMAGE_DIGEST:-}" != "$CX_IMAGE_MULTIARCH_DIGEST" ] \ + || [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" != 1 ]; then + cx_log "ERROR: DeepEP V1 requires the exact pinned multi-architecture image" + return 1 + fi + cx_cuda_arch >/dev/null || return 1 + case "$CX_RUNNER" in + gb200|gb300) + expected_version="1.1.0+814e508" + expected_wheel_sha256="784dabec0877b6cf72619b7e93eda7e2f365648487bd37fc3ff6960e53669313" + expected_record_sha256="2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882" + DEEPEP_COMMIT="814e508537c6ffc775d59f6f1b9ba43f3a65968c" + ;; + *) + expected_version="1.2.1" + expected_wheel_sha256="7c02c29306ea0fe2dd474618e72e0f310f260187a9c0700a656d2f6964e8c307" + expected_record_sha256="6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac" + DEEPEP_COMMIT="9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee" + ;; + esac + export DEEPEP_COMMIT + python3 - "$expected_version" "$expected_wheel_sha256" "$expected_record_sha256" <<'PY' || { +import base64 +import csv +import hashlib +import importlib.metadata as metadata +import io +import json +from pathlib import Path +import sys + +import deep_ep +from deep_ep import Buffer + +distribution = metadata.distribution("deep_ep") +assert distribution.version == sys.argv[1] +assert Buffer.__name__ == "Buffer" +recorded_files = { + Path(distribution.locate_file(entry)).resolve() for entry in distribution.files 
or () +} +buffer_module = sys.modules.get(Buffer.__module__) +assert Path(deep_ep.__file__).resolve() in recorded_files +assert buffer_module is not None and Path(buffer_module.__file__).resolve() in recorded_files +direct_url = json.loads(distribution.read_text("direct_url.json")) +assert direct_url["archive_info"]["hashes"]["sha256"] == sys.argv[2] +record_entry = next( + entry for entry in distribution.files or () + if str(entry).endswith(".dist-info/RECORD") +) +record = distribution.locate_file(record_entry).read_bytes() +assert hashlib.sha256(record).hexdigest() == sys.argv[3] +for path, encoded_digest, size in csv.reader(io.StringIO(record.decode())): + if not encoded_digest: + continue + algorithm, expected = encoded_digest.split("=", 1) + assert algorithm == "sha256" + payload = distribution.locate_file(path).read_bytes() + observed = base64.urlsafe_b64encode(hashlib.sha256(payload).digest()).decode().rstrip("=") + assert observed == expected + assert not size or len(payload) == int(size) +PY + cx_log "ERROR: container DeepEP build does not match its pinned image contract" + return 1 + } + cx_log "DeepEP image build ready ($DEEPEP_COMMIT)" +} + +# DeepEP V2 is PR #605's ElasticBuffer implementation with upstream PR #630's pure scale-up +# initialization fix. Canonical launchers stage the pinned source and mount a private cluster-local +# build cache at /cx-cache. +cx_deepep_v2_root() { + local arch cpu base identity key image_digest + arch="$(cx_cuda_arch)" || return 1 + cpu="$(uname -m)" + [[ "$cpu" =~ ^[A-Za-z0-9._-]+$ ]] || return 1 + base="${CX_BACKEND_CACHE_ROOT:-}" + [[ "$base" = /* ]] || return 1 + image_digest="${COLLECTIVEX_IMAGE_DIGEST:-manual-unverified}" + [[ "$image_digest" = manual-unverified || "$image_digest" =~ ^sha256:[0-9a-f]{64}$ ]] \ + || return 1 + # Bump the recipe generation whenever the build procedure changes. Benchmark-only + # source revisions must reuse the same immutable environment instead of leaking GBs. 
+ identity="deepep-v2-cache-v2|$cpu|sm${arch/./}|image=$image_digest|recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2|$CX_DEEPEP_V2_COMMIT|$CX_DEEPEP_V2_TREE|$CX_DEEPEP_V2_FMT_COMMIT|pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0|numpy=2.2.6|torch=2.10.0+cu130|nccl=2.30.4|nvshmem=3.3.9|max-jobs=16" + key="$(printf '%s' "$identity" | sha256sum | awk '{print $1}')" + [[ "$key" =~ ^[0-9a-f]{64}$ ]] || return 1 + printf '%s/deepep-v2-%s' "$base" "$key" +} + +cx_activate_deepep_v2() { + local root venv stage_root + root="$(cx_deepep_v2_root)" || return 1 + venv="$root/venv" + [ -x "$venv/bin/python" ] \ + || { cx_log "ERROR: DeepEP V2 venv interpreter is unavailable"; return 1; } + export VIRTUAL_ENV="$venv" + export PATH="$venv/bin:${PATH#"$venv/bin:"}" + EP_NCCL_ROOT_DIR="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" \ + || { cx_log "ERROR: DeepEP V2 NCCL package root is unavailable"; return 1; } + EP_NVSHMEM_ROOT_DIR="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \ + || { cx_log "ERROR: DeepEP V2 NVSHMEM package root is unavailable"; return 1; } + export EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR + export LD_LIBRARY_PATH="$EP_NCCL_ROOT_DIR/lib:$EP_NVSHMEM_ROOT_DIR/lib:${LD_LIBRARY_PATH:-}" + case "${CX_BACKEND_SOURCE_ROOT:-}" in + /*/.cx_sources) stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}" ;; + *) cx_log "ERROR: DeepEP V2 job-local source root is unavailable"; return 1 ;; + esac + [ -d "$stage_root" ] && [ ! -L "$stage_root" ] \ + || { cx_log "ERROR: DeepEP V2 job-local stage is invalid"; return 1; } + # JIT CUBINs are evidence from this shard, not part of the persistent AOT environment. + # Keeping them on the isolated staged tree prevents a prior driver/topology attempt + # from seeding a later run; all ranks and cases in this shard still share one cold build. 
+ export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit" + export EP_REUSE_NCCL_COMM=1 + export DEEPEP_V2_PR=605 DEEPEP_V2_FIX_PR=630 + DEEPEP_V2_COMMIT="$CX_DEEPEP_V2_COMMIT" + DEEPEP_V2_TREE="$CX_DEEPEP_V2_TREE" + DEEPEP_V2_FMT_COMMIT="$CX_DEEPEP_V2_FMT_COMMIT" + export DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT + [ ! -L "$stage_root/.cx_backend" ] && [ ! -L "$EP_JIT_CACHE_DIR" ] \ + || { cx_log "ERROR: DeepEP V2 JIT cache path is unsafe"; return 1; } + if ! mkdir -p "$EP_JIT_CACHE_DIR" \ + || ! chmod 700 "$stage_root/.cx_backend" "$EP_JIT_CACHE_DIR"; then + cx_log "ERROR: DeepEP V2 JIT cache is unavailable" + return 1 + fi + unset EP_SUPPRESS_NCCL_CHECK +} + +cx_enable_deepep_v2_jit_reproducibility() { + local seed="collectivex-deepep-v2-fa8a9b1" cccl + [ -n "${CUDA_HOME:-}" ] \ + || { cx_log "ERROR: active CUDA toolkit is unavailable"; return 1; } + cccl="${CX_CUDA_CCCL:-}" + case "$cccl" in + "$CUDA_HOME"/targets/*/include/cccl) ;; + *) cx_log "ERROR: CUDA CCCL headers differ from the active toolkit"; return 1 ;; + esac + [ -d "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; } + CPATH="$cccl" + NVCC_PREPEND_FLAGS="--frandom-seed=$seed -I$cccl" + DEEPEP_V2_JIT_RANDOM_SEED="$seed" + EP_JIT_DUMP_SASS=1 + unset EP_JIT_DEBUG EP_JIT_DUMP_ASM EP_JIT_DUMP_PTX EP_JIT_WITH_LINEINFO + unset EP_JIT_PTXAS_VERBOSE EP_JIT_PRINT_COMPILER_COMMAND EP_JIT_NVCC_COMPILER + unset EP_JIT_CPP_STANDARD EP_JIT_PTXAS_CHECK EP_GIN_GDAKI_DEBUG EP_NUM_TOPK_IDX_BITS + export CPATH DEEPEP_V2_JIT_RANDOM_SEED EP_JIT_DUMP_SASS NVCC_PREPEND_FLAGS +} + +cx_probe_deepep_v2() { + python3 - <<'PY' +import ctypes +import importlib.metadata as metadata +import inspect +import os + +import torch + +assert torch.__version__ == "2.10.0+cu130", torch.__version__ +assert metadata.version("nvidia-nccl-cu13") == "2.30.4" +assert metadata.version("nvidia-nvshmem-cu12") == "3.3.9" +assert metadata.version("numpy") == "2.2.6" + +import deep_ep +assert 
deep_ep.__version__ == "2.0.0", deep_ep.__version__ +assert metadata.version("deep_ep") == "2.0.0+fa8a9b1" +assert inspect.isclass(deep_ep.ElasticBuffer) +assert deep_ep.ElasticBuffer.__name__ == "ElasticBuffer" +assert os.environ.get("EP_SUPPRESS_NCCL_CHECK") is None +with open("/proc/self/maps", encoding="utf-8") as handle: + loaded_nccl = { + os.path.realpath(line.rstrip().split()[-1]) + for line in handle + if "libnccl.so" in line and os.path.isfile(line.rstrip().split()[-1]) + } +assert len(loaded_nccl) == 1 +runtime_version = ctypes.c_int() +assert ctypes.CDLL(loaded_nccl.pop()).ncclGetVersion(ctypes.byref(runtime_version)) == 0 +assert runtime_version.value == 23004, runtime_version.value +PY +} + +cx_deepep_v2_content_sha256() { + python3 - <<'PY' +import hashlib +from importlib import metadata +import os +from pathlib import Path, PurePosixPath +import stat + +distribution = metadata.distribution("deep_ep") +entries = sorted(distribution.files or (), key=lambda entry: entry.as_posix()) +if not entries: + raise SystemExit(1) +venv_path = Path(os.environ["VIRTUAL_ENV"]).absolute() +if venv_path.is_symlink() or not venv_path.is_dir(): + raise SystemExit(1) +venv = venv_path.resolve(strict=True) +digest = hashlib.sha256() +extension = False +for entry in entries: + relative = PurePosixPath(entry.as_posix()) + if ( + relative.is_absolute() + or ".." 
in relative.parts + or not relative.parts + or not ( + relative.parts[0] == "deep_ep" + or relative.parts[0].startswith("deep_ep-") + and relative.parts[0].endswith(".dist-info") + ) + ): + raise SystemExit(1) + path = Path(distribution.locate_file(entry)).absolute() + resolved = path.resolve(strict=True) + try: + path.relative_to(venv_path) + resolved.relative_to(venv) + except ValueError: + raise SystemExit(1) + parent = path.parent + while parent != venv_path: + if parent.is_symlink(): + raise SystemExit(1) + parent = parent.parent + item = os.lstat(path) + if not stat.S_ISREG(item.st_mode): + raise SystemExit(1) + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (item.st_dev, item.st_ino): + raise SystemExit(1) + file_digest = hashlib.sha256() + while chunk := os.read(descriptor, 1024 * 1024): + file_digest.update(chunk) + finally: + os.close(descriptor) + name = relative.as_posix() + extension |= name.startswith("deep_ep/") and name.endswith(".so") + digest.update(name.encode()) + digest.update(b"\0") + digest.update(str(item.st_size).encode()) + digest.update(b"\0") + digest.update(file_digest.digest()) +if not extension: + raise SystemExit(1) +print(digest.hexdigest(), end="") +PY +} + +cx_deepep_v2_marker_content_sha256() { + local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6" + python3 - "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" <<'PY' +import os +import re +import stat +import sys + +root, marker, revision, tree, fmt_revision, cache_key = sys.argv[1:] +try: + root_item = os.lstat(root) + marker_item = os.lstat(marker) + children = [os.lstat(os.path.join(root, name)) for name in ("source", "venv")] + if ( + not stat.S_ISDIR(root_item.st_mode) + or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700 + or not stat.S_ISREG(marker_item.st_mode) + or marker_item.st_uid != root_item.st_uid + or 
stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600 + or marker_item.st_size > 1024 + or any( + not stat.S_ISDIR(child.st_mode) + or child.st_uid != root_item.st_uid + or stat.S_IMODE(child.st_mode) & 0o022 + for child in children + ) + ): + raise OSError + descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino): + raise OSError + payload = os.read(descriptor, 1025) + finally: + os.close(descriptor) + lines = payload.decode("ascii").splitlines() + if lines[:4] != [revision, tree, fmt_revision, cache_key] or len(lines) != 5: + raise ValueError + if not re.fullmatch(r"[0-9a-f]{64}", lines[4]): + raise ValueError +except (OSError, UnicodeError, ValueError): + raise SystemExit(1) +print(lines[4], end="") +PY +} + +cx_deepep_v2_cache_is_valid() { + local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6" + local expected_content actual_content + expected_content="$( + cx_deepep_v2_marker_content_sha256 \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" + )" || return 1 + [ -d "$root/source" ] && [ ! 
-L "$root/source" ] \ + && [ "$(cx_git_in_tree "$root/source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \ + && [ "$(cx_git_in_tree "$root/source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt_revision" ] \ + || return 1 + cx_activate_deepep_v2 || return 1 + actual_content="$(cx_deepep_v2_content_sha256)" || return 1 + [ "$actual_content" = "$expected_content" ] +} + +cx_build_deepep_v2() { + local root venv source marker marker_tmp lock_path arch cache_key cache_ready content_sha256 + local revision="fa8a9b16898204afd347c663b89e65ef87dc6ce6" + local tree="29809e75c5874e6609dac4804e7b651d5226959f" + local fmt_revision="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" + cx_verify_backend_cache_mount \ + || { cx_log "ERROR: DeepEP V2 cache mount identity validation failed"; return 1; } + arch="$(cx_cuda_arch)" || return 1 + root="$(cx_deepep_v2_root)" || return 1 + cache_key="${root##*/deepep-v2-}" + [[ "$cache_key" =~ ^[0-9a-f]{64}$ ]] || return 1 + venv="$root/venv"; source="$root/source"; marker="$root/.collectivex-complete" + lock_path="${root}.lock" + command -v flock >/dev/null || { cx_log "ERROR: flock is required for DeepEP V2"; return 1; } + mkdir -p "${root%/*}" || return 1 + cx_log "DeepEP V2: preparing PR #605 implementation with upstream PR #630 fix ($revision)" + if ! ( + [ ! 
-L "$lock_path" ] \ + || { cx_log "ERROR: DeepEP V2 cache lock is unsafe"; exit 1; } + (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" \ + || { cx_log "ERROR: DeepEP V2 cache-lock-create failed"; exit 1; } + exec 9<>"$lock_path" \ + || { cx_log "ERROR: DeepEP V2 cache-lock-open failed"; exit 1; } + flock 9 \ + || { cx_log "ERROR: DeepEP V2 cache-lock-acquire failed"; exit 1; } + cache_ready=0 + if [ -e "$marker" ] || [ -L "$marker" ]; then + if ( + cx_deepep_v2_cache_is_valid \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" + ); then + cache_ready=1 + else + cx_log "ERROR: published DeepEP V2 cache failed integrity validation; refusing reset" + exit 1 + fi + fi + if [ "$cache_ready" != 1 ]; then + if [ -e "$root" ] || [ -L "$root" ]; then + rm -rf "$root" \ + || { cx_log "ERROR: incomplete DeepEP V2 cache-reset failed"; exit 1; } + fi + mkdir -m 700 "$root" \ + || { cx_log "ERROR: DeepEP V2 cache-create failed"; exit 1; } + python3 -m venv "$venv" \ + || { cx_log "ERROR: DeepEP V2 venv creation failed"; exit 1; } + "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + "pip==26.1.2" "setuptools==82.0.1" "wheel==0.47.0" "ninja==1.13.0" \ + "numpy==2.2.6" "nvidia-nvshmem-cu12==3.3.9" >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 build-tool installation failed"; exit 1; } + "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + --index-url https://download.pytorch.org/whl/cu130 \ + --extra-index-url https://pypi.org/simple "torch==2.10.0" >&2 2>&1 \ + || { cx_log "ERROR: torch 2.10.0+cu130 installation failed"; exit 1; } + # Torch pins NCCL 2.28.9; the PR #605 ElasticBuffer implementation requires 2.30.4. 
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + --force-reinstall --no-deps "nvidia-nccl-cu13==2.30.4" >&2 2>&1 \ + || { cx_log "ERROR: NCCL 2.30.4 installation failed"; exit 1; } + cx_activate_deepep_v2 \ + || { cx_log "ERROR: DeepEP V2 environment activation failed"; exit 1; } + cx_prepare_deepep_toolchain \ + || { cx_log "ERROR: DeepEP V2 toolchain preparation failed"; exit 1; } + EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR" + export EP_NVSHMEM_ROOT_DIR + cx_materialize_backend_source deepep-v2 "$source" \ + || { cx_log "ERROR: DeepEP V2 staged source is invalid"; exit 1; } + (cd "$source" && SOURCE_DATE_EPOCH="$(cx_git_in_tree "$source" show -s --format=%ct HEAD)" \ + TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \ + python3 -m pip install -q --no-build-isolation --no-deps --force-reinstall .) >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 build failed"; exit 1; } + cx_probe_deepep_v2 \ + || { cx_log "ERROR: DeepEP V2 ElasticBuffer/runtime probe failed"; exit 1; } + content_sha256="$(cx_deepep_v2_content_sha256)" \ + || { cx_log "ERROR: DeepEP V2 installed-content hashing failed"; exit 1; } + marker_tmp="$(mktemp "$root/.collectivex-complete.tmp.XXXXXX")" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-create failed"; exit 1; } + chmod 600 "$marker_tmp" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-permission failed"; exit 1; } + printf '%s\n%s\n%s\n%s\n%s\n' \ + "$revision" "$tree" "$fmt_revision" "$cache_key" "$content_sha256" > "$marker_tmp" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-write failed"; exit 1; } + mv -f -- "$marker_tmp" "$marker" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-publish failed"; exit 1; } + fi + cx_deepep_v2_cache_is_valid \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" \ + || { cx_log "ERROR: DeepEP V2 cache validation failed"; exit 1; } + ); then + cx_log "ERROR: shared DeepEP V2 environment is incomplete" + return 1 + fi + cx_activate_deepep_v2 || return 1 + cx_prepare_deepep_toolchain 
|| return 1 + cx_enable_deepep_v2_jit_reproducibility || return 1 + EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR" + export EP_NVSHMEM_ROOT_DIR + cx_probe_deepep_v2 || { cx_log "ERROR: DeepEP V2 shared runtime probe failed"; return 1; } + cx_log "DeepEP V2 ready ($DEEPEP_V2_COMMIT, ElasticBuffer, NCCL Device API; LSA/Gin selected by adapter)" +} + +# Build the pinned DeepEP `hybrid-ep` implementation for one NVLink/MNNVL domain. CUDA 13 moved +# libcudacxx headers under cccl, but this intradomain path does not use the separate NVSHMEM +# toolchain required by DeepEP V2. +cx_deepep_hybrid_marker_content_sha256() { + python3 - "$1" "$2" "$3" "$4" <<'PY' +import os +import re +import stat +import sys + +root, marker, revision, tree = sys.argv[1:] +try: + root_item = os.lstat(root) + marker_item = os.lstat(marker) + if ( + not stat.S_ISDIR(root_item.st_mode) + or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700 + or not stat.S_ISREG(marker_item.st_mode) + or marker_item.st_uid != root_item.st_uid + or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600 + or marker_item.st_size > 512 + ): + raise OSError + descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino): + raise OSError + payload = os.read(descriptor, 513) + finally: + os.close(descriptor) + lines = payload.decode("ascii").splitlines() + if len(lines) != 3 or lines[:2] != [revision, tree]: + raise ValueError + if not re.fullmatch(r"[0-9a-f]{64}", lines[2]): + raise ValueError +except (OSError, UnicodeError, ValueError): + raise SystemExit(1) +print(lines[2], end="") +PY +} + +cx_deepep_hybrid_cache_is_valid() { + local root="$1" marker="$2" revision="$3" tree="$4" expected actual status extra + expected="$(cx_deepep_hybrid_marker_content_sha256 \ + "$root" "$marker" "$revision" "$tree")" || return 1 + [ "$(cx_git_in_tree "$root" rev-parse HEAD 2>/dev/null)" = "$revision" ] \ + && [ 
"$(cx_git_in_tree "$root" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \ + || return 1 + status="$(cx_git_in_tree "$root" status --porcelain --untracked-files=no \ + --ignore-submodules=none 2>/dev/null)" || return 1 + [ -z "$status" ] || return 1 + extra="$(cx_git_in_tree "$root" ls-files --others --exclude-standard -- \ + 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1 + [ -z "$extra" ] || return 1 + extra="$(cx_git_in_tree "$root" ls-files --others --ignored --exclude-standard -- \ + 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1 + [ -z "$extra" ] || return 1 + actual="$(cx_extension_pair_sha256 "$root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" \ + || return 1 + [ "$actual" = "$expected" ] +} + +cx_build_deepep_hybrid() { + local arch revision="$CX_DEEPEP_HYBRID_COMMIT" tree="$CX_DEEPEP_HYBRID_TREE" + local build_root marker marker_tmp lock_path content_sha256 cache_ready + export DEEPEP_COMMIT="$revision" DEEPEP_TREE="$tree" + arch="$(cx_cuda_arch)" || return 1 + build_root="$PWD/.cx_backend/deepep-hybrid-${arch/./}" + marker="$build_root/.collectivex-complete" + lock_path="${build_root}.lock" + cx_log "DeepEP hybrid-ep: building $revision for CUDA target $arch" + unset NVSHMEM_DIR HYBRID_EP_MULTINODE USE_NIXL + cx_prepare_cuda_cccl || return 1 + command -v flock >/dev/null || { cx_log "ERROR: flock is required for hybrid-ep"; return 1; } + mkdir -p "$PWD/.cx_backend" || return 1 + if ! ( + [ ! 
-L "$lock_path" ] || exit 1 + (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" || exit 1 + exec 9<>"$lock_path" || exit 1 + flock 9 || exit 1 + cache_ready=0 + if [ -e "$marker" ] || [ -L "$marker" ]; then + cx_deepep_hybrid_cache_is_valid "$build_root" "$marker" "$revision" "$tree" \ + || exit 1 + cache_ready=1 + fi + if [ "$cache_ready" != 1 ]; then + cx_materialize_backend_source deepep-hybrid "$build_root" \ + || { cx_log "ERROR: hybrid-ep staged source is invalid"; exit 1; } + (cd "$build_root" && \ + SOURCE_DATE_EPOCH="$(cx_git_in_tree "$build_root" show -s --format=%ct HEAD)" \ + TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \ + python3 setup.py build_ext --inplace) >&2 2>&1 \ + || { cx_log "ERROR: hybrid-ep build failed"; exit 1; } + content_sha256="$(cx_extension_pair_sha256 \ + "$build_root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" || exit 1 + marker_tmp="$(mktemp "$build_root/.collectivex-complete.tmp.XXXXXX")" || exit 1 + chmod 600 "$marker_tmp" || exit 1 + printf '%s\n%s\n%s\n' "$revision" "$tree" "$content_sha256" > "$marker_tmp" \ + || exit 1 + mv -f -- "$marker_tmp" "$marker" || exit 1 + fi + cx_deepep_hybrid_cache_is_valid "$build_root" "$marker" "$revision" "$tree" + ); then + cx_log "ERROR: shared hybrid-ep build is incomplete" + return 1 + fi + export PYTHONPATH="$build_root:${PYTHONPATH:-}" + python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \ + || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; } + cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT)" +} + +# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13 +# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668 +# images need PIP_BREAK_SYSTEM_PACKAGES. Best-effort; failure to import fails loudly. 
+cx_build_uccl() {
+  # Install the hash-pinned UCCL wheel plus a CUDA 12 runtime shim, vendor the matching
+  # DeepEP-API wrapper as `uccl_deepep`, and probe both imports. Exports LD_LIBRARY_PATH,
+  # PYTHONPATH, UCCL_COMMIT, UCCL_WRAPPER_COMMIT and CX_UCCL_WRAPPER on success.
+  # Returns 0 when the backend is importable, 1 (after logging an ERROR) otherwise.
+  local version="0.1.1" tag="v0.1.1"
+  local wheel_sha256="390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec"
+  # Single source of truth for the wrapper pin: checked after clone and exported below.
+  local wrapper_commit="73ee4f12ba71717d6de34ba06806e1baaabe3f42"
+  if [ -f /tmp/.cx_built_uccl ]; then
+    cx_log "UCCL EP already prepared this allocation — skip rebuild"
+    # The marker only says a build happened; the wrapper must still import from THIS shell.
+    # Log the failure so it is as loud as every other error path in this function.
+    python3 -c "import torch; from uccl_deepep import Buffer" 2>/dev/null \
+      || { cx_log "ERROR: cached UCCL EP environment failed the uccl_deepep import probe"; return 1; }
+    return 0
+  fi
+  cx_log "UCCL EP: installing uccl==$version + cu12 runtime shim"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q --no-deps "sortedcontainers==2.4.0" "intervaltree==3.1.0" >&2 2>&1 \
+    || { cx_log "ERROR: UCCL support dependency installation failed"; return 1; }
+  # --require-hashes with a requirement fed on stdin pins the exact wheel artifact.
+  printf 'uccl==%s --hash=sha256:%s\n' "$version" "$wheel_sha256" \
+    | pip install -q --no-deps --only-binary=:all: --require-hashes -r /dev/stdin >&2 2>&1 \
+    || { cx_log "ERROR: pip install uccl==$version failed"; return 1; }
+  pip install -q --no-deps "nvidia-cuda-runtime-cu12==12.9.79" >&2 2>&1 \
+    || { cx_log "ERROR: CUDA 12 runtime shim install failed"; return 1; }
+  local cu12lib
+  cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+  # Best-effort: if the shim path cannot be resolved, the uccl.ep import probe below decides.
+  [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"
+  local installed
+  installed="$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))')" \
+    || { cx_log "ERROR: cannot read installed UCCL version"; return 1; }
+  [ "$installed" = "$version" ] \
+    || { cx_log "ERROR: expected UCCL $version, installed $installed"; return 1; }
+  UCCL_COMMIT="pkg-$installed"
+  export UCCL_COMMIT
+  # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+  # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+  python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+    || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+  # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+  # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+  # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+  # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+  rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg
+  # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+  # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+  # mismatches signatures. Match them.
+  if git clone --depth 1 --branch "$tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+    && [ "$(git -C /tmp/uccl_src rev-parse HEAD)" = "$wrapper_commit" ] \
+    && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then
+    mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+    cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null
+    export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+    python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2 \
+      || { cx_log "ERROR: uccl_deepep wrapper import failed"; return 1; }
+    export CX_UCCL_WRAPPER=1
+    export UCCL_WRAPPER_COMMIT="$wrapper_commit"
+  else
+    cx_log "ERROR: uccl deep_ep_wrapper not available"
+    return 1
+  fi
+  # Marker is written only after every probe passed, so the fast path above can trust it.
+  : > /tmp/.cx_built_uccl
+  cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+# Rack build and rank steps may enter different container instances. Persist each node's
+# loader/import path and build identity on the shared staged mount, then require it from every rank.
+cx_persist_backend_env() {
+  # Snapshot the backend-related environment of this node into
+  # $PWD/.cx_backend/env/node-<SLURM_NODEID>.sh as `export NAME=<shell-quoted value>` lines.
+  # Only variables that are currently declared are written. The file is staged under a
+  # mktemp name (mode 600) and renamed into place. Returns 1 on any failure.
+  local env_dir="$PWD/.cx_backend/env" node="${SLURM_NODEID:-0}" staged published var
+  local -a persisted=(
+    PATH VIRTUAL_ENV LD_LIBRARY_PATH PYTHONPATH CUDA_HOME CPATH NVCC_PREPEND_FLAGS
+    NVSHMEM_DIR DEEPEP_COMMIT DEEPEP_TREE
+    EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR EP_JIT_CACHE_DIR EP_REUSE_NCCL_COMM
+    EP_JIT_DUMP_SASS
+    DEEPEP_V2_PR DEEPEP_V2_FIX_PR DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
+    DEEPEP_V2_JIT_RANDOM_SEED
+    UCCL_COMMIT UCCL_WRAPPER_COMMIT CX_UCCL_WRAPPER
+  )
+  # The node id becomes part of a file name, so accept plain decimal digits only.
+  if ! [[ "$node" =~ ^[0-9]+$ ]]; then
+    return 1
+  fi
+  mkdir -p "$env_dir" && chmod 700 "$env_dir" || return 1
+  staged="$(mktemp "$env_dir/.node-${node}.XXXXXX")" || return 1
+  if ! chmod 600 "$staged"; then
+    rm -f "$staged"
+    return 1
+  fi
+  for var in "${persisted[@]}"; do
+    # Skip names that are not declared in this shell; nothing is written for them.
+    declare -p "$var" >/dev/null 2>&1 || continue
+    if ! printf 'export %s=%q\n' "$var" "${!var}" >> "$staged"; then
+      rm -f "$staged"
+      return 1
+    fi
+  done
+  published="$env_dir/node-${node}.sh"
+  if ! mv -f -- "$staged" "$published"; then
+    rm -f "$staged"
+    return 1
+  fi
+}
+
+# Prepare and probe one backend without running a benchmark. The same hook is used
+# by normal in-container runs and by rack launchers' persistent build-only step.
+cx_prepare_backend() {
+  # $1 = backend name. Runs that backend's probe/build hook; nccl-ep needs no preparation.
+  # Returns 0 when the backend is ready, 1 on hook failure or an unknown backend name.
+  local backend="${1:-}"
+  case "$backend" in
+    deepep)
+      cx_probe_deepep || return 1
+      ;;
+    deepep-v2)
+      cx_build_deepep_v2 || return 1
+      ;;
+    deepep-hybrid)
+      cx_build_deepep_hybrid || return 1
+      ;;
+    uccl)
+      cx_build_uccl || return 1
+      ;;
+    mori)
+      python3 -c "import mori" 2>/dev/null || return 1
+      ;;
+    nccl-ep)
+      ;;
+    *)
+      cx_log "ERROR: unknown backend preparation request"
+      return 1
+      ;;
+  esac
+}
+
+# Prepare $1 and, when preparation fails, emit one failed-case record per requested phase
+# (CX_PHASE, default decode; "both" expands to decode and prefill) with
+# CX_FAILURE_MODE=backend-setup. Returns 0 only when the backend is ready.
+prepare_backend_or_record() {
+  local backend="$1" phases="${CX_PHASE:-decode}" phase
+  cx_write_runtime_stage backend-setup || return 1
+  if cx_prepare_backend "$backend"; then
+    return 0
+  fi
+  cx_log "WARN: $backend preparation failed"
+  [ "$phases" = "both" ] && phases="decode prefill"
+  # Intentionally unquoted: $phases is a space-separated list of phase names.
+  for phase in $phases; do
+    CX_FAILURE_MODE=backend-setup emit_failed_case "$backend" "$phase" 6
+  done
+  return 1
+}
+
+# dispatch_bench runs the CURRENT CX_BENCH (+ CX_* config env) once. The sweep workflow runs many
+# of these per allocation (SHARD mode below), reusing this single container + its built backend.
+dispatch_bench() {
+  case "$CX_BENCH" in
+    nccl-ep)
+      run_ep_suite "$CX_BENCH"
+      ;;
+    deepep|deepep-v2|deepep-hybrid|mori|uccl)
+      prepare_backend_or_record "$CX_BENCH" && run_ep_suite "$CX_BENCH"
+      ;;
+    *)
+      cx_die "unknown CX_BENCH=$CX_BENCH (want deepep|deepep-v2|mori|uccl|nccl-ep|deepep-hybrid)"
+      ;;
+  esac
+}
+
+rc=0
+cx_validate_shard_control "$PWD"
+# Build-only mode: rack launchers run the shared backend preparation hook once per
+# node inside a persistent named container, then direct rank processes reuse it.
+if [ -n "${CX_BUILD_ONLY:-}" ]; then
+  if cx_prepare_backend "${CX_BENCH:-}"; then
+    cx_persist_backend_env || rc=1
+  else
+    rc=1
+  fi
+  cx_log "backend preparation: bench=${CX_BENCH:-unknown} rc=$rc"
+  exit "$rc"
+fi
+if [ -n "${CX_SHARD_FILE:-}" ]; then
+  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
+  # All cases share (sku, backend, nodes), so backend preparation is paid once and cached.
+  # The shard path is passed as argv (never spliced into Python source) and the count is
+  # validated, so an unreadable or malformed shard fails the job instead of silently running
+  # zero cases.
+  if ! ncases="$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))["cases"]))' "$CX_SHARD_FILE")" \
+    || ! [[ "$ncases" =~ ^[0-9]+$ ]]; then
+    cx_log "ERROR: cannot read the case list from shard file $CX_SHARD_FILE"
+    ncases=0
+    rc=1
+  fi
+  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
+  _cx_ts_base="$CX_TS" # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
+  # cases sharing backend+phase overwrite each other at the same timestamp).
+  ci=0
+  failed_cases=0
+  while [ "$ci" -lt "$ncases" ]; do
+    CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
+    export CX_TS
+    # Map varying case fields plus the frozen v1 defaults into CX_* env.
+    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
+import json, sys, shlex
+c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
+def g(k, d=""):
+    v = c.get(k, d)
+    return "" if v is None else str(v)
+env = {
+    "CX_BENCH": g("backend"),
+    "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
+    "CX_EP": g("ep", "1"),
+    "CX_EPLB": "1" if c.get("eplb") else "",
+    "CX_CASE_ID": g("case_id"), "CX_SUITE": g("suite"), "CX_WORKLOAD_NAME": g("workload"),
+    "CX_REQUIRED_PUBLICATION": g("required_publication"),
+    "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
+    "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
+}
+lines = [f"export {k}={shlex.quote(v)}" for k, v in env.items()]
+# Per-case timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32 everywhere);
+# cases without one must fall back to the harness defaults, so UNSET rather than export-empty
+# (an empty CX_ITERS would defeat the 8-iter default and break the run_ep argparse; NOTE no
+# apostrophes in this heredoc — bash command-substitution scanning chokes on unbalanced quotes).
+# A timing string with a blank field unsets just that field, so a value exported by an
+# earlier case in the same shell cannot leak into this one.
+timing = g("timing")
+if timing:
+    parts = (timing.split(":") + ["", "", ""])[:3]
+    for k, v in zip(("CX_ITERS", "CX_TRIALS", "CX_WARMUP"), parts):
+        if v:
+            lines.append(f"export {k}={shlex.quote(v)}")
+        else:
+            lines.append(f"unset {k} 2>/dev/null || true")
+else:
+    lines.append("unset CX_ITERS CX_TRIALS CX_WARMUP 2>/dev/null || true")
+print("\n".join(lines))
+PY
+)" || {
+      # Without fresh exports the CX_* env still describes the PREVIOUS case; running
+      # dispatch_bench now would repeat that case under this case's timestamp. Count and skip.
+      failed_cases=$((failed_cases+1))
+      cx_log " [$((ci+1))/$ncases] ERROR: cannot map shard case into CX_* env; case skipped"
+      ci=$((ci + 1))
+      continue
+    }
+    eval "$_exports"
+    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
+    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
+    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
+    # (FileNotFoundError .cx_workloads/.manifest.json). Unset so every case re-stages its own.
+    unset CX_WORKLOAD_DIR 2>/dev/null || true
+    cx_log " [$((ci+1))/$ncases] $CX_BENCH $CX_PHASE routing=$CX_ROUTING eplb=${CX_EPLB:-0}"
+    _cx_case_ts="$CX_TS"
+    CX_TS="${_cx_case_ts}-a01"
+    export CX_ATTEMPT_ID=1 CX_TS
+    dispatch_bench || {
+      failed_cases=$((failed_cases+1))
+      cx_log " [$((ci+1))/$ncases] $CX_BENCH case FAILED; failed-case record preserved"
+    }
+    export CX_TS="$_cx_case_ts"
+    ci=$((ci + 1))
+  done
+  if [ "${failed_cases:-0}" -gt 0 ]; then
+    cx_log "SHARD done: $failed_cases/$ncases case(s) failed"
+    rc=1
+  fi
+  # The base timestamp matches every per-case file, so the final summary covers the whole shard.
+  export CX_TS="$_cx_ts_base"
+else
+  _cx_single_ts="$CX_TS"
+  CX_TS="${_cx_single_ts}-a01"
+  export CX_ATTEMPT_ID=1 CX_TS
+  dispatch_bench || rc=1
+fi
+
+# Summary table for the log; also fails the job if no valid results were produced.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1 +exit "$rc" diff --git a/experimental/CollectiveX/schemas/channel-v1.schema.json b/experimental/CollectiveX/schemas/channel-v1.schema.json new file mode 100644 index 0000000000..663e22914b --- /dev/null +++ b/experimental/CollectiveX/schemas/channel-v1.schema.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/channel-v1.schema.json", + "title": "CollectiveX public channel v1", + "type": "object", + "additionalProperties": false, + "required": ["format","channel","dataset","generated_at"], + "properties": { + "format": {"const": "collectivex.channel.v1"}, + "channel": {"enum": ["latest-attempt","dev-latest"]}, + "dataset": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes"], + "properties": { + "path": {"type": "string","pattern": "^datasets/[0-9a-f]{64}/dataset\\.json$"}, + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "bytes": {"type": "integer","minimum": 1,"maximum": 33554432} + } + }, + "generated_at": {"type": "string","format": "date-time"} + } +} diff --git a/experimental/CollectiveX/schemas/private-bundle-v1.schema.json b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json new file mode 100644 index 0000000000..166c808930 --- /dev/null +++ b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json @@ -0,0 +1,162 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/private-bundle-v1.schema.json", + "title": "CollectiveX private attempt bundle v1", + "type": "object", + "additionalProperties": false, + "required": [ + "format", + "schema_version", + "created_at", + "ingest_id", + "run", + "matrix", + "sources", + "attempts", + "coverage", + "runtime_fingerprints", + "checksums", + "validation" + ], + "properties": { + "format": {"const": 
"collectivex.private.bundle.v1"}, + "schema_version": {"const": 1}, + "created_at": {"type": "string","format": "date-time"}, + "ingest_id": {"$ref": "#/$defs/sha256"}, + "run": { + "type": "object", + "additionalProperties": false, + "required": ["repository","run_id","run_attempt","source_sha"], + "properties": { + "repository": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_attempt": {"type": "integer","minimum": 1}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"} + } + }, + "matrix": {"$ref": "#/$defs/file"}, + "sources": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/source"}}, + "attempts": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "attempt_id", + "allocation_id", + "case_id", + "outcome", + "reason", + "selected", + "document", + "samples", + "runtime_fingerprint_sha256", + "series_ids", + "evidence_ids" + ], + "properties": { + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "case_id": {"$ref": "#/$defs/caseId"}, + "outcome": {"$ref": "#/$defs/outcome"}, + "reason": {"$ref": "#/$defs/reason"}, + "selected": {"type": "boolean"}, + "document": {"$ref": "#/$defs/file"}, + "samples": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/file"}]}, + "runtime_fingerprint_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "series_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}}, + "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}} + } + } + }, + "coverage": { + "type": "object", + "additionalProperties": false, + "required": ["expected_cases","terminal_cases","complete","outcome_counts","selections"], + "properties": { + "expected_cases": {"type": "integer","minimum": 1}, + "terminal_cases": {"type": "integer","minimum": 
0}, + "complete": {"type": "boolean"}, + "outcome_counts": {"$ref": "#/$defs/outcomeCounts"}, + "selections": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["case_id","selected_attempt_id","outcome"], + "properties": { + "case_id": {"$ref": "#/$defs/caseId"}, + "selected_attempt_id": {"$ref": "#/$defs/attemptId"}, + "outcome": {"$ref": "#/$defs/outcome"} + } + } + } + } + }, + "runtime_fingerprints": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}}, + "checksums": {"$ref": "#/$defs/file"}, + "validation": { + "type": "object", + "additionalProperties": false, + "required": ["policy","passed","checks"], + "properties": { + "policy": {"const": "collectivex-publisher-v1"}, + "passed": {"const": true}, + "checks": { + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$"} + } + } + } + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]}, + "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]}, + "outcomeCounts": { + "type": "object", + "additionalProperties": false, + "required": ["success","unsupported","failed","invalid","diagnostic"], + "properties": { + "success": {"type": "integer","minimum": 0}, + "unsupported": {"type": "integer","minimum": 0}, + "failed": {"type": "integer","minimum": 0}, + "invalid": {"type": "integer","minimum": 0}, + "diagnostic": {"type": 
"integer","minimum": 0} + } + }, + "file": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes"], + "properties": { + "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"}, + "sha256": {"$ref": "#/$defs/sha256"}, + "bytes": {"type": "integer","minimum": 1} + } + }, + "source": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes","artifact_name"], + "properties": { + "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"}, + "sha256": {"$ref": "#/$defs/sha256"}, + "bytes": {"type": "integer","minimum": 1}, + "artifact_name": { + "type": "string", + "pattern": "^cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*$" + } + } + } + } +} diff --git a/experimental/CollectiveX/schemas/public-dataset-v1.schema.json b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json new file mode 100644 index 0000000000..87abf403d1 --- /dev/null +++ b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json @@ -0,0 +1,562 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/public-dataset-v1.schema.json", + "title": "CollectiveX sanitized public dataset v1", + "type": "object", + "additionalProperties": false, + "required": [ + "format", + "schema_version", + "generated_at", + "source_bundle_ids", + "promotion", + "coverage", + "attempts", + "series", + "cohorts", + "rankings", + "recommendations", + "sensitivities" + ], + "properties": { + "format": {"const": "collectivex.public.v1"}, + "schema_version": {"const": 1}, + "generated_at": {"type": "string","format": "date-time"}, + "source_bundle_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}}, + "promotion": { + "type": "object", + "additionalProperties": false, + "required": [ + "status", + "reason", + "matrix_id", + "allocation_ids", + "required_allocations", + 
"requested_cases", + "terminal_cases", + "policy" + ], + "properties": { + "status": {"enum": ["promoted","diagnostic","quarantined"]}, + "reason": {"$ref": "#/$defs/reason"}, + "matrix_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "required_allocations": {"const": 3}, + "requested_cases": {"type": "integer","minimum": 0}, + "terminal_cases": {"type": "integer","minimum": 0}, + "policy": {"const": "collectivex-decision-grade-v1"} + } + }, + "coverage": {"type": "array","items": {"$ref": "#/$defs/coverage"}}, + "attempts": {"type": "array","items": {"$ref": "#/$defs/attempt"}}, + "series": {"type": "array","items": {"$ref": "#/$defs/series"}}, + "cohorts": {"type": "array","items": {"$ref": "#/$defs/cohort"}}, + "rankings": {"type": "array","items": {"$ref": "#/$defs/ranking"}}, + "recommendations": {"type": "array","items": {"$ref": "#/$defs/recommendation"}}, + "sensitivities": {"type": "array","items": {"$ref": "#/$defs/sensitivity"}} + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "workloadId": {"type": "string","pattern": "^cxwork-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128}, + "publicationTier": {"enum": ["official","comparable-experimental"]}, + "label": {"type": "string","minLength": 1,"maxLength": 160}, + "nullableLabel": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/label"}]}, + "reason": {"oneOf": 
[{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]}, + "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]}, + "coverage": { + "type": "object", + "additionalProperties": false, + "required": [ + "case_id", + "label", + "required", + "sku", + "backend", + "phase", + "disposition", + "selected_attempt_id", + "outcome", + "failure_mode", + "reason", + "attempt_ids" + ], + "properties": { + "case_id": {"$ref": "#/$defs/caseId"}, + "label": {"$ref": "#/$defs/label"}, + "required": {"type": "boolean"}, + "sku": {"$ref": "#/$defs/safeId"}, + "backend": {"$ref": "#/$defs/safeId"}, + "phase": {"enum": ["decode","prefill"]}, + "disposition": {"enum": ["runnable","unsupported"]}, + "selected_attempt_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/attemptId"}]}, + "outcome": {"$ref": "#/$defs/outcome"}, + "failure_mode": {"$ref": "#/$defs/reason"}, + "reason": {"$ref": "#/$defs/reason"}, + "attempt_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/attemptId"}} + } + }, + "attempt": { + "type": "object", + "additionalProperties": false, + "required": [ + "attempt_id", + "evidence", + "case_id", + "allocation_id", + "run_id", + "run_attempt", + "attempt_index", + "selected", + "outcome", + "failure_mode", + "reason", + "series_id", + "completed_at" + ], + "properties": { + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "evidence": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["evidence_id","point_id"], + "properties": {"evidence_id": {"$ref": "#/$defs/evidenceId"},"point_id": {"$ref": "#/$defs/pointId"}} + } + }, + "case_id": {"$ref": "#/$defs/caseId"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_attempt": {"type": "integer","minimum": 1}, + "attempt_index": {"type": "integer","minimum": 1}, + "selected": {"type": "boolean"}, + "outcome": 
{"$ref": "#/$defs/outcome"}, + "failure_mode": {"$ref": "#/$defs/reason"}, + "reason": {"$ref": "#/$defs/reason"}, + "series_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/seriesId"}]}, + "completed_at": {"oneOf": [{"type": "null"},{"type": "string","format": "date-time"}]} + } + }, + "eligibility": { + "type": "object", + "additionalProperties": false, + "allOf": [{ + "if": {"properties": {"decision_grade": {"const": true}},"required": ["decision_grade"]}, + "then": {"properties": {"reasons": {"maxItems": 0}}}, + "else": {"properties": {"reasons": {"minItems": 1}}} + }], + "required": [ + "decision_grade", + "allocation_ids", + "complete", + "correct", + "measured_roundtrip_p99", + "stable_p50", + "stable_p99", + "stable_ordering", + "p50_max_min_ratio", + "p99_max_min_ratio", + "reasons" + ], + "properties": { + "decision_grade": {"type": "boolean"}, + "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "complete": {"type": "boolean"}, + "correct": {"type": "boolean"}, + "measured_roundtrip_p99": {"type": "boolean"}, + "stable_p50": {"type": "boolean"}, + "stable_p99": {"type": "boolean"}, + "stable_ordering": {"type": "boolean"}, + "p50_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]}, + "p99_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]}, + "reasons": { + "type": "array", + "uniqueItems": true, + "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96} + } + } + }, + "percentiles": { + "type": "object", + "additionalProperties": false, + "required": ["p50","p90","p95","p99"], + "properties": { + "p50": {"type": "number","exclusiveMinimum": 0}, + "p90": {"type": "number","exclusiveMinimum": 0}, + "p95": {"type": "number","exclusiveMinimum": 0}, + "p99": {"type": "number","exclusiveMinimum": 0} + } + }, + "component": { + "type": "object", + "additionalProperties": false, + "required": 
["origin","latency_us","logical_bytes","logical_payload_rate_gbps_at_latency_percentile","sample_count"], + "properties": { + "origin": {"enum": ["measured","derived"]}, + "latency_us": {"$ref": "#/$defs/percentiles"}, + "logical_bytes": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}, + "logical_payload_rate_gbps_at_latency_percentile": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/percentiles"}]}, + "sample_count": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]} + } + }, + "nullableComponent": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/component"}]}, + "point": { + "type": "object", + "additionalProperties": false, + "required": [ + "point_id", + "tokens_per_rank", + "global_tokens", + "correct", + "routing", + "components", + "roundtrip_token_rate_at_latency_percentile", + "evidence_ids" + ], + "properties": { + "point_id": {"$ref": "#/$defs/pointId"}, + "tokens_per_rank": {"type": "integer","minimum": 1}, + "global_tokens": {"type": "integer","minimum": 1}, + "correct": {"type": "boolean"}, + "routing": { + "type": "object", + "additionalProperties": false, + "required": [ + "fanout_mean", + "recv_tokens_max", + "expert_load_cv", + "payload_rank_cv", + "hotspot_ratio", + "empty_expert_count", + "empty_rank_count", + "routed_copies" + ], + "properties": { + "fanout_mean": {"type": "number","minimum": 0}, + "recv_tokens_max": {"type": "integer","minimum": 0}, + "expert_load_cv": {"type": "number","minimum": 0}, + "payload_rank_cv": {"type": "number","minimum": 0}, + "hotspot_ratio": {"type": "number","minimum": 0}, + "empty_expert_count": {"type": "integer","minimum": 0}, + "empty_rank_count": {"type": "integer","minimum": 0}, + "routed_copies": {"type": "integer","minimum": 1} + } + }, + "components": { + "type": "object", + "additionalProperties": false, + "required": ["dispatch","combine","roundtrip","isolated_sum"], + "properties": { + "dispatch": {"$ref": "#/$defs/nullableComponent"}, + "combine": {"$ref": 
"#/$defs/nullableComponent"}, + "roundtrip": {"$ref": "#/$defs/nullableComponent"}, + "isolated_sum": {"$ref": "#/$defs/nullableComponent"} + } + }, + "roundtrip_token_rate_at_latency_percentile": {"$ref": "#/$defs/percentiles"}, + "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}} + } + }, + "series": { + "type": "object", + "additionalProperties": false, + "required": [ + "series_id", + "label", + "status", + "case_ids", + "allocation_ids", + "model", + "suite", + "phase", + "publication_tier", + "backend", + "build", + "system", + "workload", + "eplb", + "resource", + "measurement", + "points", + "eligibility" + ], + "properties": { + "series_id": {"$ref": "#/$defs/seriesId"}, + "label": {"$ref": "#/$defs/label"}, + "status": {"enum": ["decision-grade","diagnostic"]}, + "case_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/caseId"}}, + "allocation_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "model": {"$ref": "#/$defs/safeId"}, + "suite": {"$ref": "#/$defs/safeId"}, + "phase": {"enum": ["decode","prefill"]}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "backend": { + "type": "object", + "additionalProperties": false, + "required": ["id","label","role","generation","version"], + "properties": { + "id": {"$ref": "#/$defs/safeId"}, + "label": {"$ref": "#/$defs/label"}, + "role": {"enum": ["library","reference"]}, + "generation": {"$ref": "#/$defs/nullableLabel"}, + "version": {"$ref": "#/$defs/nullableLabel"} + } + }, + "build": { + "type": "object", + "additionalProperties": false, + "required": ["implementation_contract_sha256","public_config_sha256","routing_control_sha256","runtime_fingerprint_sha256","image_digest","source_sha","squash_sha256"], + "properties": { + "implementation_contract_sha256": {"$ref": "#/$defs/sha256"}, + "public_config_sha256": {"$ref": "#/$defs/sha256"}, + "routing_control_sha256": 
{"$ref": "#/$defs/sha256"}, + "runtime_fingerprint_sha256": {"$ref": "#/$defs/sha256"}, + "image_digest": {"type": "string","pattern": "^sha256:[0-9a-f]{64}$"}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40,64}$"}, + "squash_sha256": {"$ref": "#/$defs/sha256"} + } + }, + "system": { + "type": "object", + "additionalProperties": false, + "required": ["sku","label","vendor","topology_class","transport","world_size","ep_size","placement"], + "properties": { + "sku": {"$ref": "#/$defs/safeId"}, + "label": {"$ref": "#/$defs/label"}, + "vendor": {"enum": ["nvidia","amd"]}, + "topology_class": {"$ref": "#/$defs/safeId"}, + "transport": {"$ref": "#/$defs/safeId"}, + "world_size": {"type": "integer","minimum": 1}, + "ep_size": {"type": "integer","minimum": 1}, + "placement": {"enum": ["packed"]} + } + }, + "workload": { + "type": "object", + "additionalProperties": false, + "required": [ + "workload_id", + "hidden", + "top_k", + "experts", + "routing", + "eplb", + "dispatch_dtype", + "combine_dtype", + "activation_profile" + ], + "properties": { + "workload_id": {"$ref": "#/$defs/workloadId"}, + "hidden": {"type": "integer","minimum": 1}, + "top_k": {"type": "integer","minimum": 1}, + "experts": {"type": "integer","minimum": 1}, + "routing": {"enum": ["uniform","zipf"]}, + "eplb": {"type": "boolean"}, + "dispatch_dtype": {"const": "bf16"}, + "combine_dtype": {"const": "bf16"}, + "activation_profile": {"const": "canonical-counter-source-v3"} + } + }, + "eplb": { + "type": "object", + "additionalProperties": false, + "required": [ + "enabled", + "planner", + "mapping_sha256", + "logical_experts", + "physical_experts", + "redundant_experts", + "reference_tokens_per_rank", + "replicated_experts", + "max_replicas", + "imbalance_before", + "imbalance_after" + ], + "properties": { + "enabled": {"type": "boolean"}, + "planner": {"$ref": "#/$defs/nullableLabel"}, + "mapping_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "logical_experts": {"type": 
"integer","minimum": 1}, + "physical_experts": {"type": "integer","minimum": 1}, + "redundant_experts": {"type": "integer","minimum": 0}, + "reference_tokens_per_rank": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}, + "replicated_experts": {"type": "integer","minimum": 0}, + "max_replicas": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 0}]}, + "imbalance_before": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]}, + "imbalance_after": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]} + } + }, + "resource": { + "type": "object", + "additionalProperties": false, + "required": ["mode","profile","comm_units_kind","configured_units"], + "properties": { + "mode": {"const": "tuned"}, + "profile": {"$ref": "#/$defs/safeId"}, + "comm_units_kind": {"$ref": "#/$defs/nullableLabel"}, + "configured_units": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]} + } + }, + "measurement": { + "type": "object", + "additionalProperties": false, + "required": [ + "contract", + "sampling_contract", + "iters", + "trials", + "warmups", + "samples_per_component", + "headline_component", + "headline_percentile" + ], + "properties": { + "contract": {"const": "layout-and-dispatch-v1"}, + "sampling_contract": {"const": "fixed-512-v1"}, + "iters": {"const": 8}, + "trials": {"const": 64}, + "warmups": {"const": 32}, + "samples_per_component": {"const": 512}, + "headline_component": {"const": "roundtrip"}, + "headline_percentile": {"const": "p99"} + } + }, + "points": {"type": "array","minItems": 1,"items": {"$ref": "#/$defs/point"}}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "cohort": { + "type": "object", + "additionalProperties": false, + "required": [ + "cohort_id", + "kind", + "label", + "description", + "series_ids", + "controlled_factors", + "varying_factors", + "publication_tier", + "eligibility" + ], + "properties": { + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "kind": 
{"enum": ["library","chip","system","routing"]}, + "label": {"$ref": "#/$defs/label"}, + "description": {"$ref": "#/$defs/label"}, + "series_ids": {"type": "array","minItems": 2,"uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}}, + "controlled_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}}, + "varying_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "metric": { + "type": "object", + "additionalProperties": false, + "required": ["operation","statistic","measure","objective","tokens_per_rank","phase"], + "properties": { + "operation": {"const": "roundtrip"}, + "statistic": {"enum": ["p50","p99"]}, + "measure": {"enum": ["latency_us","logical_payload_rate_gbps_at_latency_percentile"]}, + "objective": {"enum": ["min","max"]}, + "tokens_per_rank": {"type": "integer","minimum": 1}, + "phase": {"enum": ["decode","prefill"]} + } + }, + "ranking": { + "type": "object", + "additionalProperties": false, + "required": ["ranking_id","cohort_id","label","metric","entries","publication_tier","eligibility"], + "properties": { + "ranking_id": {"type": "string","pattern": "^cxranking-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "metric": {"$ref": "#/$defs/metric"}, + "entries": { + "type": "array", + "minItems": 2, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["rank","series_id","point_id","value","unit"], + "properties": { + "rank": {"type": "integer","minimum": 1}, + "series_id": {"$ref": "#/$defs/seriesId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "value": {"type": "number","exclusiveMinimum": 0}, + "unit": {"enum": ["us","GB/s"]} + } + } + }, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": 
"#/$defs/eligibility"} + } + }, + "recommendation": { + "type": "object", + "additionalProperties": false, + "required": [ + "recommendation_id", + "cohort_id", + "label", + "objective", + "series_id", + "point_id", + "value", + "unit", + "rationale", + "publication_tier", + "eligibility" + ], + "properties": { + "recommendation_id": {"type": "string","pattern": "^cxrecommendation-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "objective": {"enum": ["min-p50-latency","min-p99-latency","max-payload-rate-at-p50-latency","max-payload-rate-at-p99-latency"]}, + "series_id": {"$ref": "#/$defs/seriesId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "value": {"type": "number","exclusiveMinimum": 0}, + "unit": {"enum": ["us","GB/s"]}, + "rationale": {"$ref": "#/$defs/label"}, + "publication_tier": {"const": "official"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "sensitivity": { + "type": "object", + "additionalProperties": false, + "required": [ + "sensitivity_id", + "cohort_id", + "label", + "baseline_series_id", + "candidate_series_id", + "metric", + "signed_change_ratio", + "publication_tier", + "eligibility" + ], + "properties": { + "sensitivity_id": {"type": "string","pattern": "^cxsensitivity-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "baseline_series_id": {"$ref": "#/$defs/seriesId"}, + "candidate_series_id": {"$ref": "#/$defs/seriesId"}, + "metric": {"$ref": "#/$defs/metric"}, + "signed_change_ratio": {"type": "number"}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + } + } +} diff --git a/experimental/CollectiveX/schemas/raw-case-v1.schema.json b/experimental/CollectiveX/schemas/raw-case-v1.schema.json new file mode 100644 index 0000000000..ccf85b19ad --- /dev/null +++ 
b/experimental/CollectiveX/schemas/raw-case-v1.schema.json @@ -0,0 +1,1142 @@ +{ + "$id": "https://inferencex.com/schemas/collectivex/raw-case-v1.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$defs": { + "deepep_v2_jit_cubin": { + "additionalProperties": false, + "properties": { + "cache_key": { + "pattern":"^kernel\\.[A-Za-z0-9_+-]+\\.[0-9a-f]{32}$", + "type":"string" + }, + "cubin_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "sass_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "source_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["cache_key","cubin_sha256","sass_sha256","source_sha256"], + "type": "object" + }, + "hybrid_jit_rank_artifact": { + "additionalProperties": false, + "properties": { + "bytes": {"minimum":1,"type":"integer"}, + "rank": {"minimum":0,"type":"integer"}, + "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["bytes","rank","sha256"], + "type": "object" + }, + "hybrid_realized_config": { + "additionalProperties": false, + "properties": { + "backward_combine_api": {"type":"boolean"}, + "device_side_sync_combine_api": {"type":"boolean"}, + "device_side_sync_dispatch_api": {"type":"boolean"}, + "forward_dispatch_api": {"type":"boolean"}, + "hidden_dim": {"minimum":1,"type":"integer"}, + "max_num_of_tokens_per_rank": {"minimum":1,"type":"integer"}, + "num_of_additional_in_flight_s2g_combine_api": {"minimum":0,"type":"integer"}, + "num_of_additional_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_combine_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_permute": {"minimum":0,"type":"integer"}, + "num_of_blocks_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_unpermute": {"minimum":0,"type":"integer"}, + "num_of_experts_per_rank": {"minimum":1,"type":"integer"}, + "num_of_in_flight_s2g_dispatch_api": 
{"minimum":0,"type":"integer"}, + "num_of_in_flight_s2g_permute_block_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_nodes": {"minimum":1,"type":"integer"}, + "num_of_ranks_per_node": {"minimum":1,"type":"integer"}, + "num_of_stages_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_stages_g2s_combine_api": {"minimum":0,"type":"integer"}, + "num_of_stages_permute_block_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_stages_s2g_combine_api": {"minimum":0,"type":"integer"}, + "num_of_threads_per_block_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_combine_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_group_combine_api": {"minimum":0,"type":"integer"}, + "pad_multiple": {"minimum":0,"type":"integer"}, + "token_data_type": {"enum":["UINT8","UINT16"]} + }, + "required": [ + "backward_combine_api","device_side_sync_combine_api","device_side_sync_dispatch_api", + "forward_dispatch_api","hidden_dim","max_num_of_tokens_per_rank", + "num_of_additional_in_flight_s2g_combine_api", + "num_of_additional_in_flight_s2g_dispatch_api","num_of_blocks_combine_api", + "num_of_blocks_dispatch_api","num_of_blocks_permute","num_of_blocks_preprocessing_api", + "num_of_blocks_unpermute","num_of_experts_per_rank", + "num_of_in_flight_s2g_dispatch_api","num_of_in_flight_s2g_permute_block_dispatch_api", + "num_of_nodes","num_of_ranks_per_node","num_of_stages_dispatch_api", + "num_of_stages_g2s_combine_api","num_of_stages_permute_block_dispatch_api", + "num_of_stages_s2g_combine_api","num_of_threads_per_block_preprocessing_api", + "num_of_tokens_per_chunk_combine_api","num_of_tokens_per_chunk_dispatch_api", + "num_of_tokens_per_chunk_preprocessing_api","num_of_tokens_per_group_combine_api", + "pad_multiple","token_data_type" + ], + "type": "object" + }, + "nullable_sha256": 
{"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "oracle": { + "additionalProperties": false, + "properties": { + "checks": { + "additionalProperties": false, + "properties": { + "combine_values": {"type":"boolean"}, + "counts": {"type":"boolean"}, + "metadata": {"type":"boolean"}, + "multiplicity": {"type":"boolean"}, + "payload": {"type":"boolean"}, + "source_set": {"type":"boolean"}, + "weights": {"type":"boolean"} + }, + "required": ["combine_values","counts","metadata","multiplicity","payload","source_set","weights"], + "type": "object" + }, + "atol": {"const":0.02}, + "combine_weight_semantics": {"const":"unweighted-rank-sum"}, + "contract": {"const":"expert-specific-transform-v1"}, + "dispatch_sha256": {"$ref":"#/$defs/nullable_sha256"}, + "max_absolute_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_elementwise_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_weight_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "order_sha256": {"$ref":"#/$defs/nullable_sha256"}, + "ordering_contract": {"minLength":1,"type":"string"}, + "passed": {"type":"boolean"}, + "receive_count": {"minimum":0,"type":"integer"}, + "rtol": {"const":0.05} + }, + "required": [ + "atol", + "checks", + "combine_weight_semantics", + "contract", + "dispatch_sha256", + "max_absolute_error", + "max_elementwise_relative_error", + "max_relative_error", + "max_weight_error", + "order_sha256", + "ordering_contract", + "passed", + "receive_count", + "rtol" + ], + "type": "object" + }, + "percentiles": { + "additionalProperties": false, + "properties": { + "p50": {"minimum":0,"type":"number"}, + "p90": {"minimum":0,"type":"number"}, + "p95": {"minimum":0,"type":"number"}, + "p99": {"minimum":0,"type":"number"} + }, + "required": ["p50","p90","p95","p99"], + "type": "object" + }, + "component": { + 
"additionalProperties": false, + "properties": { + "availability": {"enum":["measured","derived","unavailable"]}, + "origin": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "percentiles_us": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]}, + "sample_count": {"minimum":0,"type":"integer"} + }, + "required": ["availability","origin","percentiles_us","sample_count"], + "type": "object" + }, + "histogram": { + "additionalProperties": false, + "properties": { + "bins": {"minimum":1,"type":"integer"}, + "counts": {"items":{"minimum":0,"type":"integer"},"minItems":1,"type":"array"}, + "max": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"number"}, + "n": {"minimum":1,"type":"integer"} + }, + "required": ["n","min","max","bins","counts"], + "type": "object" + }, + "scheduled_case": { + "additionalProperties": false, + "properties": { + "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "canonical": {"const":true}, + "ep": {"minimum":1,"type":"integer"}, + "eplb": {"type":"boolean"}, + "experts": {"minimum":1,"type":"integer"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "hidden": {"minimum":1,"type":"integer"}, + "ladder": {"pattern":"^[1-9][0-9]*( [1-9][0-9]*)*$","type":"string"}, + "nodes": {"minimum":1,"type":"integer"}, + "phase": {"enum":["decode","prefill"]}, + "required_publication": {"enum":["official","comparable-experimental"]}, + "routing": {"enum":["uniform","zipf"]}, + "samples_per_point": {"const":512}, + "scale_up_domain": {"minimum":1,"type":"integer"}, + "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "timing": {"const":"8:64:32"}, + "topk": {"minimum":1,"type":"integer"}, + "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"}, + "workload": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": [ + "backend", + "canonical", + "eplb", + "ep", + "experts", + "gpus_per_node", + 
"hidden", + "ladder", + "nodes", + "phase", + "required_publication", + "routing", + "samples_per_point", + "scale_up_domain", + "suite", + "timing", + "topk", + "warmup_semantics", + "workload" + ], + "type": "object" + }, + "git_run": { + "additionalProperties": false, + "properties": { + "artifact": {"minLength":1,"type":"string"}, + "job": {"minLength":1,"type":"string"}, + "ref": {"minLength":1,"type":"string"}, + "repo": {"pattern":"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$","type":"string"}, + "run_attempt": {"pattern":"^[1-9][0-9]*$","type":"string"}, + "run_id": {"pattern":"^[1-9][0-9]*$","type":"string"}, + "source_sha": {"pattern":"^[0-9a-f]{40}$","type":"string"} + }, + "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"], + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "case": { + "additionalProperties": false, + "properties": { + "attempt_ordinal": {"minimum":1,"type":"integer"}, + "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "ep_size": {"minimum":1,"type":"integer"}, + "eplb": { + "additionalProperties": false, + "properties": { + "enabled": {"type":"boolean"}, + "imbalance_after": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "imbalance_before": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "mapping_hash": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "max_replicas": {"oneOf":[{"type":"null"},{"minimum":0,"type":"integer"}]}, + "num_logical_experts": {"minimum":1,"type":"integer"}, + "num_physical_experts": {"minimum":1,"type":"integer"}, + "num_redundant": {"minimum":0,"type":"integer"}, + "planner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "reference_tokens_per_rank": {"oneOf":[{"type":"null"},{"minimum":1,"type":"integer"}]}, + "replicated_experts": {"minimum":0,"type":"integer"} + }, + "required": [ + "enabled", + "imbalance_after", + "imbalance_before", + "mapping_hash", + 
"max_replicas", + "num_logical_experts", + "num_physical_experts", + "num_redundant", + "planner", + "reference_tokens_per_rank", + "replicated_experts" + ], + "type": "object" + }, + "mode": {"const":"normal"}, + "phase": {"enum":["decode","prefill"]}, + "required_publication": {"enum":["official","comparable-experimental"]}, + "resource_mode": {"const":"tuned"}, + "runner": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "shape": { + "additionalProperties": false, + "properties": { + "activation_profile": {"const":"canonical-counter-source-v3"}, + "dispatch_dtype": {"const":"bf16"}, + "eplb": {"type":"boolean"}, + "experts": {"minimum":1,"type":"integer"}, + "experts_per_rank": {"minimum":1,"type":"integer"}, + "hidden": {"minimum":1,"type":"integer"}, + "kernel_gen": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "num_logical_experts": {"minimum":1,"type":"integer"}, + "quant": { + "additionalProperties": false, + "properties": { + "combine_accum_dtype": {"minLength":1,"type":"string"}, + "combine_input_dtype": {"const":"bf16"}, + "combine_output_dtype": {"const":"bf16"}, + "combine_quant_mode": {"const":"none"}, + "scale_layout": {"type":"null"} + }, + "required": [ + "combine_accum_dtype", + "combine_input_dtype", + "combine_output_dtype", + "combine_quant_mode", + "scale_layout" + ], + "type": "object" + }, + "routing": {"enum":["uniform","zipf"]}, + "topk": {"minimum":1,"type":"integer"} + }, + "required": [ + "activation_profile", + "dispatch_dtype", + "eplb", + "experts", + "experts_per_rank", + "hidden", + "kernel_gen", + "num_logical_experts", + "quant", + "routing", + "topk" + ], + "type": "object" + }, + "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "workload_name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": [ + "attempt_ordinal", + "backend", + "eplb", + "ep_size", + "mode", + "phase", + "required_publication", + "resource_mode", + 
"runner", + "shape", + "suite", + "workload_name" + ], + "type": "object" + }, + "format": {"const":"collectivex.ep.v1"}, + "generated_at": {"format":"date-time","type":"string"}, + "identity": { + "additionalProperties": false, + "properties": { + "allocation_factors": { + "additionalProperties": false, + "properties": { + "artifact": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "execution_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "job": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "repo": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "run_attempt": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "run_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "runner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "source_sha": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"], + "type": "object" + }, + "allocation_id": {"pattern":"^cxallocation-v1-[0-9a-f]{64}$","type":"string"}, + "attempt_id": {"pattern":"^cxattempt-v1-[0-9a-f]{64}$","type":"string"}, + "attempt_ordinal": {"minimum":1,"type":"integer"}, + "case_factors": { + "additionalProperties": false, + "properties": { + "case": {"$ref":"#/$defs/scheduled_case"}, + "profile": { + "const": { + "activation_generator": "collectivex-activation-counter-v3", + "activation_profile": "canonical-counter-source-v3", + "combine_dtype": "bf16", + "combine_quant_mode": "none", + "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2", + "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1", + "contract": "layout-and-dispatch-v1", + "dtype": "bf16", + "eplb_planner": "greedy-rank-major-v1", + "eplb_redundant_experts": 32, + "eplb_reference_tokens_per_rank": 2048, + "mode": "normal", + "oracle_contract": "expert-specific-transform-v1", + "oracle_tolerances": 
"rtol=0.05,atol=0.02", + "placement": "packed", + "percentile_method": "nearest-rank", + "rank_reduction": "cross-rank-max-per-iteration", + "resource_mode": "tuned", + "routing_generator": "collectivex-routing-counter-v3", + "sampling_contract": "fixed-512-v1", + "seed": 67 + } + }, + "sku": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": ["case","profile","sku"], + "type": "object" + }, + "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"}, + "series_factors": { + "additionalProperties": false, + "properties": { + "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "implementation_contract_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "public_config_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "routing_control_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"}, + "image_digest": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "runtime_fingerprint_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "source_sha": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{40}$","type":"string"}]}, + "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "workload_id": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"} + }, + "required": [ + "backend", + "implementation_contract_sha256", + "public_config_sha256", + "routing_control_sha256", + "case_id", + "image_digest", + "runtime_fingerprint_sha256", + "source_sha", + "squash_sha256", + "workload_id" + ], + "type": "object" + }, + "series_id": {"pattern":"^cxseries-v1-[0-9a-f]{64}$","type":"string"} + }, + "required": [ + "allocation_factors", + "allocation_id", + "attempt_id", + "attempt_ordinal", + "case_factors", + "case_id", + "series_factors", + "series_id" + ], + "type": "object" + }, + "implementation": { + "additionalProperties": false, + "properties": { + "kernel_generation": 
{"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "provenance": { + "properties": { + "allow_hybrid_mode": {"const":false}, + "communication_backend": {"const":"nccl-device-lsa"}, + "deepep_fix_pr": {"const":630}, + "deepep_pr": {"const":605}, + "deterministic": {"type": "boolean"}, + "gin_enabled": {"const":false}, + "jit_cubins": { + "items": {"$ref":"#/$defs/deepep_v2_jit_cubin"}, + "maxItems": 5, + "minItems": 5, + "type": "array", + "uniqueItems": true + }, + "jit_kernel_keys": { + "items": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"}, + "maxItems": 3, + "minItems": 3, + "type": "array", + "uniqueItems": true + }, + "jit_random_seed": {"const":"collectivex-deepep-v2-fa8a9b1"}, + "jit_shared_objects": { + "items": { + "additionalProperties": false, + "properties": { + "kernel_key": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"}, + "rank_artifacts": { + "items": {"$ref":"#/$defs/hybrid_jit_rank_artifact"}, + "minItems": 1, + "type": "array" + } + }, + "required": ["kernel_key","rank_artifacts"], + "type": "object" + }, + "maxItems": 3, + "minItems": 3, + "type": "array" + }, + "num_experts": {"minimum": 1, "type": "integer"}, + "realized_config": {"$ref":"#/$defs/hybrid_realized_config"}, + "tuning_num_experts": {"minimum": 1, "type": "integer"}, + "uccl_dependency_versions": { + "additionalProperties": false, + "properties": { + "intervaltree": {"const":"3.1.0"}, + "nvidia-cuda-runtime-cu12": {"const":"12.9.79"}, + "sortedcontainers": {"const":"2.4.0"} + }, + "required": ["intervaltree","nvidia-cuda-runtime-cu12","sortedcontainers"], + "type": "object" + } + }, + "type": "object", + "propertyNames": { + "enum": [ + "allocated_qps", + "allow_hybrid_mode", + "allow_mnnvl", + "allow_multiple_reduction", + "api", + "api_signature_sha256", + "backend", + "backend_lineage", + "block_num", + "block_num_floored", + 
"block_num_target", + "branch", + "collective_library", + "combine_dtype", + "combine_warps", + "communication_backend", + "cuda_version", + "deepep_commit", + "deepep_distribution_version", + "deepep_fix_pr", + "deepep_pr", + "deepep_tree", + "deepep_version", + "deterministic", + "device_cus", + "device_sms", + "dispatch_dtype", + "dispatch_warps", + "enable_sdma", + "fmt_commit", + "gpus_per_node", + "gin_enabled", + "heap_size", + "impl", + "jit_cache_key", + "jit_cubins", + "jit_kernel_keys", + "jit_random_seed", + "jit_shared_objects", + "kernel_type", + "loaded_libraries", + "local_experts", + "logical_scaleout_ranks", + "logical_scaleup_ranks", + "mapping_variant", + "max_num_inp_token_per_rank", + "max_num_tokens", + "max_total_recv_tokens", + "mnnvl_comm", + "mode", + "mori_commit", + "nccl_communicator", + "nccl_package_version", + "nccl_version", + "num_experts", + "num_max_tokens_per_rank", + "num_nvl_bytes", + "num_qps", + "num_sms", + "nvshmem_package_version", + "path", + "physical_nvlink_ranks", + "physical_rdma_ranks", + "prefer_overlap_with_compute", + "reference_semantics", + "realized_config", + "requested_num_sms", + "resource_mode", + "routing_factor", + "routing_metadata", + "sm_fraction", + "top_k", + "torch_git_version", + "torch_version", + "transport", + "trtllm", + "tuned_source", + "tuning_num_experts", + "uccl_commit", + "uccl_dependency_versions", + "uccl_version", + "uccl_wrapper_commit", + "workspace" + ] + } + }, + "resource_profile": { + "additionalProperties": false, + "properties": { + "achieved_fraction": {}, + "comm_units_kind": {}, + "configured_units": {}, + "conformance_class": {}, + "device_units": {}, + "fixed_kernel": {}, + "nonconforming": {}, + "pareto_eligible": {}, + "persistent_bytes": {}, + "qps_per_rank": {}, + "requested_fraction": {}, + "tuned_source": {}, + "target_achieved_within_tol": {}, + "tolerance": {}, + "resource_class": {}, + "warps_combine": {}, + "warps_dispatch": {} + }, + "required": [ + 
"comm_units_kind", + "requested_fraction", + "configured_units", + "device_units", + "achieved_fraction", + "warps_dispatch", + "warps_combine", + "qps_per_rank", + "persistent_bytes", + "tuned_source", + "resource_class", + "conformance_class", + "tolerance", + "target_achieved_within_tol", + "nonconforming", + "fixed_kernel", + "pareto_eligible" + ], + "type": "object" + } + }, + "required": ["kernel_generation","name","provenance","resource_profile"], + "type": "object" + }, + "measurement": { + "additionalProperties": false, + "properties": { + "component_order_contract": {"const":"roundtrip-dispatch-activation-only-combine-v2"}, + "conditioning": { + "additionalProperties": false, + "properties": { + "contract": {"const":"fixed-phase-ramp-8-roundtrips-v1"}, + "ladder": {"items":{"minimum":1,"type":"integer"},"minItems":1,"type":"array"}, + "roundtrips_per_shape": {"const":8} + }, + "required": ["contract","ladder","roundtrips_per_shape"], + "type": "object" + }, + "contract": {"const":"layout-and-dispatch-v1"}, + "rows": { + "items": { + "additionalProperties": false, + "properties": { + "anomalies": { + "items": { + "additionalProperties": false, + "properties": { + "T": {"minimum":1,"type":"integer"}, + "component_floor_p50": {"minimum":0,"type":"number"}, + "isolated_sum_p99": {"minimum":0,"type":"number"}, + "ratio": {"minimum":0,"type":"number"}, + "roundtrip_p50": {"minimum":0,"type":"number"}, + "roundtrip_p99": {"minimum":0,"type":"number"}, + "threshold": {"minimum":0,"type":"number"}, + "type": {"enum":["roundtrip_gt_isolated_sum","roundtrip_lt_component_floor"]} + }, + "required": ["type","T"], + "type": "object" + }, + "type": "array" + }, + "components": { + "additionalProperties": false, + "properties": { + "combine": {"$ref":"#/$defs/component"}, + "dispatch": {"$ref":"#/$defs/component"}, + "isolated_sum": {"$ref":"#/$defs/component"}, + "roundtrip": {"$ref":"#/$defs/component"} + }, + "required": 
["combine","dispatch","isolated_sum","roundtrip"], + "type": "object" + }, + "correctness": { + "additionalProperties": false, + "properties": { + "contract": {"const":"expert-specific-transform-v1"}, + "max_relative_error": {"minimum":0,"type":"number"}, + "passed": {"type":"boolean"}, + "rank_evidence": { + "items": { + "additionalProperties": false, + "properties": { + "input_unchanged": {"type":"boolean"}, + "order_stable": {"type":"boolean"}, + "post_timing": {"$ref":"#/$defs/oracle"}, + "pre_timing": {"$ref":"#/$defs/oracle"}, + "rank": {"minimum":0,"type":"integer"} + }, + "required": ["input_unchanged","order_stable","post_timing","pre_timing","rank"], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "scope": {"const":"dispatch-metadata-and-transformed-combine"} + }, + "required": ["contract","max_relative_error","passed","rank_evidence","scope"], + "type": "object" + }, + "evidence_id": {"pattern":"^cxevidence-v1-[0-9a-f]{64}$","type":"string"}, + "global_tokens": {"minimum":1,"type":"integer"}, + "logical_bytes": { + "additionalProperties": false, + "properties": { + "combine": {"minimum":1,"type":"integer"}, + "dispatch": {"minimum":1,"type":"integer"}, + "roundtrip": {"minimum":1,"type":"integer"} + }, + "required": ["combine","dispatch","roundtrip"], + "type": "object" + }, + "point_id": {"pattern":"^cxpoint-v1-[0-9a-f]{64}$","type":"string"}, + "receive": { + "additionalProperties": false, + "properties": { + "max": {"minimum":0,"type":"integer"}, + "mean": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"integer"}, + "total": {"minimum":0,"type":"integer"} + }, + "required": ["max","mean","min","total"], + "type": "object" + }, + "routing": { + "additionalProperties": false, + "properties": { + "empty_expert_count": {"minimum":0,"type":"integer"}, + "empty_rank_count": {"minimum":0,"type":"integer"}, + "expert_assignment_rank_cv": {"minimum":0,"type":"number"}, + "expert_assignments_per_rank": 
{"items":{"minimum":0,"type":"integer"},"type":"array"}, + "expert_load_cv": {"minimum":0,"type":"number"}, + "expert_load_max": {"minimum":0,"type":"integer"}, + "expert_load_mean": {"minimum":0,"type":"number"}, + "expert_load_min": {"minimum":0,"type":"integer"}, + "fanout_histogram": {"items":{"minimum":0,"type":"integer"},"type":"array"}, + "fanout_max": {"minimum":1,"type":"integer"}, + "fanout_mean": {"minimum":0,"type":"number"}, + "fanout_min": {"minimum":1,"type":"integer"}, + "hash": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "hotspot_ratio": {"minimum":0,"type":"number"}, + "locality": { + "oneOf": [ + {"type":"null"}, + { + "additionalProperties": false, + "properties": { + "copies": {"minimum":0,"type":"integer"}, + "cross_domain_fraction": {"minimum":0,"type":"number"}, + "cross_node_fraction": {"minimum":0,"type":"number"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "local_rank_fraction": {"minimum":0,"type":"number"}, + "placement": {"const":"packed"}, + "same_node_fraction": {"minimum":0,"type":"number"}, + "same_scaleup_domain_fraction": {"minimum":0,"type":"number"}, + "scale_up_domain": {"minimum":1,"type":"integer"} + }, + "required": [ + "placement", + "local_rank_fraction", + "same_node_fraction", + "same_scaleup_domain_fraction", + "cross_node_fraction", + "cross_domain_fraction", + "gpus_per_node", + "scale_up_domain", + "copies" + ], + "type": "object" + } + ] + }, + "payload_copies_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"}, + "payload_rank_cv": {"minimum":0,"type":"number"}, + "routed_copies": {"minimum":1,"type":"integer"}, + "source_token_stats": { + "oneOf": [ + {"type":"null"}, + { + "additionalProperties": false, + "properties": { + "cv": {"minimum":0,"type":"number"}, + "empty_ranks": {"minimum":0,"type":"integer"}, + "max": {"minimum":0,"type":"integer"}, + "mean": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"integer"}, + "ranks": {"minimum":1,"type":"integer"}, + "total": 
{"minimum":0,"type":"integer"} + }, + "required": ["min","mean","max","cv","empty_ranks","total","ranks"], + "type": "object" + } + ] + } + }, + "required": [ + "empty_expert_count", + "empty_rank_count", + "expert_assignment_rank_cv", + "expert_assignments_per_rank", + "expert_load_cv", + "expert_load_max", + "expert_load_mean", + "expert_load_min", + "fanout_histogram", + "fanout_max", + "fanout_mean", + "fanout_min", + "hash", + "hotspot_ratio", + "locality", + "payload_copies_per_rank", + "payload_rank_cv", + "routed_copies", + "source_token_stats" + ], + "type": "object" + }, + "sample_histograms": { + "additionalProperties": false, + "properties": { + "combine": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]}, + "dispatch": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]}, + "roundtrip": {"$ref":"#/$defs/histogram"} + }, + "required": ["dispatch","combine","roundtrip"], + "type": "object" + }, + "sample_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "token_rate_at_latency_percentile": {"$ref":"#/$defs/percentiles"}, + "tokens_per_rank": {"minimum":1,"type":"integer"} + }, + "required": [ + "anomalies", + "components", + "correctness", + "evidence_id", + "global_tokens", + "logical_bytes", + "point_id", + "receive", + "routing", + "sample_histograms", + "sample_sha256", + "token_rate_at_latency_percentile", + "tokens_per_rank" + ], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "sampling": { + "additionalProperties": false, + "properties": { + "contract": {"const":"fixed-512-v1"}, + "iterations_per_trial": {"const":8}, + "percentile_method": {"const":"nearest-rank"}, + "reduction": {"const":"cross-rank-max-per-iteration"}, + "samples_per_component": {"const":512}, + "trials": {"const":64}, + "warmup_iterations": {"const":32}, + "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"} + }, + "required": [ + "contract", + "iterations_per_trial", + "percentile_method", + "reduction", + 
"samples_per_component", + "trials", + "warmup_iterations", + "warmup_semantics" + ], + "type": "object" + }, + "source_allocation": {"const":"even"} + }, + "required": [ + "component_order_contract", + "conditioning", + "contract", + "rows", + "sampling", + "source_allocation" + ], + "type": "object" + }, + "outcome": { + "additionalProperties": false, + "properties": { + "publication_status": {"enum":["diagnostic","invalid"]}, + "reasons": {"items":{"type":"string"},"type":"array"}, + "status": {"enum":["success","invalid"]}, + "validity": { + "additionalProperties": false, + "properties": { + "anomaly_free": {"type":"boolean"}, + "execution_status": {"enum":["complete","failed"]}, + "measurement_conformance": {"enum":["conformant","nonconformant"]}, + "provenance_complete": {"type":"boolean"}, + "resource_conformance": {"minLength":1,"type":"string"}, + "sampling_conformance": {"enum":["conformant","nonconformant"]}, + "semantic_correctness": {"enum":["pass","fail"]}, + "workload_identity": {"enum":["consistent-across-ranks","inconsistent"]}, + "workload_source": {"enum":["canonical-serialized","seeded-runtime"]} + }, + "required": [ + "execution_status", + "semantic_correctness", + "workload_identity", + "workload_source", + "measurement_conformance", + "sampling_conformance", + "resource_conformance", + "provenance_complete", + "anomaly_free" + ], + "type": "object" + } + }, + "required": ["publication_status","reasons","status","validity"], + "type": "object" + }, + "provenance": { + "additionalProperties": false, + "properties": { + "command": {"minLength":1,"type":"string"}, + "distributed_launcher": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "git_run": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/git_run"}]}, + "image": { + "additionalProperties": false, + "properties": { + "arch": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "digest": { + "oneOf": 
[{"type":"null"},{"pattern":"^sha256:[0-9a-f]{64}$","type":"string"}] + }, + "digest_verified": {"type":"boolean"}, + "reference": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]} + }, + "required": ["arch","digest","digest_verified","reference","squash_sha256"], + "type": "object" + }, + "redaction": {"const":"sanitized-v1"} + }, + "required": ["command","distributed_launcher","git_run","image","redaction"], + "type": "object" + }, + "record_type": {"const":"case-attempt"}, + "runtime_fingerprint": { + "additionalProperties": false, + "properties": { + "accelerator_runtime": { + "additionalProperties": false, + "properties": { + "kind": {"enum":["cuda","hip"]}, + "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["kind","version"], + "type": "object" + }, + "collective_library": { + "additionalProperties": false, + "properties": { + "kind": {"enum":["nccl","rccl"]}, + "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["kind","version"], + "type": "object" + }, + "device": { + "additionalProperties": false, + "properties": { + "arch": {"minLength":1,"type":"string"}, + "compute_units": {"minimum":1,"type":"integer"}, + "memory_bytes": {"minimum":1,"type":"integer"}, + "product": {"minLength":1,"type":"string"}, + "warp_size": {"minimum":1,"type":"integer"} + }, + "required": ["arch","compute_units","memory_bytes","product","warp_size"], + "type": "object" + }, + "driver_version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "framework": { + "additionalProperties": false, + "properties": {"kind":{"const":"torch"},"version":{"minLength":1,"type":"string"}}, + "required": ["kind","version"], + "type": "object" + }, + "machine": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "python_version": {"minLength":1,"type":"string"}, + "vendor": 
{"enum":["nvidia","amd"]} + }, + "required": [ + "accelerator_runtime", + "collective_library", + "device", + "driver_version", + "framework", + "machine", + "python_version", + "vendor" + ], + "type": "object" + }, + "sample_artifact": { + "additionalProperties": false, + "properties": { + "bytes": {"minimum":1,"type":"integer"}, + "format": {"const":"collectivex.samples.v1"}, + "path": {"pattern":"^[A-Za-z0-9_.-]+$","type":"string"}, + "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["bytes","format","path","sha256"], + "type": "object" + }, + "schema_version": {"const":1}, + "topology": { + "additionalProperties": false, + "properties": { + "device_count": {"minimum":1,"type":"integer"}, + "device_product": {"minLength":1,"type":"string"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "nodes": {"minimum":1,"type":"integer"}, + "placement": {"const":"packed"}, + "realized_placement": { + "additionalProperties": false, + "properties": { + "gpus_per_node": {"minimum":1,"type":"integer"}, + "nodes": {"minimum":1,"type":"integer"}, + "ranks_per_node": {"minimum":1,"type":"integer"}, + "unique_local_ranks": {"const":true}, + "valid": {"const":true} + }, + "required": ["gpus_per_node","nodes","ranks_per_node","unique_local_ranks","valid"], + "type": "object" + }, + "scale_up_domain": {"minimum":1,"type":"integer"}, + "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "world_size": {"minimum":1,"type":"integer"} + }, + "required": [ + "device_count", + "device_product", + "gpus_per_node", + "nodes", + "placement", + "realized_placement", + "scale_up_domain", + "topology_class", + "transport", + "world_size" + ], + "type": "object" + }, + "workload": { + "additionalProperties": false, + "properties": { + "activation_generator": {"const":"collectivex-activation-counter-v3"}, + "activation_identity": 
{"pattern":"^[0-9a-f]{64}$","type":"string"}, + "activation_profile": {"const":"canonical-counter-source-v3"}, + "cross_rank_consistent": {"const":true}, + "manifest_checksums": { + "oneOf": [ + {"type":"null"}, + { + "additionalProperties": { + "additionalProperties": false, + "properties": { + "topk_idx": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "topk_weights": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "trace": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["topk_idx", "topk_weights", "trace"], + "type": "object" + }, + "type": "object" + } + ] + }, + "members": { + "oneOf": [ + {"type":"null"}, + { + "items": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}, + "minItems": 1, + "uniqueItems": true, + "type": "array" + } + ] + }, + "routing_generator": {"const":"collectivex-routing-counter-v3"}, + "source": {"enum":["canonical-serialized","seeded-runtime"]}, + "trace_hashes": { + "items": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "minItems": 1, + "type": "array" + }, + "trace_signature": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "workload_id": { + "oneOf": [{"type":"null"},{"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}] + } + }, + "required": [ + "activation_generator", + "activation_identity", + "activation_profile", + "cross_rank_consistent", + "manifest_checksums", + "members", + "routing_generator", + "source", + "trace_hashes", + "trace_signature", + "workload_id" + ], + "type": "object" + } + }, + "required": [ + "case", + "format", + "generated_at", + "identity", + "implementation", + "measurement", + "outcome", + "provenance", + "record_type", + "runtime_fingerprint", + "sample_artifact", + "schema_version", + "topology", + "workload" + ], + "title": "CollectiveX raw case attempt v1", + "type": "object" +} diff --git a/experimental/CollectiveX/schemas/samples-v1.schema.json b/experimental/CollectiveX/schemas/samples-v1.schema.json new file mode 100644 index 0000000000..b9a1df0541 --- /dev/null +++ 
b/experimental/CollectiveX/schemas/samples-v1.schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/samples-v1.schema.json", + "title": "CollectiveX exact private samples v1", + "type": "object", + "additionalProperties": false, + "required": ["allocation_id","attempt_id","case_id","format","points","sampling","schema_version","series_id"], + "properties": { + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "case_id": {"$ref": "#/$defs/caseId"}, + "format": {"const": "collectivex.samples.v1"}, + "points": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["components","evidence_id","point_id","sample_sha256","tokens_per_rank"], + "properties": { + "components": { + "type": "object", + "additionalProperties": false, + "required": ["combine","dispatch","roundtrip"], + "properties": { + "combine": {"$ref": "#/$defs/component"}, + "dispatch": {"$ref": "#/$defs/component"}, + "roundtrip": {"$ref": "#/$defs/component"} + } + }, + "evidence_id": {"$ref": "#/$defs/evidenceId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "sample_sha256": {"$ref": "#/$defs/sha256"}, + "tokens_per_rank": {"type": "integer","minimum": 1} + } + } + }, + "sampling": { + "type": "object", + "additionalProperties": false, + "required": ["iterations_per_trial","reduction","trials"], + "properties": { + "iterations_per_trial": {"const": 8}, + "reduction": {"const": "cross-rank-max-per-iteration"}, + "trials": {"const": 64} + } + }, + "schema_version": {"const": 1}, + "series_id": {"$ref": "#/$defs/seriesId"} + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "pointId": {"type": "string","pattern": 
"^cxpoint-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "component": { + "type": "object", + "additionalProperties": false, + "required": ["availability","sample_count","trials"], + "properties": { + "availability": {"enum": ["measured","unavailable"]}, + "sample_count": {"type": "integer","minimum": 0,"maximum": 512}, + "trials": { + "oneOf": [ + {"type": "null"}, + { + "type": "array", + "minItems": 64, + "maxItems": 64, + "items": {"type": "array","minItems": 8,"maxItems": 8,"items": {"type": "number","minimum": 0}} + } + ] + } + } + } + } +} diff --git a/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json new file mode 100644 index 0000000000..9c28613064 --- /dev/null +++ b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json @@ -0,0 +1,246 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/terminal-outcome-v1.schema.json", + "title": "CollectiveX terminal outcome v1", + "type": "object", + "additionalProperties": false, + "required": ["case","format","generated_at","identity","outcome","provenance","record_type","schema_version"], + "properties": { + "case": {"$ref": "#/$defs/case"}, + "format": {"const": "collectivex.terminal.v1"}, + "generated_at": {"type": "string","format": "date-time"}, + "identity": { + "type": "object", + "additionalProperties": false, + "required": ["allocation_factors","allocation_id","attempt_id","attempt_ordinal","case_factors","case_id"], + "properties": { + "allocation_factors": {"$ref": "#/$defs/allocationFactors"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "attempt_ordinal": {"type": "integer","minimum": 
1}, + "case_factors": { + "type": "object", + "additionalProperties": false, + "required": ["case","profile","sku"], + "properties": { + "case": {"$ref": "#/$defs/case"}, + "profile": { + "const": { + "activation_generator": "collectivex-activation-counter-v3", + "activation_profile": "canonical-counter-source-v3", + "combine_dtype": "bf16", + "combine_quant_mode": "none", + "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2", + "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1", + "contract": "layout-and-dispatch-v1", + "dtype": "bf16", + "eplb_planner": "greedy-rank-major-v1", + "eplb_redundant_experts": 32, + "eplb_reference_tokens_per_rank": 2048, + "mode": "normal", + "oracle_contract": "expert-specific-transform-v1", + "oracle_tolerances": "rtol=0.05,atol=0.02", + "placement": "packed", + "percentile_method": "nearest-rank", + "rank_reduction": "cross-rank-max-per-iteration", + "resource_mode": "tuned", + "routing_generator": "collectivex-routing-counter-v3", + "sampling_contract": "fixed-512-v1", + "seed": 67 + } + }, + "sku": {"$ref": "#/$defs/safeId"} + } + }, + "case_id": {"$ref": "#/$defs/caseId"} + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "required": ["failure_mode","reason","return_code","status"], + "properties": { + "failure_mode": {"$ref": "#/$defs/safeId"}, + "reason": {"type": "string","minLength": 1,"maxLength": 240}, + "return_code": {"type": "integer","minimum": 0}, + "status": {"enum": ["failed","invalid","unsupported"]} + } + }, + "provenance": { + "type": "object", + "additionalProperties": false, + "required": ["git_run","control_sha256","redaction","source"], + "properties": { + "git_run": {"$ref": "#/$defs/gitRun"}, + "control_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "redaction": {"const": "sanitized-v1"}, + "source": { + "enum": [ + "runtime-emitter", + "post-emit-command", + "matrix-capability-resolver" + ] + } + } + }, + "record_type": 
{"const": "terminal-outcome"}, + "schema_version": {"const": 1} + }, + "allOf": [ + { + "oneOf": [ + { + "properties": { + "provenance": { + "properties": {"source": {"const": "runtime-emitter"}} + }, + "outcome": {"$ref": "#/$defs/runtimeOutcome"} + } + }, + { + "properties": { + "provenance": { + "properties": {"source": {"const": "post-emit-command"}} + }, + "outcome": {"$ref": "#/$defs/postEmitOutcome"} + } + }, + { + "properties": { + "provenance": { + "properties": {"source": {"const": "matrix-capability-resolver"}} + }, + "outcome": {"$ref": "#/$defs/capabilityOutcome"} + } + } + ] + } + ], + "$defs": { + "runtimeOutcome": { + "type": "object", + "properties": {"status": {"const": "failed"}}, + "allOf": [ + { + "oneOf": [ + {"properties": {"failure_mode": {"const": "setup"}, "reason": {"const": "launcher-setup-failed"}}}, + {"properties": {"failure_mode": {"const": "repository-stage"}, "reason": {"const": "repository-staging-failed"}}}, + {"properties": {"failure_mode": {"const": "registry-verification"}, "reason": {"const": "container-registry-verification-failed"}}}, + {"properties": {"failure_mode": {"const": "scheduler-allocation"}, "reason": {"const": "scheduler-allocation-failed"}}}, + {"properties": {"failure_mode": {"const": "container-import"}, "reason": {"const": "container-image-preparation-failed"}}}, + {"properties": {"failure_mode": {"const": "container-hash"}, "reason": {"const": "container-image-identity-failed"}}}, + {"properties": {"failure_mode": {"const": "container-launch"}, "reason": {"const": "container-runtime-launch-failed"}}}, + {"properties": {"failure_mode": {"const": "backend-setup"}, "reason": {"const": "backend-setup-failed"}}}, + {"properties": {"failure_mode": {"const": "artifact-collection"}, "reason": {"const": "artifact-collection-failed"}}}, + {"properties": {"failure_mode": {"const": "runtime-identity"}, "reason": {"const": "runtime-identity-mismatch"}}}, + {"properties": {"failure_mode": {"const": "timeout"}, "reason": 
{"const": "execution-timeout"}}}, + {"properties": {"failure_mode": {"const": "deadlock"}, "reason": {"const": "execution-deadlock"}}}, + {"properties": {"failure_mode": {"const": "execution"}, "reason": {"const": "distributed-command-failed"}}} + ] + } + ] + }, + "postEmitOutcome": { + "type": "object", + "properties": { + "status": {"const": "failed"}, + "failure_mode": {"enum": ["runtime-identity", "timeout", "deadlock", "execution"]}, + "reason": {"const": "post-emit-distributed-command-failed"} + } + }, + "capabilityOutcome": { + "type": "object", + "properties": { + "status": {"const": "unsupported"}, + "failure_mode": {"const": "capability"}, + "reason": { + "enum": [ + "backend-platform-unsupported", + "backend-token-capacity" + ] + } + } + }, + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "nullableText": {"oneOf": [{"type": "null"},{"type": "string","minLength": 1}]}, + "allocationFactors": { + "type": "object", + "additionalProperties": false, + "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"], + "properties": { + "artifact": {"$ref": "#/$defs/nullableText"}, + "execution_id": {"$ref": "#/$defs/nullableText"}, + "job": {"$ref": "#/$defs/nullableText"}, + "repo": {"$ref": "#/$defs/nullableText"}, + "run_attempt": {"$ref": "#/$defs/nullableText"}, + "run_id": {"$ref": "#/$defs/nullableText"}, + "runner": {"$ref": "#/$defs/nullableText"}, + "source_sha": {"$ref": "#/$defs/nullableText"} + } + }, + "gitRun": { + "type": "object", + "additionalProperties": false, + "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"], + "properties": { + "artifact": {"type": 
"string","minLength": 1}, + "job": {"type": "string","minLength": 1}, + "ref": {"type": "string","minLength": 1}, + "repo": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"}, + "run_attempt": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"} + } + }, + "case": { + "type": "object", + "additionalProperties": false, + "required": [ + "backend", + "canonical", + "eplb", + "ep", + "experts", + "gpus_per_node", + "hidden", + "ladder", + "nodes", + "phase", + "required_publication", + "routing", + "samples_per_point", + "scale_up_domain", + "suite", + "timing", + "topk", + "warmup_semantics", + "workload" + ], + "properties": { + "backend": {"$ref": "#/$defs/safeId"}, + "canonical": {"const": true}, + "eplb": {"type": "boolean"}, + "ep": {"type": "integer","minimum": 1}, + "experts": {"type": "integer","minimum": 1}, + "gpus_per_node": {"type": "integer","minimum": 1}, + "hidden": {"type": "integer","minimum": 1}, + "ladder": {"type": "string","pattern": "^[1-9][0-9]*( [1-9][0-9]*)*$"}, + "nodes": {"type": "integer","minimum": 1}, + "phase": {"enum": ["decode","prefill"]}, + "required_publication": {"enum": ["official","comparable-experimental"]}, + "routing": {"enum": ["uniform","zipf"]}, + "samples_per_point": {"const": 512}, + "scale_up_domain": {"type": "integer","minimum": 1}, + "suite": {"$ref": "#/$defs/safeId"}, + "timing": {"const": "8:64:32"}, + "topk": {"type": "integer","minimum": 1}, + "warmup_semantics": {"const": "full-roundtrip-before-each-component-trial-point-v1"}, + "workload": {"$ref": "#/$defs/safeId"} + } + } + } +} diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py new file mode 100644 index 0000000000..3752db6b9d --- /dev/null +++ b/experimental/CollectiveX/summarize.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""Render a small native-v1 shard summary and gate on a 
def load_results(directory: str, runner: str | None, timestamp: str | None) -> list[dict]:
    """Collect the validated native-v1 outcome documents stored in ``directory``.

    Only ``*.json`` files directly inside ``directory`` are considered, visited in
    sorted path order. When ``runner`` is truthy the file name must start with
    ``"<runner>_"``; when ``timestamp`` is truthy it must occur in the file name.

    A file whose ``format`` equals ``contracts.RAW_FORMAT`` is re-read through
    ``contracts.load_raw_attempt``; one whose ``format`` equals
    ``contracts.TERMINAL_FORMAT`` goes through
    ``contracts.validate_terminal_document``. Any other format is ignored, and a
    file that raises ``contracts.ContractError`` or ``OSError`` is skipped.
    """
    required_prefix = f"{runner}_" if runner else None
    loaded: list[dict] = []
    for candidate in sorted(Path(directory).glob("*.json")):
        file_name = candidate.name
        if required_prefix is not None and not file_name.startswith(required_prefix):
            continue
        if timestamp and timestamp not in file_name:
            continue
        try:
            parsed = contracts.strict_load(candidate)
            declared_format = parsed.get("format")
            if declared_format == contracts.RAW_FORMAT:
                loaded.append(contracts.load_raw_attempt(candidate))
            elif declared_format == contracts.TERMINAL_FORMAT:
                loaded.append(contracts.validate_terminal_document(parsed))
        except (contracts.ContractError, OSError):
            # Best effort: an unreadable or invalid file must not hide the others.
            continue
    return loaded


def _identity(document: dict) -> tuple[str, str, str, str, bool, str, int]:
    """Return the ``(sku, suite, routing, phase, eplb, tier, ep)`` key of a document.

    Raw attempts keep routing under ``case.shape.routing`` and the EPLB flag under
    ``case.eplb.enabled``; every other document keeps both directly on ``case``.
    The EP degree is ``case["ep_size"]``, else ``case["ep"]``, else ``0``.
    """
    case = document["case"]
    is_raw = document["format"] == contracts.RAW_FORMAT
    routing = case["shape"]["routing"] if is_raw else case["routing"]
    eplb = case["eplb"]["enabled"] if is_raw else case["eplb"]
    sku = document["identity"]["case_factors"]["sku"]
    return (
        sku,
        case["suite"],
        routing,
        case["phase"],
        eplb,
        case["required_publication"],
        case.get("ep_size", case.get("ep", 0)),
    )


def _headline(document: dict) -> tuple[int | str, float | str, float | str]:
    """Return ``(tokens_per_rank, p50, p99)`` of the headline roundtrip latency.

    Documents that are not raw attempts have no measurements and yield three
    ``"-"`` placeholders. Otherwise the first row with ``tokens_per_rank == 64``
    is used, falling back to the middle row of ``measurement.rows``.
    """
    if document["format"] != contracts.RAW_FORMAT:
        return "-", "-", "-"
    rows = document["measurement"]["rows"]
    # The fallback is resolved first, exactly like an eagerly evaluated default.
    chosen = rows[len(rows) // 2]
    for candidate in rows:
        if candidate["tokens_per_rank"] == 64:
            chosen = candidate
            break
    percentiles = chosen["components"]["roundtrip"]["percentiles_us"]
    return chosen["tokens_per_rank"], percentiles["p50"], percentiles["p99"]


def render(documents: list[dict], markdown: bool) -> str:
    """Render the documents, sorted by ``_identity``, as Markdown or plain text.

    The Markdown form is a table with one row per document plus a trailing note
    when there are no documents; the plain form is a two-line banner followed by
    one line per document.
    """
    ordered = sorted(documents, key=_identity)
    if not markdown:
        report = ["CollectiveX EP results", "======================"]
        for entry in ordered:
            sku, suite, routing, phase, eplb, tier, ep = _identity(entry)
            backend = entry["case"]["backend"]
            token, _, p99 = _headline(entry)
            report.append(
                f" {sku:<10} {backend:<16} {suite:<13} {phase:<7} "
                f"{routing}{'+eplb' if eplb else ''} {tier} ep{ep} "
                f"{entry['outcome']['status']} T={token} roundtrip_p99_us={p99}"
            )
        return "\n".join(report)

    table = [
        "## CollectiveX EP results",
        "",
        "| sku | backend | suite | phase | routing | tier | ep | outcome | T* | p50 us | p99 us |",
        "|---|---|---|---|---|---|--:|---|--:|--:|--:|",
    ]
    for entry in ordered:
        sku, suite, routing, phase, eplb, tier, ep = _identity(entry)
        backend = entry["case"]["backend"]
        token, p50, p99 = _headline(entry)
        table.append(
            f"| {sku} | `{backend}` | {suite} | {phase} | "
            f"{routing}{'+eplb' if eplb else ''} | {tier} | {ep} | "
            f"{entry['outcome']['status']} | {token} | {p50} | {p99} |"
        )
    if not ordered:
        table.append("\n> No valid native v1 outcome documents found.")
    return "\n".join(table)


def main() -> int:
    """Print the summary and return the process exit status.

    Markdown mode always returns 0. Plain mode returns 0 only when at least one
    loaded raw attempt has ``outcome.status == "success"``, otherwise 1.
    """
    cli = argparse.ArgumentParser(description="Summarize CollectiveX native v1 outcomes")
    cli.add_argument("--results-dir", default="results")
    cli.add_argument("--runner")
    cli.add_argument("--ts")
    cli.add_argument("--markdown", action="store_true")
    options = cli.parse_args()

    documents = load_results(options.results_dir, options.runner, options.ts)
    print(render(documents, options.markdown))
    if options.markdown:
        return 0
    for document in documents:
        if (
            document["format"] == contracts.RAW_FORMAT
            and document["outcome"]["status"] == "success"
        ):
            return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
+ +The promoted v1 profile is intentionally narrow: normal-mode BF16, +layout-and-dispatch-v1, tuned resources, and unquantized BF16 combine. Those +constants are runtime defaults, not matrix axes. The matrix contains only the +dimensions that vary between scheduled cases. +""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import itertools +import json +import os +from pathlib import Path +import re +import sys +from typing import Any + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) +sys.path.insert(0, str(HERE / "tests")) + +try: # Shard extraction on GPU runners is intentionally stdlib-only. + import yaml # type: ignore +except ModuleNotFoundError: # pragma: no cover - exercised by the workflow environment + yaml = None + +import capability as cap # noqa: E402 +import contracts # noqa: E402 +import ep_harness # noqa: E402 +import identity # noqa: E402 + + +EP_TIMING_PROFILE = ( + f"{ep_harness.TIMED_ITERS_PER_TRIAL}:{ep_harness.TRIALS_PER_POINT}:" + f"{ep_harness.WARMUP_ITERS_PER_TRIAL}" +) +V1_PROFILE = dict(identity.V1_CASE_PROFILE) +V1_WORKLOAD = ("deepseek-v3-v1", 7168, 8, 256) +V1_SUITE_CONTRACTS = { + "ep-core-v1": { + "publication": "official", + "coordinates": {("decode", "uniform", False), ("prefill", "uniform", False)}, + "ladders": { + "decode": tuple(ep_harness.DECODE_LADDER), + "prefill": (256, 512), + }, + }, + "ep-routing-v1": { + "publication": "comparable-experimental", + "coordinates": { + ("decode", "zipf", False), ("decode", "zipf", True), + ("prefill", "zipf", False), ("prefill", "zipf", True), + }, + "ladders": {"decode": (128,), "prefill": (512,)}, + }, +} +IDENTIFIER = re.compile(r"[a-z0-9][a-z0-9.-]*") +SUITE_FIELDS = { + "ep_degrees", "eplb", "phases", "platforms", "required_publication", "routings", "token_points", + "token_points_decode", "token_points_prefill", "workloads", +} +SUITE_REQUIRED = { + "phases", "platforms", "required_publication", "routings", 
"workloads", +} + + +class MatrixError(ValueError): + """A matrix or shard-control document violates the execution contract.""" + + +if yaml is not None: + class _UniqueKeyLoader(yaml.SafeLoader): + pass + + def _unique_mapping(loader: Any, node: Any, deep: bool = False) -> dict[Any, Any]: + result: dict[Any, Any] = {} + for key_node, value_node in node.value: + key = loader.construct_object(key_node, deep=deep) + if key in result: + raise SystemExit(f"duplicate YAML key {key!r} at line {key_node.start_mark.line + 1}") + result[key] = loader.construct_object(value_node, deep=deep) + return result + + _UniqueKeyLoader.add_constructor( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _unique_mapping + ) + + +def _load(name: str) -> dict[str, Any]: + if yaml is None: + raise SystemExit("matrix generation requires PyYAML; shard extraction does not") + try: + with (HERE / "configs" / name).open() as fh: + document = yaml.load(fh, Loader=_UniqueKeyLoader) + except yaml.YAMLError as exc: + raise SystemExit(f"configs/{name} is not valid YAML: {exc}") from exc + if not isinstance(document, dict): + raise SystemExit(f"configs/{name} must contain a YAML object") + return document + + +def _workload_registry(workloads: dict[str, Any]) -> dict[str, dict[str, Any]]: + return { + name: cfg + for section in ("synthetic", "model_derived") + for name, cfg in (workloads.get(section) or {}).items() + } + + +def _fields(value: Any, path: str, allowed: set[str], required: set[str]) -> dict[str, Any]: + if not isinstance(value, dict): + raise SystemExit(f"{path} must be an object") + if any(not isinstance(key, str) for key in value): + raise SystemExit(f"{path} field names must be strings") + unknown, missing = set(value) - allowed, required - set(value) + if unknown or missing: + raise SystemExit(f"{path} fields: unknown={sorted(unknown)}, missing={sorted(missing)}") + return value + + +def _list(value: Any, path: str, item_type: type, allowed: set[Any] | None = None) -> list[Any]: + 
if (not isinstance(value, list) or not value + or any(type(item) is not item_type for item in value) + or len(value) != len(set(value)) + or (allowed is not None and any(item not in allowed for item in value))): + raise SystemExit(f"{path} must be a non-empty unique list of valid {item_type.__name__}s") + return value + + +def validate_config_documents( + suites_document: dict[str, Any], workloads: dict[str, Any] +) -> None: + """Reject configuration that is ambiguous, unused, or outside the v1 grid.""" + _fields( + suites_document, "configs/suites.yaml", + {"schema_version", "suites"}, {"schema_version", "suites"}, + ) + _fields( + workloads, "configs/workloads.yaml", + {"schema_version", "synthetic", "model_derived"}, {"schema_version"}, + ) + if type(suites_document["schema_version"]) is not int or suites_document["schema_version"] != 1: + raise SystemExit("configs/suites.yaml schema_version must be integer 1") + if type(workloads["schema_version"]) is not int or workloads["schema_version"] != 1: + raise SystemExit("configs/workloads.yaml schema_version must be integer 1") + registry: dict[str, dict[str, Any]] = {} + for section, expert_field in ( + ("synthetic", "experts"), + ("model_derived", "routed_experts"), + ): + entries = workloads.get(section, {}) + if not isinstance(entries, dict): + raise SystemExit(f"workloads.{section} must be an object") + for name, value in entries.items(): + if not isinstance(name, str) or not IDENTIFIER.fullmatch(name) or name in registry: + raise SystemExit(f"workloads.{section} has invalid or duplicate name {name!r}") + fields = {"hidden", "topk", expert_field, "verified_against"} + config = _fields(value, f"workload {name}", fields, fields - {"verified_against"}) + dimensions = [config[key] for key in ("hidden", "topk", expert_field)] + if any(type(item) is not int or item <= 0 for item in dimensions): + raise SystemExit(f"workload {name} dimensions must be positive integers") + if dimensions[1] > dimensions[2]: + raise 
def validate_config_documents(
    suites_document: dict[str, Any], workloads: dict[str, Any]
) -> None:
    """Reject configuration that is ambiguous, unused, or outside the v1 grid.

    Checks run in a fixed order and the first violation terminates the process
    through ``SystemExit`` with a message naming the offending entry:

    1. top-level field names and ``schema_version`` of both documents;
    2. every workload in the ``synthetic`` and ``model_derived`` sections;
    3. every suite, including its workload references, platforms, phases,
       routings, EPLB flags, publication tier, EP degrees and token ladders;
    4. that no registered workload is left unreferenced by the suites.

    Returns ``None`` when every check passes.
    """
    _fields(
        suites_document, "configs/suites.yaml",
        {"schema_version", "suites"}, {"schema_version", "suites"},
    )
    _fields(
        workloads, "configs/workloads.yaml",
        {"schema_version", "synthetic", "model_derived"}, {"schema_version"},
    )
    # type(...) is int (rather than isinstance) also rejects True, which equals 1.
    if type(suites_document["schema_version"]) is not int or suites_document["schema_version"] != 1:
        raise SystemExit("configs/suites.yaml schema_version must be integer 1")
    if type(workloads["schema_version"]) is not int or workloads["schema_version"] != 1:
        raise SystemExit("configs/workloads.yaml schema_version must be integer 1")
    # Names are unique across both sections: one registry is shared by the two passes.
    registry: dict[str, dict[str, Any]] = {}
    for section, expert_field in (
        ("synthetic", "experts"),
        ("model_derived", "routed_experts"),
    ):
        # A section that is present but not a mapping (including an explicit null) is rejected.
        entries = workloads.get(section, {})
        if not isinstance(entries, dict):
            raise SystemExit(f"workloads.{section} must be an object")
        for name, value in entries.items():
            if not isinstance(name, str) or not IDENTIFIER.fullmatch(name) or name in registry:
                raise SystemExit(f"workloads.{section} has invalid or duplicate name {name!r}")
            # verified_against is the only optional workload field.
            fields = {"hidden", "topk", expert_field, "verified_against"}
            config = _fields(value, f"workload {name}", fields, fields - {"verified_against"})
            # Order matters below: index 1 is topk, index 2 is the expert count.
            dimensions = [config[key] for key in ("hidden", "topk", expert_field)]
            if any(type(item) is not int or item <= 0 for item in dimensions):
                raise SystemExit(f"workload {name} dimensions must be positive integers")
            if dimensions[1] > dimensions[2]:
                raise SystemExit(f"workload {name}.topk exceeds its expert count")
            source = config.get("verified_against")
            if source is not None and (not isinstance(source, str) or not source.strip()):
                raise SystemExit(f"workload {name}.verified_against must be a non-empty string")
            registry[name] = config
    if not registry:
        raise SystemExit("configs/workloads.yaml must define at least one workload")

    suites = suites_document["suites"]
    if not isinstance(suites, dict) or not suites:
        raise SystemExit("configs/suites.yaml suites must be a non-empty object")
    # Workload names used by at least one suite; compared with the registry at the end.
    referenced: set[str] = set()
    for name, value in suites.items():
        if not isinstance(name, str) or not IDENTIFIER.fullmatch(name):
            raise SystemExit(f"invalid suite name {name!r}")
        suite = _fields(value, f"suite {name}", SUITE_FIELDS, SUITE_REQUIRED)
        suite_workloads = _list(suite["workloads"], f"suite {name}.workloads", str)
        unknown = sorted(set(suite_workloads) - set(registry))
        if unknown:
            raise SystemExit(f"suite {name}: unknown workloads {unknown}")
        referenced.update(suite_workloads)
        # Platform names must be keys of cap.PLATFORMS.
        platforms = _list(
            suite["platforms"], f"suite {name}.platforms", str, set(cap.PLATFORMS)
        )
        phases = _list(suite["phases"], f"suite {name}.phases", str, {"decode", "prefill"})
        routings = _list(suite["routings"], f"suite {name}.routings", str, {"uniform", "zipf"})
        # eplb is optional and defaults to "disabled only".
        eplb = _list(suite.get("eplb", [False]), f"suite {name}.eplb", bool)
        # Enabling EPLB requires the suite's routing list to be exactly ["zipf"].
        if True in eplb and routings != ["zipf"]:
            raise SystemExit(f"suite {name}: EPLB is only valid for Zipf routing")
        if suite["required_publication"] not in {"official", "comparable-experimental"}:
            raise SystemExit(f"suite {name}.required_publication is invalid")
        if suite["required_publication"] == "official":
            # Official suites may only use workloads carrying a truthy verified_against.
            unverified = [item for item in suite_workloads if not registry[item].get("verified_against")]
            if unverified:
                raise SystemExit(f"suite {name}: official workloads need verified_against: {unverified}")
        if "ep_degrees" in suite:
            degrees = _list(suite["ep_degrees"], f"suite {name}.ep_degrees", int)
            if any(degree <= 0 for degree in degrees):
                raise SystemExit(f"suite {name}.ep_degrees must be positive")
            # Every requested degree must be listed for every platform of the suite.
            for platform in platforms:
                if not set(degrees).issubset(cap.PLATFORMS[platform]["ep_degrees"]):
                    raise SystemExit(f"suite {name}: invalid EP degree for {platform}")
        # A per-phase ladder for a phase the suite does not run can never be used.
        for phase in {"decode", "prefill"} - set(phases):
            if f"token_points_{phase}" in suite:
                raise SystemExit(f"suite {name}.token_points_{phase} is unreachable")
        # The shared ladder is dead configuration when every active phase overrides it.
        if "token_points" in suite and all(
            f"token_points_{phase}" in suite for phase in phases
        ):
            raise SystemExit(f"suite {name}.token_points is unreachable")
        # Called for validation only: _ladder exits on a malformed ladder; its result is unused.
        for phase in phases:
            _ladder(suite, phase)
    unused = sorted(set(registry) - referenced)
    if unused:
        raise SystemExit(f"unreferenced workloads: {unused}")
f"suite {name}.ep_degrees", int) + if any(degree <= 0 for degree in degrees): + raise SystemExit(f"suite {name}.ep_degrees must be positive") + for platform in platforms: + if not set(degrees).issubset(cap.PLATFORMS[platform]["ep_degrees"]): + raise SystemExit(f"suite {name}: invalid EP degree for {platform}") + for phase in {"decode", "prefill"} - set(phases): + if f"token_points_{phase}" in suite: + raise SystemExit(f"suite {name}.token_points_{phase} is unreachable") + if "token_points" in suite and all( + f"token_points_{phase}" in suite for phase in phases + ): + raise SystemExit(f"suite {name}.token_points is unreachable") + for phase in phases: + _ladder(suite, phase) + unused = sorted(set(registry) - referenced) + if unused: + raise SystemExit(f"unreferenced workloads: {unused}") + + +def _dims(workloads: dict[str, Any], name: str) -> tuple[int, int, int]: + config = _workload_registry(workloads)[name] + values = ( + config.get("hidden"), + config.get("topk"), + config.get("experts", config.get("routed_experts")), + ) + return values # type: ignore[return-value] + + +def _ladder(suite: dict[str, Any], phase: str) -> str: + points = suite.get(f"token_points_{phase}", suite.get("token_points")) + if points is None: + points = ep_harness.DECODE_LADDER if phase == "decode" else ep_harness.PREFILL_LADDER + if (not isinstance(points, list) or not points + or any(isinstance(point, bool) or not isinstance(point, int) or point <= 0 + for point in points) + or points != sorted(set(points))): + raise SystemExit(f"invalid {phase} token ladder: {points!r}") + return " ".join(map(str, points)) + + +def _v1_requested_ladder(case: dict[str, Any]) -> str: + """Bind extracted controls to the frozen v1 suite and workload catalog.""" + suite = V1_SUITE_CONTRACTS.get(case.get("suite")) + coordinate = (case.get("phase"), case.get("routing"), case.get("eplb")) + if ( + suite is None + or coordinate not in suite["coordinates"] + or case.get("required_publication") != 
suite["publication"] + or ( + case.get("workload"), case.get("hidden"), case.get("topk"), case.get("experts") + ) != V1_WORKLOAD + ): + raise MatrixError("case differs from the frozen v1 suite/workload catalog") + return " ".join(map(str, suite["ladders"][case["phase"]])) + + +def _expected_disposition( + sku: str, case: dict[str, Any] +) -> tuple[str, str | None, str | None]: + requested_ladder = _v1_requested_ladder(case) + ok, detail = cap.resolve( + sku, case["backend"], nodes=case["nodes"], + routing=case["routing"], eplb=case["eplb"], + ) + if ok: + if case["ladder"] != requested_ladder: + raise MatrixError("case ladder differs from the frozen v1 suite catalog") + return "runnable", None, None + if case["ladder"] != requested_ladder: + raise MatrixError("unsupported case ladder differs from the frozen v1 suite catalog") + return "unsupported", "backend-platform-unsupported", detail + + +def _case_id(sku: str, case: dict[str, Any]) -> str: + return identity.case_id(sku=sku, profile=V1_PROFILE, case=case) + + +def _semantic_points(sku: str, case: dict[str, Any]) -> list[str]: + execution = { + key: value for key, value in case.items() + if key not in {"canonical", "case_id", "ladder", "required_publication", "suite", "workload"} + } + return [ + json.dumps( + {"sku": sku, "tokens_per_rank": int(point), **execution}, + sort_keys=True, + separators=(",", ":"), + ) + for point in case["ladder"].split() + ] + + +def _select_backends(backend: str, backends: str) -> list[str]: + available = list(cap.SWEEP_BACKENDS) + if backend and backends: + raise SystemExit("--backend and --backends are mutually exclusive") + if backends: + names = available if backends == "all" else [ + value.strip() for value in backends.split(",") if value.strip() + ] + else: + names = [backend or "deepep"] + unknown = sorted(set(names) - set(available)) + if unknown: + raise SystemExit(f"unknown backend values {unknown}; have {available}") + if len(names) != len(set(names)): + raise 
SystemExit("backend selection contains duplicates") + return names + + +def resolve_matrix( + suites: str = "all", + backend: str = "", + backends: str = "", + only_sku: str = "", + min_nodes: int = 0, + max_nodes: int = 0, + max_cases: int = 128, +) -> dict[str, Any]: + """Resolve suite configuration into allocation-sized workflow shards.""" + if max_cases <= 0: + raise SystemExit("--max-cases must be positive") + if min_nodes < 0 or max_nodes < 0 or (min_nodes and max_nodes and min_nodes > max_nodes): + raise SystemExit("invalid node bounds") + if only_sku and only_sku not in cap.PLATFORMS: + raise SystemExit(f"unknown --only-sku {only_sku!r}; have {sorted(cap.PLATFORMS)}") + + workloads = _load("workloads.yaml") + suites_document = _load("suites.yaml") + validate_config_documents(suites_document, workloads) + registry = suites_document["suites"] + names = list(registry) if suites == "all" else [ + value.strip() for value in suites.split(",") if value.strip() + ] + if not names or len(names) != len(set(names)): + raise SystemExit("suite selection must be non-empty and unique") + unknown = sorted(set(names) - set(registry)) + if unknown: + raise SystemExit(f"unknown suites {unknown}; have {sorted(registry)}") + targets = _select_backends(backend, backends) + + shards: dict[tuple[str, str, int], list[dict[str, Any]]] = {} + requested_cases: list[dict[str, Any]] = [] + scheduled: set[str] = set() + for suite_name in names: + suite = registry[suite_name] + phases = suite["phases"] + routings = suite["routings"] + eplb_values = suite.get("eplb", [False]) + for platform_name in suite["platforms"]: + platform = cap.PLATFORMS[platform_name] + if only_sku and platform_name != only_sku: + continue + gpus_per_node = int(platform["gpus_per_node"]) + scale_up_domain = int(platform["scale_up_domain"]) + ep_degrees = suite.get("ep_degrees") or platform["ep_degrees"] + for workload, ep, phase, routing, eplb, target in itertools.product( + suite["workloads"], ep_degrees, phases, 
routings, eplb_values, targets + ): + if ep not in platform["ep_degrees"]: + raise SystemExit( + f"suite {suite_name}: {platform_name} EP{ep} is not registered" + ) + nodes_int = (ep + gpus_per_node - 1) // gpus_per_node + if min_nodes and nodes_int < min_nodes: + continue + if max_nodes and nodes_int > max_nodes: + continue + ok, capability_detail = cap.resolve( + platform_name, + target, + nodes=nodes_int, + routing=routing, + eplb=bool(eplb), + ) + hidden, topk, experts = _dims(workloads, workload) + nodes = nodes_int + + def add_case( + case_ladder: str, + disposition: str, + reason: str | None, + detail: str | None, + ) -> None: + case: dict[str, Any] = { + "suite": suite_name, + "workload": workload, + "required_publication": suite["required_publication"], + "backend": target, + "routing": routing, + "phase": phase, + "ep": ep, + "eplb": eplb, + "hidden": hidden, + "topk": topk, + "experts": experts, + "samples_per_point": ep_harness.TIMED_SAMPLES_PER_POINT, + "warmup_semantics": ep_harness.WARMUP_SEMANTICS, + "ladder": case_ladder, + "timing": EP_TIMING_PROFILE, + "canonical": True, + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + } + for signature in _semantic_points(platform_name, case): + if signature in scheduled: + raise SystemExit( + f"suite {suite_name}: duplicate semantic point for {platform_name}" + ) + scheduled.add(signature) + case["case_id"] = _case_id(platform_name, case) + requested_cases.append( + { + "sku": platform_name, + "case": case, + "disposition": disposition, + "reason": reason, + "detail": detail, + } + ) + if disposition == "runnable": + shards.setdefault((platform_name, target, nodes), []).append(case) + + requested_ladder = _ladder(suite, phase) + if not ok: + add_case( + requested_ladder, + "unsupported", + "backend-platform-unsupported", + capability_detail, + ) + continue + add_case(requested_ladder, "runnable", None, None) + + shards_by_sku: dict[str, list[dict[str, Any]]] = {} + for 
(sku, target, nodes), cases in sorted(shards.items()): + chunk_size = max_cases + for offset in range(0, len(cases), chunk_size): + chunk = cases[offset:offset + chunk_size] + part = offset // chunk_size + shard_id = f"{sku}-{target}-n{nodes}" + if len(cases) > chunk_size: + shard_id += f"-p{part}" + shards_by_sku.setdefault(sku, []).append({ + "id": shard_id, + "sku": sku, + "backend": target, + "launcher": cap.PLATFORMS[sku]["launcher"], + "gpus_per_node": cap.PLATFORMS[sku]["gpus_per_node"], + "scale_up_domain": cap.PLATFORMS[sku]["scale_up_domain"], + "nodes": nodes, + "n": len(chunk), + "case_ids": [case["case_id"] for case in chunk], + }) + include = [ + shards_by_sku[sku][round_index] + for round_index in range(max(map(len, shards_by_sku.values()), default=0)) + for sku in sorted(shards_by_sku) + if round_index < len(shards_by_sku[sku]) + ] + return { + "format": "collectivex.matrix.v1", + "schema_version": 1, + "requested_cases": requested_cases, + "include": include, + } + + +def _strict_json_load(path: Path) -> Any: + def reject_constant(value: str) -> None: + raise MatrixError(f"non-finite JSON number {value}") + + def reject_duplicates(pairs: list[tuple[str, Any]]) -> dict[str, Any]: + result: dict[str, Any] = {} + for key, value in pairs: + if key in result: + raise MatrixError(f"duplicate JSON key {key!r}") + result[key] = value + return result + + if not path.is_file(): + raise MatrixError(f"matrix does not exist: {path}") + if path.stat().st_size == 0: + raise MatrixError(f"matrix is empty: {path}") + try: + with path.open() as fh: + return json.load( + fh, parse_constant=reject_constant, object_pairs_hook=reject_duplicates + ) + except (OSError, json.JSONDecodeError) as exc: + raise MatrixError(f"matrix is not valid JSON: {exc}") from exc + + +def _positive_int(value: Any, field: str) -> int: + if type(value) is not int: + raise MatrixError(f"{field} must be a positive integer") + if value <= 0: + raise MatrixError(f"{field} must be a positive 
integer") + return value + + +def validate_shard_control( + shard: dict[str, Any], + *, + sku: str, + backend: str, + nodes: int, + require_runnable: bool = True, +) -> None: + """Validate one shard against the workflow cell that requested it.""" + if not isinstance(shard, dict): + raise MatrixError("shard must be a JSON object") + if sku not in cap.PLATFORMS or backend not in cap.SWEEP_BACKENDS: + raise MatrixError("shard platform/backend is not registered") + top_fields = {"schema_version", "id", "sku", "backend", "nodes", "n", "cases"} + if ( + set(shard) != top_fields + or type(shard.get("schema_version")) is not int + or shard["schema_version"] != 1 + ): + raise MatrixError("shard fields or schema version differ from v1 contract") + if not isinstance(shard.get("id"), str) or not IDENTIFIER.fullmatch(shard["id"]): + raise MatrixError("shard has invalid id") + for field, expected in (("sku", sku), ("backend", backend)): + if shard.get(field) != expected: + raise MatrixError( + f"shard {field} mismatch: expected {expected!r}, got {shard.get(field)!r}" + ) + if _positive_int(shard.get("nodes"), "shard.nodes") != nodes: + raise MatrixError( + f"shard nodes mismatch: expected {nodes}, got {shard.get('nodes')!r}" + ) + cases = shard.get("cases") + if not isinstance(cases, list) or not cases: + raise MatrixError("shard must contain at least one case") + if _positive_int(shard.get("n"), "shard.n") != len(cases): + raise MatrixError("shard.n does not match the number of cases") + seen: set[str] = set() + required = { + "case_id", "suite", "workload", "required_publication", "backend", "routing", + "phase", "ep", "eplb", "hidden", "topk", "experts", "samples_per_point", + "warmup_semantics", "ladder", "timing", "canonical", "nodes", + "gpus_per_node", "scale_up_domain", + } + for index, case in enumerate(cases): + if not isinstance(case, dict): + raise MatrixError(f"case {index} must be a JSON object") + fields = set(case) + if fields != required: + raise MatrixError( + 
f"case {index} fields differ from v1 contract: " + f"missing={sorted(required - fields)}, extra={sorted(fields - required)}" + ) + case_id = case["case_id"] + if not identity.is_typed_id(case_id, "case"): + raise MatrixError(f"case {index} has invalid case_id") + if case_id in seen: + raise MatrixError(f"duplicate case_id {case_id}") + seen.add(case_id) + for field in ("suite", "workload", "required_publication", "backend", "routing", "phase", + "warmup_semantics", "ladder", "timing"): + if not isinstance(case[field], str) or not case[field]: + raise MatrixError(f"case {index}.{field} must be a non-empty string") + for field in ("suite", "workload", "required_publication", "backend", "routing", "phase"): + if not IDENTIFIER.fullmatch(case[field]): + raise MatrixError(f"case {index}.{field} is not a safe identifier") + if case["required_publication"] not in {"official", "comparable-experimental"}: + raise MatrixError(f"case {index} has invalid publication requirement") + case_identity = {key: value for key, value in case.items() if key != "case_id"} + if case_id != _case_id(sku, case_identity): + raise MatrixError(f"case {index} case_id does not match its contents") + if case["backend"] != backend: + raise MatrixError(f"case {index} backend does not match shard") + if _positive_int(case["nodes"], f"case {index}.nodes") != nodes: + raise MatrixError(f"case {index} nodes does not match shard") + ep = _positive_int(case["ep"], f"case {index}.ep") + gpus_per_node = _positive_int( + case["gpus_per_node"], f"case {index}.gpus_per_node" + ) + platform = cap.PLATFORMS[sku] + if ( + gpus_per_node != platform["gpus_per_node"] + or case["scale_up_domain"] != platform["scale_up_domain"] + or ep not in platform["ep_degrees"] + ): + raise MatrixError(f"case {index} differs from the platform registry") + if ep != nodes * gpus_per_node: + raise MatrixError(f"case {index} ep does not equal nodes * gpus_per_node") + if case["samples_per_point"] != ep_harness.TIMED_SAMPLES_PER_POINT: 
+ raise MatrixError(f"case {index} violates fixed-512-v1") + if case["timing"] != EP_TIMING_PROFILE: + raise MatrixError(f"case {index} has invalid timing profile") + if case["warmup_semantics"] != ep_harness.WARMUP_SEMANTICS: + raise MatrixError(f"case {index} has invalid warmup semantics") + if case["phase"] not in {"decode", "prefill"}: + raise MatrixError(f"case {index} has invalid phase") + if case["routing"] not in {"uniform", "zipf"}: + raise MatrixError(f"case {index} has invalid routing") + if not isinstance(case["eplb"], bool) or (case["eplb"] and case["routing"] != "zipf"): + raise MatrixError(f"case {index} has invalid EPLB setting") + if not isinstance(case["canonical"], bool) or not case["canonical"]: + raise MatrixError(f"case {index} must use a canonical workload") + for field in ("ep", "nodes", "gpus_per_node", "hidden", "topk", "experts", + "samples_per_point", "scale_up_domain"): + if isinstance(case[field], bool) or not isinstance(case[field], int): + raise MatrixError(f"case {index}.{field} must be an integer") + _positive_int(case[field], f"case {index}.{field}") + if ep > _positive_int(case["scale_up_domain"], f"case {index}.scale_up_domain"): + raise MatrixError(f"case {index} exceeds its scale-up domain") + try: + ladder = [int(value) for value in case["ladder"].split()] + except (AttributeError, ValueError) as exc: + raise MatrixError(f"case {index} has invalid token ladder") from exc + if (not ladder or any(value <= 0 for value in ladder) + or ladder != sorted(set(ladder)) + or case["ladder"] != " ".join(map(str, ladder))): + raise MatrixError(f"case {index} has invalid token ladder") + if require_runnable: + disposition, reason, _ = _expected_disposition(sku, case) + if disposition != "runnable": + raise MatrixError(f"case {index} violates capability registry: {reason}") + else: + _v1_requested_ladder(case) + + +def validate_matrix_document(document: Any) -> dict[str, Any]: + """Validate the complete requested grid and its runnable shard 
partition.""" + if not isinstance(document, dict) or set(document) != { + "format", "schema_version", "requested_cases", "include" + }: + raise MatrixError("matrix fields differ from the v1 contract") + if ( + document["format"] != "collectivex.matrix.v1" + or type(document["schema_version"]) is not int + or document["schema_version"] != 1 + ): + raise MatrixError("matrix format/schema differs from v1") + requested = document["requested_cases"] + include = document["include"] + if not isinstance(requested, list) or not requested: + raise MatrixError("matrix.requested_cases must be non-empty") + if not isinstance(include, list): + raise MatrixError("matrix.include must be an array") + + cases_by_id: dict[str, dict[str, Any]] = {} + runnable_ids: set[str] = set() + semantic_points: set[str] = set() + for index, value in enumerate(requested): + path = f"matrix.requested_cases[{index}]" + if not isinstance(value, dict) or set(value) != { + "sku", "case", "disposition", "reason", "detail" + }: + raise MatrixError(f"{path} fields differ from the v1 contract") + sku = value["sku"] + case = value["case"] + disposition = value["disposition"] + if sku not in cap.PLATFORMS: + raise MatrixError(f"{path}.sku is unknown") + if disposition not in {"runnable", "unsupported"}: + raise MatrixError(f"{path}.disposition is invalid") + if disposition == "runnable": + if value["reason"] is not None or value["detail"] is not None: + raise MatrixError(f"{path} runnable cases cannot have a reason") + else: + if ( + not isinstance(value["reason"], str) + or not IDENTIFIER.fullmatch(value["reason"]) + or not isinstance(value["detail"], str) + or not value["detail"] + ): + raise MatrixError(f"{path} unsupported cases need a public reason and detail") + if not isinstance(case, dict): + raise MatrixError(f"{path}.case must be an object") + backend = case.get("backend") + nodes = case.get("nodes") + if not isinstance(backend, str) or type(nodes) is not int: + raise MatrixError(f"{path}.case 
backend/nodes are invalid") + validate_shard_control( + { + "schema_version": 1, + "id": "requested-case", + "sku": sku, + "backend": backend, + "nodes": nodes, + "n": 1, + "cases": [case], + }, + sku=sku, + backend=backend, + nodes=nodes, + require_runnable=disposition == "runnable", + ) + case_id = case["case_id"] + if case_id in cases_by_id: + raise MatrixError(f"duplicate requested case_id {case_id}") + for signature in _semantic_points(sku, case): + if signature in semantic_points: + raise MatrixError(f"{path} duplicates a semantic token point") + semantic_points.add(signature) + cases_by_id[case_id] = value + expected = _expected_disposition(sku, case) + if (disposition, value["reason"], value["detail"]) != expected: + raise MatrixError(f"{path} disposition differs from the frozen v1 catalog") + if disposition == "runnable": + runnable_ids.add(case_id) + + shard_ids: set[str] = set() + assigned: list[str] = [] + for index, shard in enumerate(include): + path = f"matrix.include[{index}]" + expected = { + "id", "sku", "backend", "launcher", "gpus_per_node", "scale_up_domain", + "nodes", "n", "case_ids", + } + if not isinstance(shard, dict) or set(shard) != expected: + raise MatrixError(f"{path} fields differ from the v1 contract") + shard_id = shard["id"] + if not isinstance(shard_id, str) or not IDENTIFIER.fullmatch(shard_id): + raise MatrixError(f"{path}.id is invalid") + if shard_id in shard_ids: + raise MatrixError(f"duplicate shard id {shard_id}") + shard_ids.add(shard_id) + sku = shard["sku"] + if sku not in cap.PLATFORMS: + raise MatrixError(f"{path}.sku is unknown") + platform = cap.PLATFORMS[sku] + for field in ("launcher", "gpus_per_node", "scale_up_domain"): + if shard[field] != platform[field]: + raise MatrixError(f"{path}.{field} differs from the platform registry") + case_ids = shard["case_ids"] + if not isinstance(case_ids, list) or not case_ids or len(case_ids) != len(set(case_ids)): + raise MatrixError(f"{path}.case_ids must be a non-empty 
unique array") + if _positive_int(shard["n"], f"{path}.n") != len(case_ids): + raise MatrixError(f"{path}.n differs from case_ids") + nodes = _positive_int(shard["nodes"], f"{path}.nodes") + for case_id in case_ids: + wrapper = cases_by_id.get(case_id) + if wrapper is None or wrapper["disposition"] != "runnable": + raise MatrixError(f"{path} references a missing or unsupported case") + case = wrapper["case"] + if ( + wrapper["sku"] != sku + or case["backend"] != shard["backend"] + or case["nodes"] != nodes + ): + raise MatrixError(f"{path} case does not match shard coordinates") + assigned.append(case_id) + if len(assigned) != len(set(assigned)): + raise MatrixError("a runnable case is assigned to more than one shard") + if set(assigned) != runnable_ids: + raise MatrixError("runnable requested cases and shard assignments differ") + return document + + +def extract_shard( + matrix_path: str | os.PathLike[str], + shard_id: str, + output_path: str | os.PathLike[str], + *, + sku: str, + backend: str, + nodes: int, +) -> dict[str, Any]: + """Extract one strictly matched shard control file, writing it atomically.""" + document = validate_matrix_document(_strict_json_load(Path(matrix_path))) + include = document["include"] + matches = [item for item in include if isinstance(item, dict) and item.get("id") == shard_id] + if len(matches) != 1: + raise MatrixError(f"expected exactly one shard {shard_id!r}, found {len(matches)}") + source = matches[0] + requested = { + item["case"]["case_id"]: item + for item in document["requested_cases"] + } + cases = [requested[case_id]["case"] for case_id in source["case_ids"]] + control = { + "schema_version": 1, + "id": source.get("id"), + "sku": source.get("sku"), + "backend": source.get("backend"), + "nodes": source.get("nodes"), + "n": source.get("n"), + "cases": cases, + } + validate_shard_control(control, sku=sku, backend=backend, nodes=nodes) + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + 
temporary = output.with_name(f".{output.name}.tmp-{os.getpid()}") + try: + with temporary.open("w") as fh: + json.dump(control, fh, sort_keys=True, separators=(",", ":")) + fh.write("\n") + os.replace(temporary, output) + finally: + temporary.unlink(missing_ok=True) + return control + + +def emit_unsupported( + matrix_path: str | os.PathLike[str], output_dir: str | os.PathLike[str] +) -> list[Path]: + """Materialize one strict terminal outcome for each unsupported requested case.""" + source = Path(matrix_path) + document = validate_matrix_document(_strict_json_load(source)) + control_sha256 = hashlib.sha256(source.read_bytes()).hexdigest() + generated_at = dt.datetime.now(dt.timezone.utc).isoformat() + git_run = { + "run_id": os.environ.get("GITHUB_RUN_ID"), + "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"), + "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "job": os.environ.get("GITHUB_JOB"), + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + } + allocation_factors = { + "artifact": git_run["artifact"], + "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID"), + "job": git_run["job"], + "repo": git_run["repo"], + "run_attempt": git_run["run_attempt"], + "run_id": git_run["run_id"], + "runner": "capability-resolver", + "source_sha": git_run["source_sha"], + } + destination = Path(output_dir) + destination.mkdir(parents=True, exist_ok=True) + written: list[Path] = [] + for wrapper in document["requested_cases"]: + if wrapper["disposition"] != "unsupported": + continue + scheduled = wrapper["case"] + case = {key: value for key, value in scheduled.items() if key != "case_id"} + case_factors = {"case": case, "profile": V1_PROFILE, "sku": wrapper["sku"]} + case_id = identity.digest("case", case_factors) + if case_id != scheduled["case_id"]: + raise MatrixError(f"unsupported case identity 
differs for {scheduled['case_id']}") + attempt_ordinal = 1 + record = contracts.make_terminal_document( + allocation_factors=allocation_factors, + attempt_ordinal=attempt_ordinal, + case=case, + case_factors=case_factors, + control_sha256=control_sha256, + failure_mode="capability", + generated_at=generated_at, + git_run=git_run, + reason=wrapper["reason"], + return_code=5, + source="matrix-capability-resolver", + status="unsupported", + expected_case_id=case_id, + ) + path = destination / f"unsupported_{case_id}.json" + temporary = path.with_name(f".{path.name}.tmp-{os.getpid()}") + try: + with temporary.open("x") as handle: + json.dump(record, handle, allow_nan=False, sort_keys=True, separators=(",", ":")) + handle.write("\n") + handle.flush() + os.fsync(handle.fileno()) + os.replace(temporary, path) + finally: + temporary.unlink(missing_ok=True) + written.append(path) + return written + + +def main() -> int: + parser = argparse.ArgumentParser(description="CollectiveX v1 matrix resolver") + parser.add_argument("--suites", default="all", help="'all' or comma-list of suites") + parser.add_argument("--backend", default="", help="select one EP backend") + parser.add_argument("--backends", default="", help="'all' or comma-list of EP backends") + parser.add_argument("--only-sku", default="") + parser.add_argument("--min-nodes", type=int, default=0) + parser.add_argument("--max-nodes", type=int, default=0) + parser.add_argument("--max-cases", type=int, default=128) + parser.add_argument("--extract-from", default="", metavar="MATRIX") + parser.add_argument("--validate-control", default="", metavar="SHARD") + parser.add_argument("--emit-unsupported-from", default="", metavar="MATRIX") + parser.add_argument("--out-dir", default="") + parser.add_argument("--shard-id", default="") + parser.add_argument("--expect-sku", default="") + parser.add_argument("--expect-backend", default="") + parser.add_argument("--expect-nodes", type=int, default=0) + parser.add_argument("--out", 
default="") + args = parser.parse_args() + + if args.emit_unsupported_from: + if not args.out_dir: + parser.error("unsupported outcome emission requires --out-dir") + try: + written = emit_unsupported(args.emit_unsupported_from, args.out_dir) + except MatrixError as exc: + parser.error(str(exc)) + print(f"emitted {len(written)} unsupported terminal outcomes", file=sys.stderr) + return 0 + + if args.validate_control: + if not all((args.expect_sku, args.expect_backend, args.expect_nodes)): + parser.error( + "control validation requires --expect-sku, --expect-backend, and --expect-nodes" + ) + try: + control = _strict_json_load(Path(args.validate_control)) + validate_shard_control( + control, + sku=args.expect_sku, + backend=args.expect_backend, + nodes=args.expect_nodes, + ) + except MatrixError as exc: + parser.error(str(exc)) + print(f"validated {control.get('id')}: {control['n']} cases", file=sys.stderr) + return 0 + + if args.extract_from: + if not all((args.shard_id, args.expect_sku, args.expect_backend, args.expect_nodes, args.out)): + parser.error( + "shard extraction requires --shard-id, --expect-sku, --expect-backend, " + "--expect-nodes, and --out" + ) + try: + control = extract_shard( + args.extract_from, + args.shard_id, + args.out, + sku=args.expect_sku, + backend=args.expect_backend, + nodes=args.expect_nodes, + ) + except MatrixError as exc: + parser.error(str(exc)) + print(f"extracted {control['id']}: {control['n']} cases", file=sys.stderr) + print(json.dumps(control, separators=(",", ":"))) + return 0 + + matrix = resolve_matrix( + suites=args.suites, + backend=args.backend, + backends=args.backends, + only_sku=args.only_sku, + min_nodes=args.min_nodes, + max_nodes=args.max_nodes, + max_cases=args.max_cases, + ) + try: + validate_matrix_document(matrix) + except MatrixError as exc: + parser.error(str(exc)) + if args.out: + with open(args.out, "w") as fh: + json.dump(matrix, fh, sort_keys=True, separators=(",", ":")) + fh.write("\n") + runnable = sum( 
+ item["disposition"] == "runnable" for item in matrix["requested_cases"] + ) + unsupported = len(matrix["requested_cases"]) - runnable + print( + f"resolved {len(matrix['include'])} shard-cells, " + f"{runnable} runnable and {unsupported} unsupported cases", + file=sys.stderr, + ) + print(json.dumps(matrix)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py new file mode 100644 index 0000000000..3109e7c771 --- /dev/null +++ b/experimental/CollectiveX/tests/ep_deepep.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +"""CollectiveX DeepEP adapter for the v1 BF16 normal-mode workload.""" +from __future__ import annotations + +import inspect +import os +import sys +import types + +import torch +import torch.distributed as dist +import contracts + +try: + import deep_ep + from deep_ep import Buffer # type: ignore +except Exception as exc: # pragma: no cover - requires the benchmark image + print(f"ERROR: deep_ep import failed: {exc!r}", file=sys.stderr) + raise + + +def _deepep_version() -> str: + try: + import importlib.metadata as metadata + + return metadata.version("deep_ep") + except Exception: + return getattr(deep_ep, "__version__", "unknown") + + +def _mnnvl_buffer_configuration() -> tuple[dict[str, bool], str]: + """Resolve the explicit DeepEP MNNVL API contract.""" + requested_value = os.environ.get("CX_ALLOW_MNNVL") + if requested_value not in {None, "", "0", "1"}: + raise RuntimeError("CX_ALLOW_MNNVL must be unset, 0, or 1") + requested = requested_value == "1" + if not requested: + return contracts.resolve_deepep_mnnvl( + requested=False, signature_parameters=(), + deepep_commit=os.environ.get("DEEPEP_COMMIT"), + ) + try: + parameters = inspect.signature(Buffer.__init__).parameters + except (TypeError, ValueError) as exc: + raise RuntimeError("cannot inspect DeepEP Buffer MNNVL API") from exc + try: + return contracts.resolve_deepep_mnnvl( 
+ requested=True, signature_parameters=parameters, + deepep_commit=os.environ.get("DEEPEP_COMMIT"), + ) + except contracts.ContractError as exc: + raise RuntimeError(str(exc)) from exc + + +class DeepEPBackend: + name = "deepep" + combine_needs_redispatch = False + # DeepEP reduces activations and top-k weights independently. The activation + # tensor must therefore carry the complete local weighted expert sum. + combine_weight_semantics = "unweighted-rank-sum" + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.mode = "normal" + + self.group = dist.group.WORLD + device_sms = torch.cuda.get_device_properties(device).multi_processor_count + num_nvl_bytes = 4 * 1024 * 1024 * 1024 + mnnvl_kwargs, mnnvl_comm = _mnnvl_buffer_configuration() + self.buffer = Buffer(self.group, num_nvl_bytes, 0, **mnnvl_kwargs) + + num_sms = int(getattr(Buffer, "num_sms", args.num_sms)) + try: + Buffer.set_num_sms(num_sms) + except Exception as exc: # pragma: no cover - version dependent + raise RuntimeError( + f"DeepEP did not apply requested num_sms={num_sms}: {exc!r}" + ) from exc + applied_num_sms = int(getattr(Buffer, "num_sms", num_sms)) + if applied_num_sms != num_sms: + raise RuntimeError( + f"DeepEP num_sms mismatch: requested={num_sms} applied={applied_num_sms}" + ) + + version = _deepep_version() + self.backend_provenance = { + "deepep_version": version, + "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{version}", + "backend_lineage": "deepep-v1", + "mode": "normal", + "dispatch_dtype": "bf16", + "combine_dtype": "bf16", + "resource_mode": "tuned", + "requested_num_sms": num_sms, + "num_sms": applied_num_sms, + "device_sms": device_sms, + "sm_fraction": applied_num_sms / device_sms, + "tuned_source": "deepep-default-num_sms", + "num_nvl_bytes": num_nvl_bytes, + "allow_mnnvl": bool(mnnvl_kwargs), + "mnnvl_comm": mnnvl_comm, + } + + def buffer_cap(self, 
args): + return None + + def make_problem(self, T, idx, weights, x): + return types.SimpleNamespace( + T=T, + x=x, + topk_idx=idx.to(torch.int64), + topk_weights=weights.to(torch.float32), + ) + + def dispatch(self, p): + ( + num_tokens_per_rank, + _, + num_tokens_per_expert, + is_token_in_rank, + _, + ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts) + recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch( + p.x, + topk_idx=p.topk_idx, + topk_weights=p.topk_weights, + num_tokens_per_rank=num_tokens_per_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=num_tokens_per_expert, + ) + return types.SimpleNamespace( + recv_x=recv_x, + recv_topk_idx=recv_topk_idx, + recv_topk_weights=recv_topk_weights, + recv_counts=recv_counts, + handle=handle, + ) + + def stage(self, p, h): + h.combine_input = h.recv_x + + def combine(self, p, h): + combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle) + return combined_x + + def inspect_dispatch(self, p, h): + valid = h.recv_topk_idx >= 0 + expert_ids = torch.where( + valid, + h.recv_topk_idx + self.rank * (self.args.experts // self.world_size), + h.recv_topk_idx, + ) + return types.SimpleNamespace( + payload=h.recv_x, + expert_ids=expert_ids, + weights=h.recv_topk_weights.masked_fill(~valid, 0), + local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64), + ordering_contract="source-rank-major-stable-v1", + ) + + def combine_transformed(self, p, h, transformed): + combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle) + return combined + + def recv_tokens(self, h): + return int(h.recv_x.shape[0]) + + def finalize(self, rc): + try: + dist.barrier() + dist.destroy_process_group() + except Exception: + pass + return rc diff --git a/experimental/CollectiveX/tests/ep_deepep_hybrid.py b/experimental/CollectiveX/tests/ep_deepep_hybrid.py new file mode 100644 index 0000000000..6514e93c51 --- /dev/null +++ 
b/experimental/CollectiveX/tests/ep_deepep_hybrid.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +"""CollectiveX EP backend adapter — DeepEP `hybrid-ep` branch (NVIDIA TMA-based HybridEPBuffer). + +The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA + +warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer` +(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72 +rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter drives the single- +NVLink-domain path (`num_of_hybrid_ep_ranks_per_nvlink_domain == world_size`, <=8 ranks). That domain +is ONE node on x86, while GB200/GB300 MNNVL can expose multiple trays as one NVLink domain. The v1 +matrix therefore exercises the same path at EP8 across two GB trays, subject to the normal three-run +qualification gate. The container build is done by runtime/run_in_container.sh +`cx_build_deepep_hybrid` (CUDA-13 CCCL include path, without the V2 NVSHMEM overlay). + +API (pinned on B300, branch e0a5b1d): + HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...) + .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle) + .combine(hidden, handle=) -> [T, hidden] + +CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is +reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8 +round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses +the same per-rank-sum combine contract (no gate re-weight). BF16 tolerance is 5e-2. + +STATUS: bf16 / normal / layout-and-dispatch-v1. The v1 scope is one detected NVLink domain at up to +eight ranks; fp8 and the cross-RACK (>1 NVL72, IBGDA/RDMA) path remain out of scope. 
def _deepep_hybrid_version() -> str:
    """Return the DeepEP hybrid-ep build identity recorded in provenance.

    ``DEEPEP_COMMIT`` wins when it is set to a non-empty value. Otherwise the
    function falls back to ``deep_ep.__version__`` and, if the module has no
    such attribute, to the literal ``"hybrid-ep"``.

    An empty ``DEEPEP_COMMIT`` is treated as unset so that provenance never
    records a blank commit identity.
    """
    return os.environ.get("DEEPEP_COMMIT") or getattr(deep_ep, "__version__", "hybrid-ep")
def _sha256_with_size(path: Path) -> tuple[str, int]:
    """Hash the file at *path* and count its bytes in a single pass.

    Returns:
        ``(hex SHA-256 digest, total byte count)``. The file is read in
        1 MiB chunks, so it is never held in memory as a whole.
    """
    hasher = hashlib.sha256()
    total_bytes = 0
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if block == b"":
                break
            hasher.update(block)
            total_bytes += len(block)
    return hasher.hexdigest(), total_bytes
class DeepEPHybridBackend:
    """CollectiveX EP adapter around ``deep_ep.HybridEPBuffer`` (single NVLink domain)."""

    name = "deepep-hybrid"
    # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed
    # before a timed combine); the harness times dispatch and combine separately (like ep_deepep).
    combine_needs_redispatch = False
    combine_weight_semantics = "unweighted-rank-sum"

    def __init__(self, args, rank, world_size, local_rank, device):
        """Build the HybridEP buffer with a private JIT cache and record provenance.

        Raises:
            RuntimeError: if buffer construction fails, if the buffer's
                ``local_rank`` differs from ``rank``, or if a helper's
                cross-rank equality check fails.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = "normal"
        self.group = dist.group.WORLD
        self.tolerance = 5e-2
        self.top_k = int(args.topk)
        self.num_experts = int(args.experts)
        self.hidden = int(args.hidden)
        # Never below 1, even when there are fewer experts than ranks.
        self.local_experts = max(1, self.num_experts // world_size)
        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
        self.max_tokens = 4096
        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
        ver = _deepep_hybrid_version()
        loaded_libraries = _hybrid_build_evidence()
        # Collective call: every rank must reach this point.
        _require_cross_rank_equal(loaded_libraries, "loaded extension identities")

        # HybridEP's compiler uses a process-specific child of HYBRID_EP_CACHE_DIR. Give every
        # rank a fresh private base so stale kernels cannot enter this attempt's evidence.
        self._previous_jit_cache_dir = os.environ.get("HYBRID_EP_CACHE_DIR")
        self._jit_cache_dir = tempfile.mkdtemp(prefix=f"collectivex-hybrid-r{rank}-")
        os.environ["HYBRID_EP_CACHE_DIR"] = self._jit_cache_dir
        self._jit_root = (
            Path(self._jit_cache_dir) / ".deepep" / "hybrid_ep" / "jit"
            / f"proc-{os.getpid()}"
        )
        self._realized_config = None
        self._deferred_semantic_snapshot = None
        self._deferred_jit_diagnostics = None

        # Construct the HybridEPBuffer treating all ranks as ONE NVLink domain (default
        # num_of_hybrid_ep_ranks_per_nvlink_domain == world_size). On x86 that domain is one node; on a
        # GB200/GB300 NVL72 the MNNVL fabric makes 2 trays one NVLink domain, so EP8 (8 ranks) is covered
        # by this same path (validated transport=mnnvl). SM counts default.
        try:
            self.buffer = HybridEPBuffer(
                self.group, hidden_dim=self.hidden,
                max_num_of_tokens_per_rank=self.max_tokens,
                num_local_experts=self.local_experts, use_fp8=False)
        except Exception as exc:
            # Undo the cache-dir side effects before surfacing the failure.
            shutil.rmtree(self._jit_cache_dir, ignore_errors=True)
            if self._previous_jit_cache_dir is None:
                os.environ.pop("HYBRID_EP_CACHE_DIR", None)
            else:
                os.environ["HYBRID_EP_CACHE_DIR"] = self._previous_jit_cache_dir
            raise RuntimeError(
                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
        update_template_config = self.buffer.update_template_config

        def tracked_update_template_config(*call_args, **call_kwargs):
            """Delegate to the buffer's hook and remember the realized config.

            Raises RuntimeError if a later call yields a different realized config.
            """
            config = update_template_config(*call_args, **call_kwargs)
            realized = _hybrid_realized_config(config)
            if self._realized_config is not None and realized != self._realized_config:
                raise RuntimeError("DeepEP Hybrid realized autotune config changed within one case")
            self._realized_config = realized
            return config

        # Replace the hook on this buffer instance with the tracking wrapper.
        self.buffer.update_template_config = tracked_update_template_config
        self.domain_rank = int(self.buffer.local_rank)
        if self.domain_rank != rank:
            raise RuntimeError(
                "HybridEPBuffer rank within the single NVLink domain differs from global rank: "
                f"domain={self.domain_rank} global={rank}"
            )
        if rank == 0:
            print(f"[deepep-hybrid] HybridEPBuffer constructed (single NVLink domain, world={world_size}, "
                  f"local_experts={self.local_experts}, hidden={self.hidden})", file=sys.stderr)

        # realized_config / jit_* start empty and are filled by capture_deferred_provenance().
        self.backend_provenance = {
            "deepep_commit": ver, "branch": "hybrid-ep",
            "deepep_tree": os.environ.get("DEEPEP_TREE"),
            "backend_lineage": "deepep-hybrid",
            "loaded_libraries": loaded_libraries,
            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
            "mode": "normal", "transport": "nvlink-domain",  # one node (x86) or one NVL72 MNNVL domain (gb300 EP8)
            "resource_mode": "tuned",
            "num_sms": None, "device_sms": dev_sms,
            "tuned_source": "deepep-hybrid-configurer-autotune-v1",
            "realized_config": None, "jit_kernel_keys": [], "jit_shared_objects": [],
            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
            "num_experts": self.num_experts, "local_experts": self.local_experts,
            "routing_factor": "ranks",
        }

    def buffer_cap(self, args):
        """Return the fixed per-rank token cap the buffer was built with."""
        return self.max_tokens

    def make_problem(self, T, idx, weights, x):
        """Bundle one problem, casting expert ids to int64 and weights to float32."""
        return types.SimpleNamespace(
            T=int(T), x=x,
            topk_idx=idx.to(torch.int64),
            topk_weights=weights.to(torch.float32),
        )

    def dispatch(self, p):
        """Run ``HybridEPBuffer.dispatch`` and wrap its four outputs."""
        recv, recv_probs, _scales, handle = self.buffer.dispatch(
            p.x,
            topk_idx=p.topk_idx,
            topk_weights=p.topk_weights,
            num_of_experts=self.num_experts,
        )
        return types.SimpleNamespace(
            recv=recv,
            recv_payload=recv,
            recv_probs=recv_probs,
            handle=handle,
            combine_input=None,
        )

    def stage(self, p, h):
        """Use the received payload unchanged as the combine input."""
        # Identity expert: the recv hidden IS the "expert output". combine reduces it per source token.
        h.combine_input = h.recv_payload
        return None

    def combine(self, p, h):
        """Run ``HybridEPBuffer.combine``; unwrap the first element of a tuple/list result."""
        # combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks").
        comb = self.buffer.combine(h.combine_input, handle=h.handle)
        return comb[0] if isinstance(comb, (tuple, list)) else comb

    def capture_deferred_provenance(self):
        """Record the realized config and JIT artifacts once kernels exist.

        Collective: synchronizes, barriers and gathers across all ranks.

        Raises:
            RuntimeError: if no config was realized, if the config or kernel
                keys differ across ranks, or if they changed since a previous call.
        """
        torch.cuda.synchronize()
        dist.barrier()
        if self._realized_config is None:
            raise RuntimeError("DeepEP Hybrid autotune config was not materialized")
        local_artifacts = _hybrid_jit_evidence(self._jit_root)
        semantic = {
            "jit_kernel_keys": [item["kernel_key"] for item in local_artifacts],
            "realized_config": dict(self._realized_config),
        }
        # NVCC may embed each rank's timestamped source basename in its ELF, so raw .so hashes are
        # diagnostics rather than a cross-rank identity. Stable kernel keys encode every codegen
        # input, including HybridEpConfigInstance fields that the Python binding does not expose.
        _require_cross_rank_equal(semantic, "realized config/JIT kernel keys")
        gathered_artifacts = [None] * dist.get_world_size()
        dist.all_gather_object(gathered_artifacts, local_artifacts)
        diagnostics = []
        # Kernel keys were just verified equal on every rank, so the same index
        # selects the matching artifact in each rank's gathered list.
        for artifact_index, kernel_key in enumerate(semantic["jit_kernel_keys"]):
            diagnostics.append({
                "kernel_key": kernel_key,
                "rank_artifacts": [
                    {
                        "bytes": rank_artifacts[artifact_index]["bytes"],
                        "rank": artifact_rank,
                        "sha256": rank_artifacts[artifact_index]["sha256"],
                    }
                    for artifact_rank, rank_artifacts in enumerate(gathered_artifacts)
                ],
            })
        if self._deferred_semantic_snapshot is not None and semantic != self._deferred_semantic_snapshot:
            raise RuntimeError("DeepEP Hybrid config/JIT kernel set changed after measurement")
        if self._deferred_jit_diagnostics is not None and diagnostics != self._deferred_jit_diagnostics:
            raise RuntimeError("DeepEP Hybrid rank-local JIT artifacts changed after measurement")
        self._deferred_semantic_snapshot = semantic
        self._deferred_jit_diagnostics = diagnostics
        self.backend_provenance.update(semantic)
        self.backend_provenance["jit_shared_objects"] = diagnostics

    def inspect_dispatch(self, p, h):
        """Return a normalized per-token view of what dispatch delivered to this rank.

        Raises:
            RuntimeError: if ``recv_probs`` has too few columns to cover this
                rank's expert range.
        """
        count = self.recv_tokens(h)
        # NOTE(review): handle[4] is used as a (token, local expert) routing map —
        # confirm the handle layout against the pinned hybrid-ep build.
        routing_map = h.handle[4][:count]
        rows, local_expert_ids = routing_map.nonzero(as_tuple=True)
        # Running count of routed experts per row gives each hit its output slot.
        positions = routing_map.to(torch.int64).cumsum(dim=1)[rows, local_expert_ids] - 1
        probability_columns = self.domain_rank * self.local_experts + local_expert_ids
        if h.recv_probs.shape[1] < (self.domain_rank + 1) * self.local_experts:
            raise RuntimeError("HybridEPBuffer probability tensor omits this NVLink-domain rank")
        # Unused slots keep expert id -1 and weight 0.
        expert_ids = torch.full(
            (count, self.top_k), -1, dtype=torch.int64, device=self.device
        )
        weights = torch.zeros(
            (count, self.top_k), dtype=torch.float32, device=self.device
        )
        expert_ids[rows, positions] = local_expert_ids + self.rank * self.local_experts
        weights[rows, positions] = h.recv_probs[:count][rows, probability_columns]
        return types.SimpleNamespace(
            payload=h.recv_payload[:count],
            expert_ids=expert_ids,
            weights=weights,
            local_expert_counts=routing_map.sum(dim=0, dtype=torch.int64),
            ordering_contract="global-source-filter-stable-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine a caller-supplied tensor after casting it to the received dtype."""
        combined = self.buffer.combine(
            transformed.to(h.recv_payload.dtype), handle=h.handle
        )
        return combined[0] if isinstance(combined, (tuple, list)) else combined

    def recv_tokens(self, h):
        """Return element 3 of the dispatch handle as an int (forces a device sync via ``.item()``)."""
        # NOTE(review): presumably the realized received-token count — confirm handle layout.
        return int(h.handle[3].item())

    def finalize(self, rc):
        """Best-effort teardown, then remove the private JIT cache and restore the env var."""
        try:
            dist.barrier()
            dist.destroy_process_group()
        except Exception:
            # Teardown failures are deliberately ignored so they cannot mask ``rc``.
            pass
        shutil.rmtree(self._jit_cache_dir, ignore_errors=True)
        if self._previous_jit_cache_dir is None:
            os.environ.pop("HYBRID_EP_CACHE_DIR", None)
        else:
            os.environ["HYBRID_EP_CACHE_DIR"] = self._previous_jit_cache_dir
        return rc
def _sha256(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*, streamed in 1 MiB chunks."""
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if block == b"":
                break
            hasher.update(block)
    return hasher.hexdigest()
def _loaded_nccl_version() -> str:
    """Query the version of the NCCL library mapped into this process.

    Exactly one loaded path whose basename contains ``libnccl.so`` is
    required. Its ``ncclGetVersion`` entry point is called through ctypes
    and the integer result is formatted by the harness.

    Raises:
        RuntimeError: if zero or several NCCL libraries are loaded, or if
            the version query returns a non-zero status.
    """
    nccl_paths = []
    for candidate in _loaded_library_paths():
        if "libnccl.so" in os.path.basename(candidate):
            nccl_paths.append(candidate)
    if len(nccl_paths) != 1:
        raise RuntimeError("expected exactly one loaded NCCL library")
    raw_version = ctypes.c_int()
    library = ctypes.CDLL(nccl_paths[0])
    status = library.ncclGetVersion(ctypes.byref(raw_version))
    if status != 0:
        raise RuntimeError("loaded NCCL version query failed")
    return ep_harness.format_collective_version(raw_version.value)
def _jit_artifact_evidence() -> list[dict[str, str]]:
    """Hash every DeepEP V2 JIT kernel found under ``$EP_JIT_CACHE_DIR/cache``.

    Each cache entry must be a real directory named
    ``kernel.<name>.<32 hex chars>`` that holds exactly ``kernel.cu``,
    ``kernel.cubin`` and ``kernel.sass`` as non-empty regular files.

    Returns:
        One record per kernel directory (cache key plus the three content
        hashes), sorted by cache key.

    Raises:
        RuntimeError: if the cache is missing, malformed or incomplete, or if
            the kernel set differs from ``DEEPEP_V2_JIT_KERNELS``.
    """
    cache_root = Path(os.environ["EP_JIT_CACHE_DIR"]) / "cache"
    if cache_root.is_symlink() or not cache_root.is_dir():
        raise RuntimeError("DeepEP V2 produced no JIT cache evidence")

    expected_files = {"kernel.cu", "kernel.cubin", "kernel.sass"}
    records = []
    seen_kernels = set()
    for entry in sorted(cache_root.iterdir(), key=lambda item: item.name):
        parsed = re.fullmatch(r"kernel\.([A-Za-z0-9_+-]+)\.([0-9a-f]{32})", entry.name)
        if entry.is_symlink() or not entry.is_dir() or parsed is None:
            raise RuntimeError("DeepEP V2 JIT cache contains an invalid entry")
        present_files = {child.name for child in entry.iterdir()}
        if present_files != expected_files:
            raise RuntimeError("DeepEP V2 JIT kernel evidence is incomplete")

        source_path = entry / "kernel.cu"
        cubin_path = entry / "kernel.cubin"
        sass_path = entry / "kernel.sass"
        evidence_files = (source_path, cubin_path, sass_path)
        for candidate in evidence_files:
            if candidate.is_symlink() or not candidate.is_file():
                raise RuntimeError("DeepEP V2 JIT evidence is not a regular file")
        for candidate in evidence_files:
            if candidate.stat().st_size <= 0:
                raise RuntimeError("DeepEP V2 JIT evidence is empty")

        seen_kernels.add(parsed.group(1))
        source_digest = _sha256(str(source_path))
        sass_digest = _sha256(str(sass_path))
        cubin_digest = _sha256(str(cubin_path))
        records.append({
            "cache_key": entry.name,
            "source_sha256": source_digest,
            "sass_sha256": sass_digest,
            "cubin_sha256": cubin_digest,
        })

    # The length check also rejects two cache entries for the same kernel name.
    count_matches = len(records) == len(DEEPEP_V2_JIT_KERNELS)
    if not count_matches or seen_kernels != DEEPEP_V2_JIT_KERNELS:
        raise RuntimeError("DeepEP V2 JIT kernel set differs from the v1 contract")
    records.sort(key=lambda item: item["cache_key"])
    return records
def _configure_gin_mode(args, world_size: int) -> bool:
    """Decide whether the run spans more than one scale-up domain and set the env to match.

    The domain size is the first truthy value among ``args.scale_up_domain``,
    ``args.gpus_per_node`` and ``world_size``.

    Side effects:
        ``EP_DISABLE_GIN`` is removed from the environment when the world is
        larger than the domain, and set to ``"1"`` otherwise.

    Returns:
        ``True`` when ``world_size`` exceeds the scale-up domain size.
    """
    domain_hint = getattr(args, "scale_up_domain", None)
    if not domain_hint:
        domain_hint = getattr(args, "gpus_per_node", None)
    if not domain_hint:
        domain_hint = world_size
    spans_multiple_domains = world_size > int(domain_hint)
    if not spans_multiple_domains:
        os.environ["EP_DISABLE_GIN"] = "1"
        return spans_multiple_domains
    os.environ.pop("EP_DISABLE_GIN", None)
    return spans_multiple_domains
class DeepEPV2Backend:
    """CollectiveX EP adapter around DeepEP V2's ``deep_ep.ElasticBuffer``."""

    name = "deepep-v2"
    combine_needs_redispatch = False
    combine_weight_semantics = "unweighted-rank-sum"

    def __init__(self, args, rank, world_size, local_rank, device):
        """Validate the runtime, build the elastic buffer and record provenance.

        Side effects:
            ``_configure_gin_mode`` sets or clears ``EP_DISABLE_GIN``, and
            ``EP_JIT_CACHE_DIR`` is rewritten to a per-configuration child
            directory after the buffer exists.

        Raises:
            RuntimeError: from the runtime validation, the cross-rank equality
                checks, or when a no-GIN run is not one realized LSA domain.
            KeyError: if ``EP_JIT_CACHE_DIR`` is not set.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = "normal"
        self.group = dist.group.WORLD
        torch_version, nccl_runtime_version = _require_runtime()
        # Size the buffer for the largest token count in the sweep or conditioning ladder.
        ladder, _ = ep_harness.token_ladder(args.tokens_ladder, args.phase, None)
        conditioning = ep_harness.CONDITIONING_LADDERS[args.phase]
        self.max_tokens = max([*ladder, *conditioning])
        jit_root = Path(os.environ["EP_JIT_CACHE_DIR"])
        allow_hybrid_mode = _configure_gin_mode(args, world_size)
        gin_enabled = allow_hybrid_mode
        communication_backend = "nccl-gin" if gin_enabled else "nccl-device-lsa"
        self._deferred_jit_snapshot = None
        self.buffer = ElasticBuffer(
            self.group,
            num_max_tokens_per_rank=self.max_tokens,
            hidden=args.hidden,
            num_topk=args.topk,
            use_fp8_dispatch=False,
            deterministic=False,
            allow_hybrid_mode=allow_hybrid_mode,
            allow_multiple_reduction=True,
            prefer_overlap_with_compute=True,
            num_gpu_timeout_secs=100,
            explicitly_destroy=True,
        )
        # Tuning uses the logical expert count when the args provide one.
        tuning_num_experts = int(getattr(args, "num_logical_experts", args.experts))
        self.num_sms = int(
            self.buffer.get_theoretical_num_sms(tuning_num_experts, args.topk)
        )
        self.num_qps = int(self.buffer.get_theoretical_num_qps(self.num_sms))
        properties = torch.cuda.get_device_properties(device)
        device_sms = int(properties.multi_processor_count)
        jit_config = {
            "num_sms": self.num_sms,
            "num_qps": self.num_qps,
            "allocated_qps": int(self.buffer.num_allocated_qps),
            "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
            "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
            "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
            "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
            "is_scaleup_nvlink": self.buffer.num_scaleup_ranks == self.buffer.num_nvlink_ranks,
            "device_arch_major": int(properties.major),
            "device_arch_minor": int(properties.minor),
            "device_sms": device_sms,
            "device_smem_bytes": int(properties.shared_memory_per_block_optin),
            # NOTE(review): the leading 100 mirrors num_gpu_timeout_secs above;
            # assumes clock_rate is reported in kHz — confirm.
            "gpu_timeout_cycles": 100 * int(properties.clock_rate) * 1000,
        }
        # Collective call: every rank must reach this point.
        _require_cross_rank_equal(jit_config, "JIT configuration")
        if not _lsa_topology_is_valid(gin_enabled, world_size, jit_config):
            raise RuntimeError("DeepEP V2 no-GIN run is outside one realized LSA domain")
        self.jit_cache_key = _jit_cache_key(
            args, world_size, self.max_tokens, allow_hybrid_mode, jit_config
        )
        # Point the JIT cache at a child directory keyed by the configuration.
        os.environ["EP_JIT_CACHE_DIR"] = str(jit_root / self.jit_cache_key)
        realized_config = {
            "jit_cache_key": self.jit_cache_key,
            "num_max_tokens_per_rank": self.max_tokens,
            **jit_config,
        }
        _require_cross_rank_equal(realized_config, "realized tuning/topology")
        comm = getattr(self.buffer, "nccl_comm_handle", None)
        # A missing handle or missing ``managed`` attribute reads as "deepep-managed".
        communicator = (
            "deepep-managed" if getattr(comm, "managed", True) else "pytorch-reused"
        )

        loaded_libraries = _loaded_library_evidence()
        _require_cross_rank_equal(loaded_libraries, "loaded libraries")
        # jit_cubins starts empty and is filled by capture_deferred_provenance().
        self.backend_provenance = {
            "deepep_version": DEEPEP_V2_VERSION,
            "deepep_distribution_version": importlib.metadata.version("deep_ep"),
            "deepep_commit": DEEPEP_V2_COMMIT,
            "deepep_tree": DEEPEP_V2_TREE,
            "deepep_pr": DEEPEP_V2_PR,
            "deepep_fix_pr": DEEPEP_V2_FIX_PR,
            "fmt_commit": DEEPEP_V2_FMT_COMMIT,
            "api": "deep_ep.ElasticBuffer",
            "api_signature_sha256": _api_sha256(),
            "communication_backend": communication_backend,
            "gin_enabled": gin_enabled,
            "nccl_communicator": communicator,
            "torch_version": torch_version,
            "torch_git_version": str(torch.version.git_version),
            "cuda_version": str(torch.version.cuda),
            "nccl_package_version": importlib.metadata.version("nvidia-nccl-cu13"),
            "nccl_version": nccl_runtime_version,
            "nvshmem_package_version": importlib.metadata.version("nvidia-nvshmem-cu12"),
            "loaded_libraries": loaded_libraries,
            "jit_cache_key": self.jit_cache_key,
            "jit_cubins": [],
            "jit_random_seed": DEEPEP_V2_JIT_RANDOM_SEED,
            "num_experts": int(args.experts),
            "mode": "normal",
            "dispatch_dtype": "bf16",
            "combine_dtype": "bf16",
            "deterministic": False,
            "resource_mode": "tuned",
            "requested_num_sms": self.num_sms,
            "tuning_num_experts": tuning_num_experts,
            "num_sms": self.num_sms,
            "num_qps": self.num_qps,
            "allocated_qps": int(self.buffer.num_allocated_qps),
            "device_sms": device_sms,
            "sm_fraction": self.num_sms / device_sms,
            "tuned_source": "deepep-v2-analytical-sm-qp-logical-experts-v1",
            "num_max_tokens_per_rank": self.max_tokens,
            "allow_hybrid_mode": bool(self.buffer.allow_hybrid_mode),
            "allow_multiple_reduction": bool(self.buffer.allow_multiple_reduction),
            "prefer_overlap_with_compute": bool(
                self.buffer.prefer_overlap_with_compute
            ),
            "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
            "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
            "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
            "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
        }

    def buffer_cap(self, args):
        """Return the per-rank token cap the buffer was sized for."""
        return self.max_tokens

    def make_problem(self, T, idx, weights, x):
        """Bundle one problem, casting ids to ``deep_ep.topk_idx_t`` and weights to float32."""
        return types.SimpleNamespace(
            T=T,
            x=x,
            topk_idx=idx.to(deep_ep.topk_idx_t),
            topk_weights=weights.to(torch.float32),
        )

    def dispatch(self, p):
        """Run ``ElasticBuffer.dispatch`` with the tuned SM/QP counts and wrap its outputs."""
        recv_x, recv_topk_idx, recv_topk_weights, handle, _ = self.buffer.dispatch(
            p.x,
            topk_idx=p.topk_idx,
            topk_weights=p.topk_weights,
            num_experts=self.args.experts,
            num_max_tokens_per_rank=self.max_tokens,
            expert_alignment=1,
            num_sms=self.num_sms,
            num_qps=self.num_qps,
            async_with_compute_stream=False,
            do_handle_copy=True,
            do_cpu_sync=True,
            do_expand=False,
        )
        return types.SimpleNamespace(
            recv_x=recv_x,
            recv_topk_idx=recv_topk_idx,
            recv_topk_weights=recv_topk_weights,
            handle=handle,
        )

    def stage(self, p, h):
        """Use the received activations unchanged as the combine input."""
        h.combine_input = h.recv_x

    def combine(self, p, h):
        """Run ``ElasticBuffer.combine`` on the staged input and return the combined tensor."""
        combined_x, _, _ = self.buffer.combine(
            h.combine_input,
            handle=h.handle,
            num_sms=self.num_sms,
            num_qps=self.num_qps,
            async_with_compute_stream=False,
        )
        return combined_x

    def capture_deferred_provenance(self):
        """Hash the JIT kernels produced so far and store them in provenance.

        Raises:
            RuntimeError: if the evidence differs across ranks or changed
                since a previous call.
        """
        # destroy() uses this same barrier. Materialize its JIT kernel before hashing the
        # implementation so the first and later routing cases see identical evidence.
        self.buffer.barrier(use_comm_stream=True, with_cpu_sync=True)
        torch.cuda.synchronize()
        jit_cubins = _jit_artifact_evidence()
        _require_cross_rank_equal(jit_cubins, "JIT CUBINs")
        if (
            self._deferred_jit_snapshot is not None
            and jit_cubins != self._deferred_jit_snapshot
        ):
            raise RuntimeError("DeepEP V2 JIT CUBIN set changed after measurement")
        self._deferred_jit_snapshot = jit_cubins
        self.backend_provenance["jit_cubins"] = jit_cubins

    def inspect_dispatch(self, p, h):
        """Return a normalized view of the first ``recv_tokens(h)`` received rows."""
        count = self.recv_tokens(h)
        local_idx = h.recv_topk_idx[:count]
        # Negative entries are treated as "no expert" slots.
        valid = local_idx >= 0
        # Valid ids are shifted by this rank's expert offset; invalid ones are kept as-is.
        expert_ids = torch.where(
            valid,
            local_idx + self.rank * (self.args.experts // self.world_size),
            local_idx,
        )
        local = local_idx[valid].to(torch.int64)
        return types.SimpleNamespace(
            payload=h.recv_x[:count],
            expert_ids=expert_ids,
            weights=h.recv_topk_weights[:count].masked_fill(~valid, 0),
            local_expert_counts=torch.bincount(
                local, minlength=self.args.experts // self.world_size
            ),
            ordering_contract="elastic-source-metadata-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine a caller-supplied tensor, zero-padded to the received buffer's shape."""
        # Rows beyond ``transformed`` stay zero so the input matches recv_x's shape.
        combine_input = torch.zeros_like(h.recv_x)
        combine_input[: transformed.shape[0]].copy_(transformed.to(combine_input.dtype))
        combined, _, _ = self.buffer.combine(
            combine_input,
            handle=h.handle,
            num_sms=self.num_sms,
            num_qps=self.num_qps,
            async_with_compute_stream=False,
        )
        return combined

    def recv_tokens(self, h):
        """Return the last entry of the handle's per-scale-up-rank received-token prefix sum."""
        # ``.item()`` forces a device-to-host read.
        return int(h.handle.psum_num_recv_tokens_per_scaleup_rank[-1].item())

    def finalize(self, rc):
        """Destroy the buffer and process group; return 1 if teardown raises, else ``rc``."""
        try:
            dist.barrier()
            self.buffer.destroy()
            dist.barrier()
            dist.destroy_process_group()
        except Exception:
            # Unlike the other adapters, a teardown failure is reported as exit code 1.
            return 1
        return rc
dispatch/combine benchmark harness. + +Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`) +implement a small duck-typed protocol; this module owns the source-tokens-per-rank +sweep, the timing, the correctness gate, and the provenance-tagged JSON doc. + +Fair-comparison contract (see docs/methodology.md): + * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs + + gate weights are generated once from a fixed seed over the *global* batch and are + identical on every SKU; each rank materializes its slice. So every platform runs + the *same* problem (no per-rank/per-platform RNG in the adapters). + * **Explicit measurement contract**: layout-and-dispatch-v1 includes routing-layout + generation in dispatch timing. Combine excludes staging. + Isolated sum is derived independently at each percentile and is not a measured chained op. + * **Correct collective percentile**: each iteration's latency is reduced MAX across + ranks first (a collective finishes with its slowest rank), THEN percentiled — + `median_i(max_r)`, not `max_r(median_i)`. + * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and + `global_tokens = T * ep_size` are recorded as explicit chart coordinates. + +stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported +lazily inside run_sweep) so this file `py_compile`s without torch. 
+ +Backend protocol: + name, mode, combine_needs_redispatch, backend_provenance(dict) + buffer_cap(args) -> int|None + make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice + dispatch(problem) -> handle # pure dispatch comm (timed) + stage(problem, handle) # untimed expert-output placement + combine(problem, handle) -> tensor # pure combine comm (timed) + inspect_dispatch(problem, handle) -> view # normalized payload/expert/weight metadata + combine_transformed(problem, handle, tensor) -> tensor + recv_tokens(handle) -> int # realized tokens received this rank + finalize(rc) -> int|NoReturn +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import math +import os + +import contracts +import identity +import workload as workload_contract + +# Raw v1 result emitted by one benchmark case. Publication uses a separate contract. +SCHEMA_VERSION = 1 + +# Every comparison-grade EP point uses the same literal timing profile on every SKU/backend. +# Eight timed iterations keep each MoRI burst well below its sustained-iteration wedge, 64 trials +# provide 512 observations per operation, and 32 warmups meet Blackwell's measured clock-ramp floor. +SAMPLING_CONTRACT = identity.V1_CASE_PROFILE["sampling_contract"] +TIMED_SAMPLES_PER_POINT = 512 +TIMED_ITERS_PER_TRIAL = 8 +TRIALS_PER_POINT = 64 +WARMUP_ITERS_PER_TRIAL = 32 +WARMUP_SEMANTICS = "full-roundtrip-before-each-component-trial-point-v1" +ROUTING_SEED = 67 +ROUTING_GENERATOR = workload_contract.GENERATOR_VERSION +ACTIVATION_PROFILE = "canonical-counter-source-v3" +ACTIVATION_GENERATOR = workload_contract.ACTIVATION_GENERATOR +PLACEMENT = "packed" +COMPONENT_ORDER_CONTRACT = "roundtrip-dispatch-activation-only-combine-v2" + +# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal +# mode; "decode"/"prefill" name the small/large-token regime). 
Powers of two for a +# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap). +DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128] +PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096] +CONDITIONING_LADDERS = { + phase: list(ladder) for phase, ladder in contracts.V1_CONDITIONING_LADDERS.items() +} +CONDITIONING_ROUNDS_PER_SHAPE = contracts.V1_CONDITIONING_ROUNDS_PER_SHAPE +CONDITIONING_CONTRACT = identity.V1_CASE_PROFILE["conditioning_contract"] +ORACLE_CONTRACT = identity.V1_CASE_PROFILE["oracle_contract"] +ORACLE_RTOL = 5e-2 +ORACLE_ATOL = 2e-2 + +BF16_BYTES = 2 +EPLB_REDUNDANT_EXPERTS = 32 +EPLB_REFERENCE_TOKENS_PER_RANK = 2048 +EPLB_PLANNER = "greedy-rank-major-v1" +V1_PROFILE = { + "dispatch_dtype": "bf16", + "combine_dtype": "bf16", + "combine_quant_mode": "none", + "mode": "normal", + "measurement_contract": "layout-and-dispatch-v1", + "resource_mode": "tuned", + "placement": PLACEMENT, + "activation_profile": ACTIVATION_PROFILE, + "activation_generator": ACTIVATION_GENERATOR, + "routing_generator": ROUTING_GENERATOR, + "component_order_contract": COMPONENT_ORDER_CONTRACT, + "conditioning_contract": CONDITIONING_CONTRACT, + "eplb_reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK, + "eplb_redundant_experts": EPLB_REDUNDANT_EXPERTS, + "eplb_planner": EPLB_PLANNER, + # DeepEP/UCCL use this only as the fallback when their tuned default is not exported. 
def format_collective_version(raw) -> str:
    """Render a collective-library version as a dotted string.

    Accepts a packed integer, a tuple/list of components, or anything else:

    * integers below 10_000 are split as ``major*1000 + minor*100 + patch``;
    * larger integers are split as ``major*10_000 + minor*100 + patch``;
    * tuples/lists are joined with dots;
    * ``None`` and the empty string become ``"unknown"``; any other value is
      passed through ``str``.
    """
    if isinstance(raw, (tuple, list)):
        return ".".join(str(part) for part in raw)
    if not isinstance(raw, int):
        if raw in (None, ""):
            return "unknown"
        return str(raw)
    # Two packed layouts exist; they differ in the major divisor and in how many
    # digits the minor field occupies.
    if raw < 10_000:
        major_base, minor_mod = 1000, 10
    else:
        major_base, minor_mod = 10_000, 100
    major = raw // major_base
    minor = raw // 100 % minor_mod
    patch = raw % 100
    return f"{major}.{minor}.{patch}"
+ ap.add_argument("--workload-dir", default="", + help="dir of canonical workload traces; empty = seeded runtime generation (dev)") + ap.add_argument("--case-id", default="") + ap.add_argument("--suite", default="") + ap.add_argument("--workload-name", default="") + ap.add_argument("--required-publication", default="") + ap.add_argument("--seed", type=int, default=ROUTING_SEED) + # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks + + # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us + # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within + # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless. + ap.add_argument("--warmup", type=int, default=WARMUP_ITERS_PER_TRIAL, + help=f"untimed full roundtrips before each trial/point; fixed by " + f"{SAMPLING_CONTRACT} to {WARMUP_ITERS_PER_TRIAL}") + ap.add_argument("--iters", type=int, default=TIMED_ITERS_PER_TRIAL, + help=f"timed iterations per trial; fixed by {SAMPLING_CONTRACT} to " + f"{TIMED_ITERS_PER_TRIAL}") + ap.add_argument("--trials", type=int, default=TRIALS_PER_POINT, + help=f"timed trials; fixed by {SAMPLING_CONTRACT} to {TRIALS_PER_POINT}") + # provenance / output + ap.add_argument("--runner", required=True) + ap.add_argument("--topology-class", required=True) + ap.add_argument("--transport", default="") + # gpus-per-node=0 means one node containing the whole EP group. 
+ ap.add_argument("--gpus-per-node", type=int, default=0) + ap.add_argument("--scale-up-domain", type=int, default=0, help="0 = gpus_per_node*ep (one domain)") + ap.add_argument("--timestamp") + ap.add_argument("--out", required=True) + + +def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]: + """Return (ladder, dropped): explicit spec else the phase default; positive ints; + clamped to `cap` with dropped points reported (never silently truncated).""" + if spec and spec.strip(): + want = [int(t) for t in spec.replace(",", " ").split() if t] + else: + want = DECODE_LADDER if phase == "decode" else PREFILL_LADDER + want = sorted({t for t in want if t > 0}) + if cap is not None: + return [t for t in want if t <= cap], [t for t in want if t > cap] + return want, [] + + +def sampling_contract_error(iters: int, trials: int, warmup: int) -> str | None: + """Return a user-facing error unless the exact cross-SKU timing profile is used.""" + expected = (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL) + observed = (iters, trials, warmup) + if observed != expected: + return (f"{SAMPLING_CONTRACT} requires exactly iters:trials:warmup=" + f"{expected[0]}:{expected[1]}:{expected[2]} on every SKU/backend; got " + f"{observed[0]}:{observed[1]}:{observed[2]} " + f"({iters * trials if iters > 0 and trials > 0 else 'invalid'} timed samples)") + return None + + +def _stats_vec(xs: list[int]) -> dict: + """min/mean/max/CV (+ empty count) of a per-rank count vector — self-describing source-token + or load summary without dumping the full vector.""" + n = len(xs) or 1 + mean = sum(xs) / n + var = sum((x - mean) ** 2 for x in xs) / n + cv = (var ** 0.5 / mean) if mean > 0 else 0.0 + return {"min": min(xs) if xs else 0, "mean": round(mean, 3), + "max": max(xs) if xs else 0, "cv": round(cv, 4), + "empty_ranks": sum(1 for x in xs if x == 0), "total": sum(xs), "ranks": n} + + +def percentile(xs: list[float], q: float) -> float: + if not xs: 
+ return float("nan") + s = sorted(xs) + i = max(0, min(len(s) - 1, math.ceil(q / 100.0 * len(s)) - 1)) + return s[i] + + +def _sha256_json(value) -> str: + payload = json.dumps( + value, allow_nan=False, ensure_ascii=False, sort_keys=True, separators=(",", ":") + ).encode() + return hashlib.sha256(payload).hexdigest() + + +def _series_provenance(provenance: dict) -> dict: + """Retain stable semantic build identity while keeping raw binaries diagnostic.""" + return contracts.series_provenance(provenance) + + +def _write_bytes_atomic(path: str, payload: bytes) -> tuple[str, int]: + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + temporary = f"{path}.tmp-{os.getpid()}" + try: + with open(temporary, "wb") as handle: + handle.write(payload) + handle.flush() + os.fsync(handle.fileno()) + os.replace(temporary, path) + finally: + try: + os.unlink(temporary) + except FileNotFoundError: + pass + return hashlib.sha256(payload).hexdigest(), len(payload) + + +def _write_json_atomic(path: str, value) -> tuple[str, int]: + payload = ( + json.dumps(value, allow_nan=False, ensure_ascii=False, indent=2) + "\n" + ).encode() + return _write_bytes_atomic(path, payload) + + +def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]: + """Per-iteration CUDA-event latencies (µs) for THIS rank. + + Without `pre`: times `fn()`. With `pre`: runs `pre()` UNTIMED each iteration (sync + before the start event so its GPU work can't bleed in), then times `fn(pre_result)` + — how combine is isolated when it consumes the dispatch state and needs a fresh + untimed dispatch+stage before every sample. Returns the raw per-iteration series; + the caller reduces across ranks per iteration before percentiling. 
def kernel_generation(backend) -> str:
    """Name the kernel family for ``backend``.

    A truthy ``kernel_generation`` attribute on the adapter wins. Otherwise the
    family is looked up from ``backend.name``, with ``"n-a"`` for names that have
    no entry.
    """
    explicit = getattr(backend, "kernel_generation", None)
    if not explicit:
        families_by_name = {
            "deepep": "v1",
            "deepep-v2": "v2-elastic-buffer",
            "deepep-hybrid": "hybrid",
        }
        return families_by_name.get(backend.name, "n-a")
    return explicit
hashlib.sha256() + for tensor in tensors: + digest.update(tensor.detach().contiguous().cpu().numpy().tobytes()) + return digest.hexdigest() + + +def _normalized_expert_metadata(torch, expert_ids, weights): + """Sort each row by global expert ID while keeping -1 sentinels last.""" + valid = expert_ids >= 0 + keys = torch.where(valid, expert_ids.to(torch.int64), torch.full_like(expert_ids, 1 << 30)) + order = torch.argsort(keys, dim=1, stable=True) + sorted_ids = torch.gather(expert_ids.to(torch.int64), 1, order) + sorted_weights = torch.gather(weights.to(torch.float32), 1, order) + sorted_valid = sorted_ids >= 0 + return ( + torch.where(sorted_valid, sorted_ids, torch.full_like(sorted_ids, -1)), + sorted_weights.masked_fill(~sorted_valid, 0), + ) + + +def _expert_transform(torch, payload, expert_ids, weights, combine_weight_semantics): + """Build one local expert aggregate for the v1 unweighted combine contract.""" + if combine_weight_semantics != "unweighted-rank-sum": + raise ValueError("v1 requires unweighted rank-sum combine") + valid = expert_ids >= 0 + expert = expert_ids.clamp(min=0).to(torch.int64) + gate = weights.to(torch.float32).masked_fill(~valid, 0) + scale = ((expert * 17 + 5) % 31 + 1).to(torch.float32) / 32 + offset_a = (((expert * 29 + 7) % 37) - 18).to(torch.float32) / 64 + offset_b = (((expert * 43 + 11) % 41) - 20).to(torch.float32) / 128 + scale_sum = (gate * scale).sum(dim=1, keepdim=True) + offset_a_sum = (gate * offset_a).sum(dim=1, keepdim=True) + offset_b_sum = (gate * offset_b).sum(dim=1, keepdim=True) + columns = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64) + pattern = (((columns * 13) % 17) - 8).to(torch.float32) / 8 + transformed = ( + payload.float() * scale_sum + offset_a_sum + offset_b_sum * pattern.unsqueeze(0) + ) + return transformed.to(payload.dtype) + + +def _expected_transformed_combine(torch, problem): + """Independently derive sum_i gate_i * expert_i(x) for each source token.""" + expected = 
torch.zeros_like(problem.x, dtype=torch.float32) + expert_ids = problem.topk_idx.to(torch.int64) + weights = problem.topk_weights.to(torch.float32) + columns = torch.arange(problem.x.shape[1], device=problem.x.device, dtype=torch.int64) + pattern = (((columns * 13) % 17) - 8).to(torch.float32) / 8 + for slot in range(expert_ids.shape[1]): + expert = expert_ids[:, slot] + gate = weights[:, slot].unsqueeze(1) + scale = (((expert * 17 + 5) % 31 + 1).to(torch.float32) / 32).unsqueeze(1) + offset_a = ((((expert * 29 + 7) % 37) - 18).to(torch.float32) / 64).unsqueeze(1) + offset_b = ((((expert * 43 + 11) % 41) - 20).to(torch.float32) / 128).unsqueeze(1) + expert_output = problem.x.float() * scale + offset_a + offset_b * pattern.unsqueeze(0) + expected.add_(gate * expert_output) + return expected + + +def _run_expert_oracle( + torch, + routing, + backend, + problem, + global_idx, + global_weights, + rank: int, + experts_per_rank: int, + seed: int, +): + """Verify one real dispatch/transform/combine without entering a timed region.""" + handle = backend.dispatch(problem) + torch.cuda.synchronize() + try: + view = backend.inspect_dispatch(problem, handle) + source_ids = routing.decode_source_ids(view.payload, seed) + except Exception as inspection_error: + try: + problem.recv_tokens = backend.recv_tokens(handle) + backend.stage(problem, handle) + backend.combine(problem, handle) + torch.cuda.synchronize() + except Exception as cleanup_error: + raise inspection_error from cleanup_error + return { + "contract": ORACLE_CONTRACT, + "passed": False, + "ordering_contract": "adapter-inspection-failed", + "order_sha256": None, + "dispatch_sha256": None, + "combine_weight_semantics": getattr( + backend, "combine_weight_semantics", "undeclared" + ), + "receive_count": 0, + "atol": ORACLE_ATOL, + "max_absolute_error": None, + "max_elementwise_relative_error": None, + "max_relative_error": None, + "max_weight_error": None, + "rtol": ORACLE_RTOL, + "checks": { + "combine_values": False, 
+ "counts": False, + "metadata": False, + "multiplicity": False, + "payload": False, + "source_set": False, + "weights": False, + }, + } + + receive_count = int(view.payload.shape[0]) + shape_ok = ( + view.payload.ndim == 2 + and view.expert_ids.shape == (receive_count, problem.topk_idx.shape[1]) + and view.weights.shape == view.expert_ids.shape + ) + source_range = bool( + receive_count == 0 + or ((source_ids >= 0) & (source_ids < global_idx.shape[0])).all().item() + ) + if source_range: + expected_idx = global_idx.to(problem.x.device).index_select(0, source_ids) + expected_weights = global_weights.to(problem.x.device).index_select(0, source_ids) + local = (expected_idx // experts_per_rank) == rank + expected_ids = torch.where(local, expected_idx, torch.full_like(expected_idx, -1)) + expected_weights = expected_weights.masked_fill(~local, 0) + expected_payload = routing.activations_for_source_ids( + source_ids, problem.x.shape[1], seed, problem.x.dtype + ) + else: + expected_ids = torch.full_like(view.expert_ids, -1) + expected_weights = torch.zeros_like(view.weights) + expected_payload = torch.empty_like(view.payload) + actual_ids, actual_weights = _normalized_expert_metadata( + torch, view.expert_ids, view.weights + ) + expected_ids, expected_weights = _normalized_expert_metadata( + torch, expected_ids, expected_weights + ) + expected_sources = ( + ((global_idx // experts_per_rank) == rank).any(dim=1).nonzero(as_tuple=True)[0] + ).to(problem.x.device) + source_set_ok = ( + source_range + and source_ids.numel() == torch.unique(source_ids).numel() + and torch.equal(torch.sort(source_ids).values, expected_sources) + ) + payload_ok = source_range and torch.equal(view.payload, expected_payload) + metadata_ok = shape_ok and torch.equal(actual_ids, expected_ids) + max_weight_error = ( + float((actual_weights - expected_weights).abs().max().item()) + if actual_weights.numel() + else 0.0 + ) + weights_ok = max_weight_error == 0.0 + valid_expected = expected_ids >= 0 + 
expected_local = expected_ids[valid_expected] - rank * experts_per_rank + expected_counts = torch.bincount(expected_local, minlength=experts_per_rank) + counts_ok = torch.equal( + view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64) + ) + multiplicity_ok = torch.equal( + (actual_ids >= 0).sum(dim=1), (expected_ids >= 0).sum(dim=1) + ) + # Receive-slot assignment may use atomics and is not a semantic EP guarantee. Compare + # pre/post dispatch evidence in canonical source-token order without changing the native path. + canonical_order = torch.argsort(source_ids.to(torch.int64), stable=True) + canonical_sources = source_ids.to(torch.int64).index_select(0, canonical_order) + canonical_ids = actual_ids.to(torch.int64).index_select(0, canonical_order) + canonical_weights = actual_weights.index_select(0, canonical_order) + ordering_contract = f"canonical-source-id-v1/{view.ordering_contract}" + order_sha256 = _tensor_sha256(canonical_sources) + dispatch_sha256 = _tensor_sha256( + canonical_sources, canonical_ids, canonical_weights + ) + + problem.recv_tokens = receive_count + combine_weight_semantics = backend.combine_weight_semantics + transformed = _expert_transform( + torch, view.payload, actual_ids, actual_weights, combine_weight_semantics + ) + combined = backend.combine_transformed(problem, handle, transformed) + torch.cuda.synchronize() + expected_combined = _expected_transformed_combine(torch, problem) + if combined.shape == expected_combined.shape and combined.numel(): + absolute_error = (combined.float() - expected_combined).abs() + max_absolute_error = float(absolute_error.max().item()) + max_relative_error = max_absolute_error / ( + float(expected_combined.abs().max().item()) + 1e-6 + ) + max_elementwise_relative_error = float( + (absolute_error / expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item() + ) + combine_values_ok = bool(torch.allclose( + combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL + )) + elif 
combined.shape == expected_combined.shape: + max_absolute_error = 0.0 + max_elementwise_relative_error = 0.0 + max_relative_error = 0.0 + combine_values_ok = True + else: + max_absolute_error = None + max_elementwise_relative_error = None + max_relative_error = None + combine_values_ok = False + tolerance = float(getattr(backend, "tolerance", 5e-2)) + checks = { + "combine_values": combine_values_ok, + "counts": counts_ok, + "metadata": metadata_ok, + "multiplicity": multiplicity_ok, + "payload": payload_ok, + "source_set": source_set_ok, + "weights": weights_ok, + } + return { + "contract": ORACLE_CONTRACT, + "passed": bool( + all(checks.values()) + and ordering_contract + and max_relative_error is not None + and max_relative_error < tolerance + ), + "atol": ORACLE_ATOL, + "combine_weight_semantics": combine_weight_semantics, + "ordering_contract": ordering_contract, + "order_sha256": order_sha256, + "dispatch_sha256": dispatch_sha256, + "receive_count": receive_count, + "max_absolute_error": max_absolute_error, + "max_elementwise_relative_error": max_elementwise_relative_error, + "max_relative_error": max_relative_error, + "max_weight_error": max_weight_error, + "rtol": ORACLE_RTOL, + "checks": checks, + } + + +def _histogram(xs: list[float], nbins: int = 40) -> dict: + """Compact equal-width summary of the exact private cross-rank-max samples.""" + if not xs: + return {"n": 0} + lo, hi = min(xs), max(xs) + if hi <= lo: + return {"n": len(xs), "min": lo, "max": hi, "bins": nbins, "counts": [len(xs)]} + counts = [0] * nbins + span = hi - lo + for x in xs: + b = min(nbins - 1, int((x - lo) / span * nbins)) + counts[b] += 1 + return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts} + + +def _derive_publication_status(v: dict) -> str: + """Classify raw attempts; only the isolated coverage publisher may promote evidence.""" + if v["execution_status"] != "complete": + return "failed" + if v["semantic_correctness"] != "pass" or 
v["measurement_conformance"] != "conformant" \ + or v["workload_identity"] == "inconsistent": + return "invalid" + # Per-case producers cannot prove exact matrix coverage, repeat stability, or controlled + # cohorts. Keep even sound attempts diagnostic until the isolated publisher validates them. + return "diagnostic" + + +def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int: + """Drive the source-tokens-per-rank sweep for one fully-specified line.""" + sampling_error = sampling_contract_error(args.iters, args.trials, args.warmup) + if sampling_error: + if rank == 0: + print(f"ERROR: {sampling_error}") + return 2 + import routing # torch-based; imported lazily so the module byte-compiles without torch + import eplb # stdlib planner + torch remap (the EPLB transform) + + ep_size = world_size + # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the + # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL + # experts then remapped to physical (build_trace), so the whole sweep runs over the + # balanced physical placement with no adapter change. 
+ eplb_on = getattr(args, "eplb", False) + num_logical = getattr(args, "num_logical_experts", args.experts) + if args.experts % ep_size != 0: + if rank == 0: + print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})") + return 2 + experts_per_rank = args.experts // ep_size + if getattr(backend, "combine_weight_semantics", None) != "unweighted-rank-sum": + if rank == 0: + print("ERROR: v1 requires activation-only unweighted combine") + return 2 + + cap = backend.buffer_cap(args) + conditioning_ladder = CONDITIONING_LADDERS[args.phase] + if cap is not None and cap < conditioning_ladder[-1]: + if rank == 0: + print(f"ERROR: {backend.name} buffer cap {cap} cannot run the v1 conditioning ladder") + return 2 + ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap) + if rank == 0 and dropped: + print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} " + f"(hidden={args.hidden}); not silently truncated.") + if not ladder: + if rank == 0: + print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})") + return 2 + MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM + + # EPLB plan (once): estimate logical load from the global logical trace at the largest + # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB + # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps + # to physical when the plan is present; otherwise it's the identity (logical == physical). 
+ eplb_plan = None + if eplb_on: + ref_idx, _ = routing.build_global_routing( + EPLB_REFERENCE_TOKENS_PER_RANK * ep_size, + num_logical, + args.topk, + args.routing, + args.seed, + ) + load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist() + eplb_plan = eplb.build_plan(load, args.experts, ep_size) + if rank == 0: + print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); " + f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> " + f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts " + f"replicated (hottest {eplb_plan['max_replicas']}x)") + + canonical = bool(getattr(args, "workload_dir", "")) + loaded_workload_ids, loaded_checksums = [], {} + if canonical: + import workload as _wl + + def build_trace(gt): + # canonical: load pre-serialized trace bytes (verified by checksum) so this run is + # provably the SAME workload as any other consuming the same files. else: seeded gen. + if canonical: + wid = _wl.compute_workload_id( + args.routing, args.hidden, args.topk, num_logical, ep_size, gt, args.seed + ) + idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True) + idx_l = torch.from_numpy(idx_np).to(torch.int64) + w = torch.from_numpy(w_np).to(torch.float32) + if wid not in loaded_workload_ids: + loaded_workload_ids.append(wid) + loaded_checksums[wid] = man.get("checksums") + else: + idx_l, w = routing.build_global_routing( + gt, num_logical, args.topk, args.routing, args.seed + ) + return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w + + # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold + # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually + # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone + # and is also cold-jump-safe for MoRI. 
+ def warm_roundtrips(problem, count): + for _ in range(count): + handle = backend.dispatch(problem) + if not hasattr(problem, "recv_tokens"): + # Dynamic receive cardinality is stable for this fixed routing trace. Cache it + # during untimed conditioning so adapters never read a device scalar in timing. + problem.recv_tokens = backend.recv_tokens(handle) + backend.stage(problem, handle) + backend.combine(problem, handle) + torch.cuda.synchronize() + + for wt in conditioning_ladder: + # Warm-only shapes need not have canonical manifests: they are never measured or emitted. + wi, ww = routing.build_global_routing( + wt * ep_size, num_logical, args.topk, args.routing, args.seed, + ) + if eplb_plan is not None: + wi = eplb.remap_idx(wi, eplb_plan) + wsi, wsw = routing.rank_slice(wi, ww, rank, wt) + wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16) + wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx) + warm_roundtrips(wp, CONDITIONING_ROUNDS_PER_SHAPE) + torch.cuda.synchronize() + dist.barrier() + # Setup may materialize deferred provenance such as DeepEP V2 JIT CUBINs. + # Resolve it after conditioning but before correctness or timed measurements. + capture_deferred_provenance = getattr(backend, "capture_deferred_provenance", None) + if capture_deferred_provenance is not None: + capture_deferred_provenance() + provenance_issues = contracts.backend_provenance_issues( + backend.name, backend.backend_provenance + ) + if provenance_issues: + if rank == 0: + print( + f"ERROR: unpinned provenance {provenance_issues} " + f"in {backend.backend_provenance}" + ) + return 4 + elem_dispatch = BF16_BYTES + + # ---- Pass 1: build each deterministic problem and run the expert oracle. 
---- + problems, gate, gts, global_traces, input_snapshots = {}, {}, {}, {}, {} + routing_hashes = set() + for T in ladder: + counts = [T] * ep_size + gt = T * ep_size + gts[T] = gt + idx_g, w_g = build_trace(gt) + rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g) + gpn = args.gpus_per_node or ep_size + rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T), + gpn, args.scale_up_domain or None) + rstats["source_token_stats"] = _stats_vec(counts) + routing_hashes.add(rstats["routing_hash"]) + my_off, my_cnt = rank * T, T + idx_s = idx_g[my_off:my_off + my_cnt].contiguous() + w_s = w_g[my_off:my_off + my_cnt].contiguous() + x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16) + problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x) + input_snapshots[T] = ( + problem.x.clone(), problem.topk_idx.clone(), problem.topk_weights.clone() + ) + oracle = _run_expert_oracle( + torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank, + args.seed, + ) + before_x, before_idx, before_weights = input_snapshots[T] + pre_input_unchanged = ( + torch.equal(problem.x, before_x) + and torch.equal(problem.topk_idx, before_idx) + and torch.equal(problem.topk_weights, before_weights) + ) + problems[T] = problem + global_traces[T] = (idx_g, w_g) + gate[T] = { + "rstats": rstats, + "recv_local": oracle["receive_count"], + "max_rel": oracle["max_relative_error"] or 0.0, + "local_ok": int(oracle["passed"]), + "oracle_pre": oracle, + "pre_input_unchanged": pre_input_unchanged, + } + + # ---- Pass 2: every backend uses the same ascending point order and conditioning ramp. + # Per-iteration cross-rank MAX samples are pooled across trials. ---- + disp_pool = {T: [] for T in ladder} # pooled per-iteration cross-rank MAX (dispatch) + comb_pool = {T: [] for T in ladder} # ... 
combine + rt_pool = {T: [] for T in ladder} # independently measured round trip + disp_trials = {T: [] for T in ladder} + comb_trials = {T: [] for T in ladder} + rt_trials = {T: [] for T in ladder} + order = list(ladder) + for _trial in range(args.trials): + for T in order: + problem = problems[T] + # Stateful paired APIs may expose only a measured round trip. + # Do not synthesize component latency from that measurement. + roundtrip_only = getattr(backend, "roundtrip_only", False) + + def rt_once(p=problem): + hh = backend.dispatch(p) + backend.stage(p, hh) + return backend.combine(p, hh) + + # Every available component starts after the same synchronized full-roundtrip warmup. + # Roundtrip is first on every backend because it is the comparison headline. + warm_roundtrips(problem, args.warmup) + rt_iters = time_us(torch, lambda p=problem: rt_once(p), 0, args.iters) + if roundtrip_only: + disp_iters = comb_iters = [] + else: + warm_roundtrips(problem, args.warmup) + disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p), + 0, args.iters) + + def prep(p=problem): + hh = backend.dispatch(p) + backend.stage(p, hh) + return hh + warm_roundtrips(problem, args.warmup) + if backend.combine_needs_redispatch: + comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh), + 0, args.iters, pre=prep) + else: + hh = prep() + torch.cuda.synchronize() + comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx), + 0, args.iters) + # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled. 
+ if disp_iters: + reduced_dispatch = _reduce_vec(torch, dist, device, disp_iters, MAX) + reduced_combine = _reduce_vec(torch, dist, device, comb_iters, MAX) + disp_trials[T].append(reduced_dispatch) + comb_trials[T].append(reduced_combine) + disp_pool[T] += reduced_dispatch + comb_pool[T] += reduced_combine + reduced_roundtrip = _reduce_vec(torch, dist, device, rt_iters, MAX) + rt_trials[T].append(reduced_roundtrip) + rt_pool[T] += reduced_roundtrip + + # ---- Pass 3: prove timed inputs were immutable and repeat the full oracle. ---- + for T in ladder: + problem = problems[T] + before_x, before_idx, before_weights = input_snapshots[T] + input_unchanged = gate[T]["pre_input_unchanged"] and ( + torch.equal(problem.x, before_x) + and torch.equal(problem.topk_idx, before_idx) + and torch.equal(problem.topk_weights, before_weights) + ) + idx_g, w_g = global_traces[T] + post = _run_expert_oracle( + torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank, + args.seed, + ) + pre = gate[T]["oracle_pre"] + order_stable = ( + pre["ordering_contract"] == post["ordering_contract"] + and pre["order_sha256"] == post["order_sha256"] + and pre["dispatch_sha256"] == post["dispatch_sha256"] + ) + gate[T].update({ + "input_unchanged": input_unchanged, + "local_ok": int(pre["passed"] and post["passed"] and input_unchanged and order_stable), + "max_rel": max(pre["max_relative_error"] or 0.0, post["max_relative_error"] or 0.0), + "oracle_post": post, + "order_stable": order_stable, + }) + + # ---- Pass 4: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ---- + def pcts(xs): + return ({"p50": percentile(xs, 50), "p90": percentile(xs, 90), + "p95": percentile(xs, 95), "p99": percentile(xs, 99)} if xs else None) + + def component(percentiles, count, *, derived=False): + if percentiles is None: + return {"availability": "unavailable", "origin": None, + "percentiles_us": None, "sample_count": 0} + return { + "availability": "derived" if derived else 
"measured", + "origin": "derived-percentile-sum" if derived else "measured", + "percentiles_us": percentiles, + "sample_count": 0 if derived else count, + } + rows = [] + all_anomalies = [] + thr_rt = 3.0 + for T in ladder: + gt = gts[T] + g = gate[T] + rstats = g["rstats"] + d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T] + dp, cp, rtp = pcts(d), pcts(c), pcts(rt) + # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op + # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput + # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency. + isum = {key: dp[key] + cp[key] for key in dp} if dp and cp else None + recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM) + recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX) + recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN) + global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN) + max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0] + point_ok = bool(global_ok) and recv_total > 0 + rank_evidence = [None] * world_size + dist.all_gather_object( + rank_evidence, + { + "input_unchanged": g["input_unchanged"], + "order_stable": g["order_stable"], + "post_timing": g["oracle_post"], + "pre_timing": g["oracle_pre"], + "rank": rank, + }, + ) + # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv + # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy + # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert. + token_rank_copies = rstats["routed_copies"] + H = args.hidden + throughput = { + percentile_name: gt / (latency_us * 1e-6) + for percentile_name, latency_us in rtp.items() + } + disp_bytes_l = token_rank_copies * H * elem_dispatch + comb_bytes_l = token_rank_copies * H * 2 + # Contract-level anomalies are attached to the row and rolled into validity. 
+ # roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99. + # roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained + # op can't finish faster than its slowest required component (sync semantics violated). + row_anoms = [] + if isum and isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]: + row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T, + "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2), + "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt}) + floor = max(dp["p50"], cp["p50"]) if dp and cp else None + if floor and rtp["p50"] > 0 and rtp["p50"] < 0.95 * floor: + row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T, + "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)}) + all_anomalies.extend(row_anoms) + rows.append({ + "anomalies": row_anoms, + "components": { + "combine": component(cp, len(c)), + "dispatch": component(dp, len(d)), + "isolated_sum": component(isum, 0, derived=True), + "roundtrip": component(rtp, len(rt)), + }, + "correctness": { + "contract": ORACLE_CONTRACT, + "max_relative_error": max_rel, + "passed": point_ok, + "rank_evidence": rank_evidence, + "scope": "dispatch-metadata-and-transformed-combine", + }, + "global_tokens": gt, + "logical_bytes": { + "combine": comb_bytes_l, + "dispatch": disp_bytes_l, + "roundtrip": disp_bytes_l + comb_bytes_l, + }, + "receive": { + "max": recv_max, + "mean": recv_total / world_size, + "min": recv_min, + "total": recv_total, + }, + "routing": { + "empty_expert_count": rstats["empty_expert_count"], + "empty_rank_count": rstats["empty_rank_count"], + "expert_assignment_rank_cv": rstats["expert_assignment_rank_cv"], + "expert_assignments_per_rank": rstats["expert_assignments_per_rank"], + "expert_load_cv": rstats["expert_load_cv"], + "expert_load_max": rstats["expert_load_max"], + "expert_load_mean": rstats["expert_load_mean"], + "expert_load_min": 
rstats["expert_load_min"], + "fanout_histogram": rstats["fanout_hist"], + "fanout_max": rstats["fanout_max"], + "fanout_mean": rstats["fanout_mean"], + "fanout_min": rstats["fanout_min"], + "hash": rstats["routing_hash"], + "hotspot_ratio": rstats["hotspot_ratio"], + "locality": rstats.get("locality"), + "payload_copies_per_rank": rstats["payload_copies_per_rank"], + "payload_rank_cv": rstats["payload_rank_cv"], + "routed_copies": rstats["routed_copies"], + "source_token_stats": rstats.get("source_token_stats"), + }, + "sample_histograms": { + "dispatch": _histogram(d) if d else None, + "combine": _histogram(c) if c else None, + "roundtrip": _histogram(rt), + }, + "token_rate_at_latency_percentile": throughput, + "tokens_per_rank": T, + }) + if rank == 0: + component_log = (f"disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} " + f"comb {cp['p50']:6.1f}/{cp['p99']:6.1f} " if dp and cp + else "components=unavailable ") + print(f" T={T:<5} {component_log}" + f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(rt)} fanout={rstats['fanout_mean']:.2f} " + f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} " + f"correct={point_ok}") + + # Cross-rank workload-identity proof: every rank must have built the SAME global routing + # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and + # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing. + trace_sig = hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest() + routing_consistent = _same_hash_across_ranks(torch, dist, device, trace_sig) + + # Capture again after correctness and timing so no lazily generated kernel can escape + # the implementation identity recorded in the artifact. + if capture_deferred_provenance is not None: + capture_deferred_provenance() + + if rank != 0: + return 0 + + # status=valid requires correctness AND a proven-identical routing trace across ranks. 
+ all_ok = bool(rows) and all(r["correctness"]["passed"] for r in rows) and routing_consistent + + # Adapters never self-label official; status is derived from these gates. + prov = backend.backend_provenance + provenance_complete = contracts.provenance_complete( + prov, + backend.name, + getattr(args, "git_run", None), + image_digest=getattr(args, "image_digest", None), + image_verified=getattr(args, "image_digest_verified", False), + squash_sha256=getattr(args, "squash_sha256", None), + ) + resource_profile = contracts.project_resource_profile(prov) + resource_conformance = resource_profile["conformance_class"] + # record the canonical workload identity consumed (one trace per T -> set of ids/checksums). + if canonical and loaded_workload_ids: + args.workload_id = identity.workload_id( + { + "members": [ + {"checksums": loaded_checksums[member], "workload_id": member} + for member in sorted(loaded_workload_ids) + ] + } + ) + args.workload_members = sorted(loaded_workload_ids) + args.workload_checksums = loaded_checksums + canonical_workload = bool(getattr(args, "workload_id", None)) + activation_identity = workload_contract.compute_activation_identity(args.seed, args.hidden) + # EPLB identity covers replica placement, not only counts. 
+ eplb_mapping_hash = None + if eplb_plan is not None: + eplb_mapping_hash = eplb.mapping_hash(eplb_plan) + anomaly_free = len(all_anomalies) == 0 + validity = { + "execution_status": "complete" if rows else "failed", + "semantic_correctness": ( + "pass" if rows and all(r["correctness"]["passed"] for r in rows) else "fail" + ), + "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent", + "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime", + "measurement_conformance": "conformant", # run_ep gate rejects nonconformant pre-run + "sampling_conformance": "conformant", # fixed-512-v1 gate rejects any other profile + "resource_conformance": resource_conformance, + "provenance_complete": provenance_complete, + # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above). + "anomaly_free": anomaly_free, + } + publication_status = _derive_publication_status(validity) + + shape = { # FIXED line identity (no T, no per-backend resource knobs) + "hidden": args.hidden, "topk": args.topk, "experts": args.experts, + "experts_per_rank": experts_per_rank, "dispatch_dtype": "bf16", + "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical, + # V2 is reserved for the PR #605 ElasticBuffer adapter; package versions never imply it. 
+ "kernel_gen": kernel_generation(backend), + "activation_profile": ACTIVATION_PROFILE, + "quant": { + "combine_input_dtype": "bf16", + "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"), + "combine_output_dtype": "bf16", "combine_quant_mode": "none", + "scale_layout": None, + }, + } + generated_at = args.timestamp or _dt.datetime.now().astimezone().isoformat() + realized_placement = getattr(args, "realized_placement", None) + nodes = ( + realized_placement["nodes"] + if realized_placement is not None + else int(os.environ.get("SLURM_NNODES", "1")) + ) + case_factors = { + "case": { + "backend": backend.name, + "canonical": canonical, + "eplb": bool(eplb_plan), + "ep": ep_size, + "experts": num_logical, + "gpus_per_node": args.gpus_per_node or ep_size, + "hidden": args.hidden, + "ladder": " ".join(map(str, ladder)), + "nodes": nodes, + "phase": args.phase, + "required_publication": args.required_publication or "diagnostic", + "routing": args.routing, + "samples_per_point": TIMED_SAMPLES_PER_POINT, + "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size), + "suite": args.suite or "manual", + "timing": f"{args.iters}:{args.trials}:{args.warmup}", + "topk": args.topk, + "warmup_semantics": WARMUP_SEMANTICS, + "workload": args.workload_name or "manual", + }, + "profile": identity.V1_CASE_PROFILE, + "sku": args.runner, + } + computed_case_id = identity.digest("case", case_factors) + if args.case_id and args.case_id != computed_case_id: + raise ValueError( + f"scheduled case ID does not match realized factors: {args.case_id} != {computed_case_id}" + ) + case_identifier = args.case_id or computed_case_id + git_run = getattr(args, "git_run", None) or {} + allocation_factors = { + "artifact": git_run.get("artifact"), + "execution_id": getattr(args, "allocation_execution_id", None), + "job": git_run.get("job"), + "repo": git_run.get("repo"), + "run_attempt": git_run.get("run_attempt"), + "run_id": git_run.get("run_id"), + "runner": 
args.runner, + "source_sha": git_run.get("source_sha"), + } + allocation_identifier = identity.allocation_id(allocation_factors) + try: + attempt_ordinal = int(os.environ.get("CX_ATTEMPT_ID", "1")) + except ValueError: + attempt_ordinal = 0 + if attempt_ordinal <= 0: + raise ValueError("CX_ATTEMPT_ID must be a positive integer") + attempt_identifier = identity.attempt_id( + allocation=allocation_identifier, case=case_identifier, ordinal=attempt_ordinal + ) + runtime_fingerprint = getattr(args, "runtime_fingerprint", None) or {} + implementation_contract = { + "kernel_generation": kernel_generation(backend), + "name": backend.name, + "provenance": _series_provenance(backend.backend_provenance), + "resource_profile": resource_profile, + } + public_config = contracts.public_series_config( + kernel_generation=implementation_contract["kernel_generation"], + provenance=backend.backend_provenance, + resource_profile=resource_profile, + resource_mode=args.resource_mode, + device_product=getattr(args, "runtime_device_product", None), + ) + series_factors = { + "backend": backend.name, + "implementation_contract_sha256": _sha256_json(implementation_contract), + "public_config_sha256": contracts.public_series_config_sha256(public_config), + "routing_control_sha256": contracts.routing_implementation_control_sha256( + implementation_contract + ), + "case_id": case_identifier, + "image_digest": getattr(args, "image_digest", None), + "runtime_fingerprint_sha256": _sha256_json(runtime_fingerprint), + "source_sha": git_run.get("source_sha"), + "squash_sha256": getattr(args, "squash_sha256", None), + "workload_id": getattr(args, "workload_id", None) or trace_sig, + } + series_identifier = identity.series_id(series_factors) + + sample_points = [] + for row in rows: + token_count = row["tokens_per_rank"] + + def sampled_component(trials): + return { + "availability": "measured" if trials else "unavailable", + "sample_count": sum(len(trial) for trial in trials), + "trials": trials if 
trials else None, + } + + sample_point = { + "components": { + "combine": sampled_component(comb_trials[token_count]), + "dispatch": sampled_component(disp_trials[token_count]), + "roundtrip": sampled_component(rt_trials[token_count]), + }, + "tokens_per_rank": token_count, + } + sample_sha256 = _sha256_json(sample_point) + point_identifier = identity.point_id( + series=series_identifier, tokens_per_rank=token_count + ) + evidence_identifier = identity.evidence_id( + point=point_identifier, + allocation=allocation_identifier, + attempt=attempt_identifier, + sample_sha256=sample_sha256, + ) + sample_point.update( + { + "evidence_id": evidence_identifier, + "point_id": point_identifier, + "sample_sha256": sample_sha256, + } + ) + sample_points.append(sample_point) + row.update({ + "evidence_id": evidence_identifier, + "point_id": point_identifier, + "sample_sha256": sample_sha256, + }) + + samples_path = args.out[:-5] + ".samples.json" if args.out.endswith(".json") else args.out + ".samples.json" + samples_document = { + "allocation_id": allocation_identifier, + "attempt_id": attempt_identifier, + "case_id": case_identifier, + "format": "collectivex.samples.v1", + "points": sample_points, + "sampling": { + "iterations_per_trial": args.iters, + "reduction": identity.V1_CASE_PROFILE["rank_reduction"], + "trials": args.trials, + }, + "schema_version": 1, + "series_id": series_identifier, + } + samples_payload = contracts.canonical_json_bytes(samples_document) + samples_sha256 = hashlib.sha256(samples_payload).hexdigest() + samples_bytes = len(samples_payload) + sample_artifact = { + "bytes": samples_bytes, + "format": "collectivex.samples.v1", + "path": os.path.basename(samples_path), + "sha256": samples_sha256, + } + headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2]) + eplb_record = ( + { + "enabled": True, + "imbalance_after": eplb_plan["imbalance_after"], + "imbalance_before": eplb_plan["imbalance_before"], + "mapping_hash": 
eplb_mapping_hash, + "max_replicas": eplb_plan["max_replicas"], + "num_logical_experts": num_logical, + "num_physical_experts": args.experts, + "num_redundant": args.experts - num_logical, + "planner": EPLB_PLANNER, + "reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK, + "replicated_experts": eplb_plan["replicated_experts"], + } + if eplb_plan + else { + "enabled": False, + "imbalance_after": None, + "imbalance_before": None, + "mapping_hash": None, + "max_replicas": None, + "num_logical_experts": num_logical, + "num_physical_experts": args.experts, + "num_redundant": 0, + "planner": None, + "reference_tokens_per_rank": None, + "replicated_experts": 0, + } + ) + doc = { + "format": "collectivex.ep.v1", + "schema_version": SCHEMA_VERSION, + "record_type": "case-attempt", + "generated_at": generated_at, + "identity": { + "allocation_factors": allocation_factors, + "allocation_id": allocation_identifier, + "attempt_id": attempt_identifier, + "attempt_ordinal": attempt_ordinal, + "case_factors": case_factors, + "case_id": case_identifier, + "series_factors": series_factors, + "series_id": series_identifier, + }, + "case": { + "attempt_ordinal": attempt_ordinal, + "backend": backend.name, + "eplb": eplb_record, + "ep_size": ep_size, + "mode": "normal", + "phase": args.phase, + "required_publication": args.required_publication or "diagnostic", + "resource_mode": "tuned", + "runner": args.runner, + "shape": shape, + "suite": args.suite or "manual", + "workload_name": args.workload_name or "manual", + }, + "workload": { + "activation_generator": ACTIVATION_GENERATOR, + "activation_identity": activation_identity, + "activation_profile": ACTIVATION_PROFILE, + "cross_rank_consistent": routing_consistent, + "manifest_checksums": getattr(args, "workload_checksums", None), + "members": getattr(args, "workload_members", None), + "routing_generator": ROUTING_GENERATOR, + "source": validity["workload_source"], + "trace_hashes": sorted(routing_hashes), + "trace_signature": 
trace_sig, + "workload_id": getattr(args, "workload_id", None), + }, + "measurement": { + "component_order_contract": COMPONENT_ORDER_CONTRACT, + "conditioning": { + "contract": CONDITIONING_CONTRACT, + "ladder": conditioning_ladder, + "roundtrips_per_shape": CONDITIONING_ROUNDS_PER_SHAPE, + }, + "contract": "layout-and-dispatch-v1", + "rows": rows, + "sampling": { + "contract": SAMPLING_CONTRACT, + "iterations_per_trial": args.iters, + "percentile_method": identity.V1_CASE_PROFILE["percentile_method"], + "reduction": identity.V1_CASE_PROFILE["rank_reduction"], + "samples_per_component": TIMED_SAMPLES_PER_POINT, + "trials": args.trials, + "warmup_iterations": args.warmup, + "warmup_semantics": WARMUP_SEMANTICS, + }, + "source_allocation": "even", + }, + "implementation": { + "kernel_generation": kernel_generation(backend), + "name": backend.name, + "provenance": backend.backend_provenance, + "resource_profile": resource_profile, + }, + "topology": { + "device_count": getattr(args, "runtime_device_count", None), + "device_product": getattr(args, "runtime_device_product", None), + "gpus_per_node": args.gpus_per_node or ep_size, + "nodes": nodes, + "placement": "packed", + "realized_placement": realized_placement, + "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size), + "topology_class": args.topology_class, + "transport": args.transport, + "world_size": world_size, + }, + "runtime_fingerprint": runtime_fingerprint, + "provenance": { + "command": getattr(args, "reproduction_command", ""), + "distributed_launcher": getattr(args, "distributed_launcher", None), + "git_run": getattr(args, "git_run", None), + "image": { + "arch": getattr(args, "image_arch", None), + "digest": getattr(args, "image_digest", "") or None, + "digest_verified": getattr(args, "image_digest_verified", False), + "reference": getattr(args, "image", "") or None, + "squash_sha256": getattr(args, "squash_sha256", None), + }, + "redaction": "sanitized-v1", + }, + 
"sample_artifact": sample_artifact, + "outcome": { + "publication_status": publication_status, + "reasons": [] if all_ok else ["semantic correctness or routing identity failed"], + "status": "success" if all_ok else "invalid", + "validity": validity, + }, + } + contracts.validate_raw_document(doc, samples_document) + _write_bytes_atomic(samples_path, samples_payload) + _write_json_atomic(args.out, doc) + dispatch_percentiles = headline["components"]["dispatch"]["percentiles_us"] + dispatch_p99 = dispatch_percentiles["p99"] if dispatch_percentiles else None + component_summary = (f"disp_p99={dispatch_p99:.1f}us " + if dispatch_p99 is not None + else "components=unavailable ") + print(f"{backend.name} ep-dispatch-combine [{args.phase}/normal/layout-and-dispatch-v1]: " + f"status={doc['outcome']['status']} {len(rows)} pts, routing_consistent={routing_consistent}, " + f"headline T={headline['tokens_per_rank']} {component_summary}" + f"-> {args.out}") + # A complete invalid document is still a successfully captured terminal outcome. Launchers + # inspect its status to fail the case without conflating it with an execution failure. + return 0 diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py new file mode 100644 index 0000000000..7f99990253 --- /dev/null +++ b/experimental/CollectiveX/tests/ep_mori.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""CollectiveX MoRI adapter for the v1 BF16 normal-mode workload.""" +from __future__ import annotations + +import os +from pathlib import Path +import re +import sys +import types + +# MoRI registers the whole symmetric heap at import time. 
+os.environ["MORI_SHMEM_HEAP_SIZE"] = "2G"
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - requires the benchmark image
+    print(f"ERROR: mori import failed: {exc!r}", file=sys.stderr)
+    raise
+
+
+def _project_local_metadata(torch_module, raw_expert_ids, raw_weights, rank, experts_per_rank):
+    """Mask received routing metadata down to the experts owned by ``rank``.
+
+    A rank owns the contiguous expert-id range
+    ``[rank * experts_per_rank, (rank + 1) * experts_per_rank)``.
+
+    Args:
+        torch_module: The ``torch`` module (passed in by the caller).
+        raw_expert_ids: Tensor of global expert ids as received.
+        raw_weights: Gate weights, elementwise-aligned with ``raw_expert_ids``.
+        rank: Rank whose expert range is kept.
+        experts_per_rank: Number of experts owned by each rank.
+
+    Returns:
+        A 3-tuple ``(expert_ids, weights, local_expert_ids)``: the ids with
+        non-local entries replaced by ``-1``, the weights with non-local
+        entries zeroed, and the flattened local entries rebased so the first
+        owned expert is ``0``.
+    """
+    local_start = rank * experts_per_rank
+    local = (raw_expert_ids >= local_start) & (
+        raw_expert_ids < local_start + experts_per_rank
+    )
+    expert_ids = torch_module.where(
+        local, raw_expert_ids, torch_module.full_like(raw_expert_ids, -1)
+    )
+    weights = torch_module.where(local, raw_weights, torch_module.zeros_like(raw_weights))
+    return expert_ids, weights, raw_expert_ids[local] - local_start
+
+
+def _mori_source_commit() -> str:
+    """Return the 40-hex commit recorded in the ``.git/HEAD`` above the ``mori`` module.
+
+    Walks the parent directories of the resolved ``mori.__file__`` and stops at
+    the first ``.git/HEAD`` that is a regular, non-symlink file of at most 128
+    bytes.
+
+    Raises:
+        RuntimeError: If that HEAD holds anything other than a bare 40-character
+            lowercase hex SHA, or if no such HEAD exists in any parent.
+    """
+    module_path = Path(mori.__file__).resolve()
+    for root in module_path.parents:
+        head = root / ".git" / "HEAD"
+        if not head.is_symlink() and head.is_file() and head.stat().st_size <= 128:
+            value = head.read_text(encoding="ascii").strip()
+            if re.fullmatch(r"[0-9a-f]{40}", value):
+                return value
+            raise RuntimeError("MoRI image source is not pinned to a detached commit")
+    raise RuntimeError("MoRI image source revision is unavailable")
+
+
+class MoRIBackend:
+    """CollectiveX adapter that drives ``mori.ops.EpDispatchCombineOp`` in BF16 normal mode.
+
+    The kernel flavour is chosen by the ``CX_MORI_KERNEL_TYPE`` environment
+    variable (``intranode``, the default, or ``asyncll``).
+    """
+
+    name = "mori"
+    # The timing harness re-runs dispatch + stage before every timed combine when this is true.
+    combine_needs_redispatch = True
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Select the kernel type, build the MoRI op, and record its provenance.
+
+        Args:
+            args: Parsed harness arguments; ``experts``, ``hidden`` and ``topk`` are read.
+            rank: This process's rank.
+            world_size: Number of ranks; also used as the EP size.
+            local_rank: Accepted for interface parity with the other backends; unused here.
+            device: CUDA/HIP device whose properties are queried.
+
+        Raises:
+            ValueError: If ``args.experts`` is not a multiple of ``world_size``.
+            RuntimeError: If ``CX_MORI_KERNEL_TYPE`` is unknown, AsyncLL is requested but
+                not exposed by the installed MoRI, the AsyncLL launch configuration is
+                not realized, or the installed MoRI commit differs from ``MORI_COMMIT``.
+        """
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+
+        self.ep_size = world_size
+        # Same guard as NCCLBackend: a non-multiple would be floored below and the
+        # trailing experts would be owned by no rank.
+        if args.experts % world_size:
+            raise ValueError(
+                f"experts({args.experts}) must be divisible by world_size({world_size})"
+            )
+        self.experts_per_rank = args.experts // self.ep_size
+        device_cus = torch.cuda.get_device_properties(device).multi_processor_count
+        self.block_num = self._block_target = 80
+        self._block_floored = False
+        self._tuned_source = "default-80"
+        self.dispatch_warps = 16
+        self.combine_warps = 8
+
+        # MI355X uses the direct intranode kernel. MI325X uses MoRI's split
+        # AsyncLL send/receive kernel as its normal-mode XGMI transport.
+        kernel_request = os.environ.get("CX_MORI_KERNEL_TYPE", "intranode").strip().lower()
+        self._kernel_type = None
+        self._kernel_type_label = "IntraNode"
+        self._async_ll = False
+        if kernel_request in ("asyncll", "async_ll", "async-ll"):
+            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
+            if kernel_enum is None or not hasattr(kernel_enum, "AsyncLL"):
+                raise RuntimeError(
+                    "CX_MORI_KERNEL_TYPE=asyncll requires "
+                    "EpDispatchCombineKernelType.AsyncLL"
+                )
+            self._kernel_type = kernel_enum.AsyncLL
+            self._kernel_type_label = "AsyncLL"
+            self._async_ll = True
+            self.block_num = self._block_target = 64
+            self.dispatch_warps = self.combine_warps = 8
+            self._tuned_source = "upstream-asyncll-64x8-external-input"
+        elif kernel_request not in ("intranode", "intra_node", "intra-node", ""):
+            raise RuntimeError(
+                f"unknown CX_MORI_KERNEL_TYPE={kernel_request!r} (expected intranode|asyncll)"
+            )
+        self.kernel_generation = "async-ll" if self._async_ll else "intranode"
+
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        self._cap = self.buffer_cap(args)
+        # Computed once so the value handed to MoRI and the value recorded in
+        # provenance cannot drift apart.
+        max_tokens = max(512, self._cap)
+        config_kwargs = {
+            "data_type": torch.bfloat16,
+            "rank": rank,
+            "world_size": world_size,
+            "hidden_dim": args.hidden,
+            "scale_dim": 0,
+            "scale_type_size": 1,
+            "max_token_type_size": torch.tensor([], dtype=torch.float32).element_size(),
+            "max_num_inp_token_per_rank": max_tokens,
+            "num_experts_per_rank": self.experts_per_rank,
+            "num_experts_per_token": args.topk,
+            "use_external_inp_buf": self._async_ll,
+            "quant_type": "none",
+        }
+        # NOTE(review): the nesting of the four assignments below was reconstructed
+        # from whitespace-collapsed text; they are placed under the AsyncLL guard to
+        # match the AsyncLL-only realization check that follows -- confirm.
+        if self._async_ll:
+            config_kwargs["kernel_type"] = self._kernel_type
+            config_kwargs["max_total_recv_tokens"] = 0
+            config_kwargs["block_num"] = self.block_num
+            config_kwargs["warp_num_per_block"] = self.dispatch_warps
+        self.config = mori.ops.EpDispatchCombineConfig(**config_kwargs)
+        if self._async_ll and (
+            self.config.block_num != self.block_num
+            or self.config.warp_num_per_block != self.dispatch_warps
+        ):
+            raise RuntimeError("MoRI AsyncLL launch configuration was not realized")
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+
+        expected_mori_commit = os.environ.get("MORI_COMMIT")
+        mori_commit = _mori_source_commit()
+        if expected_mori_commit and mori_commit != expected_mori_commit:
+            raise RuntimeError("MoRI image source revision differs from canonical provenance")
+        self.backend_provenance = {
+            "mori_commit": mori_commit,
+            "api": (
+                "mori.ops.EpDispatchCombineOp/external-input"
+                if self._async_ll
+                else "mori.ops.EpDispatchCombineOp/registered-input"
+            ),
+            "mode": "normal",
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "kernel_type": self._kernel_type_label,
+            "enable_sdma": os.environ.get("MORI_ENABLE_SDMA"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max_tokens,
+            "max_total_recv_tokens": config_kwargs.get("max_total_recv_tokens"),
+            "num_qps": 1,
+            "resource_mode": "tuned",
+            "block_num": self.block_num,
+            "block_num_target": self._block_target,
+            "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps,
+            "combine_warps": self.combine_warps,
+            "device_cus": device_cus,
+            "sm_fraction": None if self._async_ll else self.block_num / device_cus,
+            "tuned_source": self._tuned_source,
+        }
+
+    def buffer_cap(self, args):
+        """Return the fixed per-rank input-token capacity (always 512)."""
+        return 512
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one rank's routing slice into a problem namespace.
+
+        ``idx`` is cast to int32 and ``weights`` to float32; each is exposed under
+        two attribute names (``topk_idx``/``indices`` and ``topk_weights``/``weights``).
+        ``scales`` is an empty ``(T, 0)`` uint8 tensor on this backend's device.
+        """
+        indices = idx.to(torch.int32)
+        gate_weights = weights.to(torch.float32)
+        return types.SimpleNamespace(
+            T=T,
+            x=x,
+            topk_idx=indices,
+            topk_weights=gate_weights,
+            indices=indices,
+            weights=gate_weights,
+            scales=torch.empty((T, 0), dtype=torch.uint8, device=self.device),
+        )
+
+    def dispatch(self, p):
+        """Run MoRI dispatch for problem ``p`` and return a handle namespace.
+
+        In AsyncLL mode the separate ``dispatch_recv`` call is issued as well.
+        The handle carries the dispatch outputs, ``recv_num[0]`` as ``recv_num``,
+        and ``combine_input`` (the dispatch output converted to bfloat16).
+        """
+        dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num = (
+            self.op.dispatch(
+                p.x,
+                p.weights,
+                p.scales,
+                p.indices,
+                block_num=self.block_num,
+                warp_per_block=self.dispatch_warps,
+            )
+        )
+        if self._async_ll:
+            self.op.dispatch_recv(warp_per_block=self.dispatch_warps)
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output,
+            dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices,
+            recv_num=recv_num[0],
+            combine_input=dispatch_output.to(torch.bfloat16),
+        )
+
+    def stage(self, p, h):
+        """Copy the received rows into MoRI's registered combine-input buffer.
+
+        Requires ``p.recv_tokens`` to be a cached Python int within
+        ``[0, h.combine_input.size(0)]``. In AsyncLL mode nothing is copied.
+        Otherwise ``h.combine_input`` is rebound to the registered buffer.
+
+        Raises:
+            RuntimeError: If ``p.recv_tokens`` is missing, not an int, or out of range.
+        """
+        rows = getattr(p, "recv_tokens", None)
+        if not isinstance(rows, int) or rows < 0 or rows > h.combine_input.size(0):
+            raise RuntimeError("MoRI receive count was not validated before staging")
+        if self._async_ll:
+            return None
+        buffer = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1)
+        )
+        buffer[:rows, :].copy_(h.combine_input[:rows, :])
+        h.combine_input = buffer
+
+    def combine(self, p, h):
+        """Run MoRI combine on ``h.combine_input`` and return the first ``p.T`` rows.
+
+        AsyncLL passes the problem's own indices and issues ``combine_recv``;
+        the intranode path passes the indices returned by dispatch.
+        """
+        combine_indices = p.indices if self._async_ll else h.dispatch_indices
+        combined, _weights = self.op.combine(
+            h.combine_input,
+            None,
+            combine_indices,
+            block_num=self.block_num,
+            warp_per_block=self.combine_warps,
+        )
+        if self._async_ll:
+            self.op.combine_recv(warp_per_block=self.combine_warps)
+        return combined[:p.T]
+
+    def inspect_dispatch(self, p, h):
+        """Expose the received payload and rank-local routing metadata for handle ``h``.
+
+        Raises:
+            RuntimeError: If dispatch returned no gate weights, or the receive count is
+                negative or larger than the leading dimension of any dispatch tensor.
+        """
+        count = self.recv_tokens(h)
+        if h.dispatch_weights is None:
+            raise RuntimeError("MoRI dispatch did not expose gate weights")
+        if count < 0 or any(
+            tensor.ndim == 0 or count > tensor.size(0)
+            for tensor in (h.dispatch_output, h.dispatch_indices, h.dispatch_weights)
+        ):
+            raise RuntimeError("MoRI receive count exceeds dispatch metadata")
+        raw_expert_ids = h.dispatch_indices[:count].to(torch.int64)
+        expert_ids, weights, local_expert_ids = _project_local_metadata(
+            torch,
+            raw_expert_ids,
+            h.dispatch_weights[:count].to(torch.float32),
+            self.rank,
+            self.experts_per_rank,
+        )
+        return types.SimpleNamespace(
+            payload=h.dispatch_output[:count],
+            expert_ids=expert_ids,
+            weights=weights,
+            local_expert_counts=torch.bincount(
+                local_expert_ids, minlength=self.experts_per_rank
+            ),
+            ordering_contract="mori-global-topk-masked-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Stage ``transformed`` (converted to bfloat16) as the combine input, then combine."""
+        h.combine_input = transformed.to(torch.bfloat16)
+        self.stage(p, h)
+        return self.combine(p, h)
+
+    def recv_tokens(self, h):
+        """Return the receive count of handle ``h`` as a Python int (reads the tensor via ``.item()``)."""
+        return int(h.recv_num.item())
+
+    def finalize(self, rc):
+        """Attempt a barrier, flush stdio, then hard-exit with ``rc`` (1 if outside 0..255)."""
+        try:
+            dist.barrier()
+        except Exception:
+            # Deliberate best effort: the process exits below regardless of the barrier outcome.
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(rc if 0 <= rc <= 255 else 1)
diff --git a/experimental/CollectiveX/tests/ep_nccl.py b/experimental/CollectiveX/tests/ep_nccl.py
new file mode 100644
index 0000000000..327a4063f8
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_nccl.py
@@ -0,0 +1,177 @@
+"""CollectiveX NCCL all-to-all expert-parallel reference backend.
+
+The canonical "token-shuffle" EP built on torch.distributed's NCCL ``all_to_all_single``. Like the
+DeepEP-family APIs, dispatch sends one hidden-state copy to each distinct destination rank, even when
+multiple selected experts live on that rank. Combine reverses the shuffle and sums those rank copies.
+
+Why this exists alongside DeepEP/UCCL/MoRI: it is the portable collective reference baseline for the
+same rank-deduplicated payload and routing metadata. It keeps the library comparison anchored to the
+platform collective stack without claiming the custom fused kernels use the same transport algorithm.
+
+Scope: BF16, normal mode, layout-and-dispatch-v1. The timed dispatch includes layout, count exchange,
+payload, rank-masked expert indices, gate weights, and source-token metadata; combine returns only
+the activation payload. RCCL exposes the same API. The v1 AMD matrix uses this backend at EP8.
class NCCLBackend:
    """Reference expert-parallel backend built on ``dist.all_to_all_single``.

    Dispatch sends one hidden-state copy per (token, distinct destination
    rank) together with rank-masked expert indices, gate weights and packed
    source metadata. Combine reverses the shuffle and sums the per-rank
    copies without applying gate weights.
    """

    name = "nccl-ep"
    combine_needs_redispatch = False  # dispatch saves the permutation + splits
    combine_weight_semantics = "unweighted-rank-sum"

    def __init__(self, args, rank, world_size, local_rank, device):
        """Record the topology and build the provenance record.

        Args:
            args: run arguments; ``args.experts`` is read here and ``args`` is
                forwarded to ``_runtime_collective``.
            rank: this process's rank.
            world_size: number of ranks; must evenly divide ``args.experts``.
            local_rank: accepted for adapter-signature parity; unused here.
            device: device queried for its multiprocessor count.

        Raises:
            ValueError: if ``args.experts`` is not a multiple of ``world_size``.
            RuntimeError: propagated from ``_runtime_collective`` when the
                collective runtime identity is unavailable.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.experts = args.experts
        if args.experts % world_size:
            # The check is "world_size divides experts"; the message now says so.
            raise ValueError(
                f"experts({args.experts}) must be divisible by world_size({world_size})"
            )
        self.experts_per_rank = args.experts // world_size
        self.tolerance = 5e-2  # bf16 round-trip
        _library, _version = _runtime_collective(args, torch)
        self.kernel_generation = contracts.collective_kernel_generation(_library)
        self.backend_provenance = {
            "backend": f"{_library}-all2all",
            "backend_lineage": _library,
            "collective_library": _library,
            "nccl_version": _version,
            "transport": f"{_library}-all_to_all_single",
            "resource_mode": "tuned",
            "num_sms": None,
            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
            "tuned_source": "nccl-collective",
            "reference_semantics": "rank-deduplicated-payload-plus-routing-metadata-v2",
            "routing_metadata": "expert-index-gate-weight-source-token",
        }

    def buffer_cap(self, args):
        """Return ``None``: this backend has no fixed buffer capacity."""
        return None  # no fixed pre-allocated buffer; all-to-all sizes itself per step

    def make_problem(self, T, idx, weights, x):
        """Bundle one rank's routing slice into a problem namespace.

        ``idx`` is cast to int64 and ``weights`` to float32; ``x`` is passed
        through unchanged.
        """
        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared routing-trace slice.
        return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
                                     topk_weights=weights.to(torch.float32), layout=None)

    def dispatch(self, p):
        """Shuffle rank-deduplicated token copies to their destination ranks.

        Returns a namespace holding the received payload and metadata plus the
        permutation (``order``, ``flat_token``) and split lists
        (``send_counts``, ``recv_counts``) that ``combine`` needs to reverse
        the shuffle.
        """
        ws = self.world_size
        x = p.x  # [T, H] bf16
        idx = p.topk_idx  # [T, topk]
        T, H = int(x.shape[0]), int(x.shape[1])
        dev = x.device
        # DeepEP dispatches one token per destination rank, not one copy per expert. Build the same
        # rank-deduplicated routing map so NCCL traffic and combine semantics are comparable.
        destinations = (idx // self.experts_per_rank).clamp_(0, ws - 1)
        present = torch.zeros((T, ws), dtype=torch.bool, device=dev)
        present.scatter_(1, destinations, True)
        flat_token, flat_dest = present.nonzero(as_tuple=True)
        # Group rank copies by destination (stable -> deterministic, invertible permutation).
        order = torch.argsort(flat_dest, stable=True)
        ordered_token = flat_token.index_select(0, order)
        ordered_dest = flat_dest.index_select(0, order)
        send_counts = torch.bincount(flat_dest, minlength=ws)  # [ws]
        send_x = x.index_select(0, ordered_token).contiguous()
        send_topk_idx = idx.index_select(0, ordered_token).contiguous()
        # Keep only the experts hosted by each copy's destination rank, renumbered
        # to rank-local ids; every other top-k slot becomes -1 with weight 0.
        expert_start = ordered_dest.unsqueeze(1) * self.experts_per_rank
        local_mask = ((send_topk_idx >= expert_start)
                      & (send_topk_idx < expert_start + self.experts_per_rank))
        send_topk_idx = torch.where(
            local_mask, send_topk_idx - expert_start, torch.full_like(send_topk_idx, -1)
        )
        send_topk_weights = p.topk_weights.index_select(0, ordered_token).contiguous()
        send_topk_weights.masked_fill_(~local_mask, 0)
        # Pack (source rank, source token) into one int64: rank in the high 32 bits.
        send_src_metadata = (ordered_token.to(torch.int64) | (self.rank << 32)).contiguous()
        # Exchange per-rank counts so every rank can size its receive buffer.
        recv_counts = torch.empty_like(send_counts)
        dist.all_to_all_single(recv_counts, send_counts)
        sc = send_counts.tolist()
        rc = recv_counts.tolist()
        total_recv = int(sum(rc))
        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
        recv_topk_idx = torch.empty((total_recv, int(idx.shape[1])), dtype=idx.dtype, device=dev)
        recv_topk_weights = torch.empty((total_recv, int(idx.shape[1])),
                                        dtype=p.topk_weights.dtype, device=dev)
        recv_src_metadata = torch.empty((total_recv,), dtype=torch.int64, device=dev)
        # Dispatch the uneven per-rank splits over the configured collective transport.
        dist.all_to_all_single(recv_x, send_x, rc, sc)
        dist.all_to_all_single(recv_topk_idx, send_topk_idx, rc, sc)
        dist.all_to_all_single(recv_topk_weights, send_topk_weights, rc, sc)
        dist.all_to_all_single(recv_src_metadata, send_src_metadata, rc, sc)
        return types.SimpleNamespace(
            recv_x=recv_x, combine_input=None, order=order, flat_token=flat_token,
            recv_topk_idx=recv_topk_idx,
            recv_topk_weights=recv_topk_weights, recv_src_rank=recv_src_metadata >> 32,
            recv_src_token=recv_src_metadata & ((1 << 32) - 1), send_counts=sc, recv_counts=rc,
            T=T, H=H, topk=int(idx.shape[1]), total_recv=total_recv)

    def stage(self, p, h):
        """Use the received tokens unchanged as the combine input."""
        # No expert compute: the expert "output" is the received tokens as-is (the round-trip identity).
        h.combine_input = h.recv_x
        return None

    def combine(self, p, h):
        """Return each token's summed per-rank copies, cast to ``p.x.dtype``.

        The copies are accumulated in float32 before the final cast.
        """
        # Reverse all-to-all: ship expert outputs back to their origin ranks (swap the split lists).
        send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=h.combine_input.dtype,
                                device=h.combine_input.device)
        dist.all_to_all_single(send_back, h.combine_input.contiguous(),
                               h.send_counts, h.recv_counts)
        # send_back is in send (sorted) order; invert the argsort to token-copy order.
        copies = torch.empty_like(send_back)
        copies[h.order] = send_back
        # Sum one copy per destination rank under this reference's explicit unweighted contract.
        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
        out.index_add_(0, h.flat_token, copies.float())
        return out.to(p.x.dtype)

    def inspect_dispatch(self, p, h):
        """Expose the received payload and routing metadata for verification.

        Rank-local expert ids are mapped back to global ids by adding
        ``rank * experts_per_rank``; masked slots (-1) stay -1 and get weight 0.
        """
        valid = h.recv_topk_idx >= 0
        expert_ids = torch.where(
            valid,
            h.recv_topk_idx + self.rank * self.experts_per_rank,
            h.recv_topk_idx,
        )
        return types.SimpleNamespace(
            payload=h.recv_x,
            expert_ids=expert_ids,
            weights=h.recv_topk_weights.masked_fill(~valid, 0),
            local_expert_counts=torch.bincount(
                h.recv_topk_idx[valid], minlength=self.experts_per_rank
            ),
            ordering_contract="source-rank-major-stable-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine caller-supplied expert outputs instead of the raw payload."""
        h.combine_input = transformed.to(h.recv_x.dtype)
        return self.combine(p, h)

    def recv_tokens(self, h):
        """Return the number of token copies this rank received."""
        return int(h.total_recv)

    def finalize(self, rc):
        """Tear down the process group best-effort and return ``rc`` unchanged."""
        try:
            dist.barrier()
            dist.destroy_process_group()
        except Exception:
            # Deliberate best effort: teardown failures must not replace the run's rc.
            pass
        return rc
def _is_uccl_runtime_payload(name: str) -> bool:
    """Return True when *name* is a UCCL runtime file worth fingerprinting.

    A path qualifies when its first component is ``uccl`` or ``uccl.libs``,
    no component is ``__pycache__``, and its suffix is not ``.pyc``. A path
    with no components (such as the empty string) never qualifies.
    """
    candidate = PurePosixPath(name)
    components = candidate.parts
    if not components:
        return False
    if components[0] not in {"uccl", "uccl.libs"}:
        return False
    if "__pycache__" in components:
        return False
    return candidate.suffix != ".pyc"
def _uccl_build_evidence(
    version: str, dependency_versions: dict[str, str]
) -> list[dict[str, str]]:
    """Collect content-manifest evidence for the installed UCCL stack.

    The returned list holds, in order: the ``uccl`` distribution's runtime
    payload files, the ``*.py`` files under the ``uccl_deepep`` package
    directory, the ``intervaltree`` and ``sortedcontainers`` distributions,
    and the mapped CUDA runtime library.

    Args:
        version: UCCL version string, used only in the evidence name.
        dependency_versions: versions keyed by ``intervaltree``,
            ``sortedcontainers`` and ``nvidia-cuda-runtime-cu12``.
    """
    uccl_dist = metadata.distribution("uccl")
    payload_files = []
    for entry in uccl_dist.files or ():
        logical_name = entry.as_posix()
        if not _is_uccl_runtime_payload(logical_name):
            continue
        located = uccl_dist.locate_file(entry)
        if Path(located).is_file():
            payload_files.append((logical_name, located))

    package_dir = Path(uccl_deepep.__file__).resolve().parent
    wrapper_sources = []
    for source in package_dir.rglob("*.py"):
        if source.is_file():
            wrapper_sources.append((source.relative_to(package_dir).as_posix(), source))

    evidence = [
        contracts.content_manifest_evidence(
            role="uccl-distribution",
            name=f"uccl-{version}",
            files=payload_files,
        ),
        contracts.content_manifest_evidence(
            role="uccl-wrapper",
            name="uccl-deepep-wrapper",
            files=wrapper_sources,
        ),
    ]
    for package in ("intervaltree", "sortedcontainers"):
        evidence.append(_python_dependency_evidence(package, dependency_versions[package]))
    evidence.append(
        _loaded_libcudart_evidence(dependency_versions["nvidia-cuda-runtime-cu12"])
    )
    return evidence
class UCCLBackend:
    """CollectiveX adapter that drives UCCL's DeepEP-style ``Buffer`` in normal mode.

    The constructor creates the communication buffer, pins the SM count, and
    records provenance after checking that every rank reports identical
    installed-content evidence.
    """

    name = "uccl"
    combine_needs_redispatch = False
    combine_weight_semantics = "unweighted-rank-sum"
    def __init__(self, args, rank, world_size, local_rank, device):
        """Create the UCCL buffer and build the provenance record.

        ``local_rank`` is accepted but not used in this constructor.

        Raises:
            RuntimeError: if ``Buffer.set_num_sms`` fails, if the SM count read
                back differs from the requested one, or (from the helpers) if
                dependency versions or cross-rank evidence do not match.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = "normal"

        self.group = dist.group.WORLD
        device_sms = torch.cuda.get_device_properties(device).multi_processor_count
        num_nvl_bytes = 4 * 1024 * 1024 * 1024  # 4 GiB
        # NOTE(review): the third positional argument (0) is presumably the RDMA
        # byte budget of the DeepEP-style Buffer signature — confirm against uccl_deepep.
        self.buffer = Buffer(self.group, num_nvl_bytes, 0)

        # Prefer the class-level Buffer.num_sms when it exists; args.num_sms is only
        # the fallback (it is still evaluated eagerly, so args must define it).
        num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
        try:
            Buffer.set_num_sms(num_sms)
        except Exception as exc:  # pragma: no cover - version dependent
            raise RuntimeError(
                f"UCCL did not apply requested num_sms={num_sms}: {exc!r}"
            ) from exc
        # Read the value back so a silently ignored setter is detected.
        applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
        if applied_num_sms != num_sms:
            raise RuntimeError(
                f"UCCL num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
            )

        version = _uccl_version()
        dependency_versions = _uccl_dependency_versions()
        loaded_libraries = _uccl_build_evidence(version, dependency_versions)
        # Collective call: every rank must reach this point with the same evidence.
        _require_cross_rank_equal(loaded_libraries, "installed content identities")
        self.backend_provenance = {
            "uccl_version": version,
            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{version}",
            "uccl_wrapper_commit": os.environ.get("UCCL_WRAPPER_COMMIT"),
            "backend_lineage": "uccl",
            "uccl_dependency_versions": dependency_versions,
            "loaded_libraries": loaded_libraries,
            "mode": "normal",
            "dispatch_dtype": "bf16",
            "combine_dtype": "bf16",
            "resource_mode": "tuned",
            "requested_num_sms": num_sms,
            "num_sms": applied_num_sms,
            "device_sms": device_sms,
            "sm_fraction": applied_num_sms / device_sms,
            "tuned_source": "uccl-default-num_sms",
            "num_nvl_bytes": num_nvl_bytes,
        }

    def buffer_cap(self, args):
        """Return ``None``; this adapter reports no buffer capacity limit."""
        return None

    def make_problem(self, T, idx, weights, x):
        """Bundle one routing slice, casting ``idx`` to int64 and ``weights`` to float32."""
        return types.SimpleNamespace(
            T=T,
            x=x,
            topk_idx=idx.to(torch.int64),
            topk_weights=weights.to(torch.float32),
        )

    def dispatch(self, p):
        """Compute the dispatch layout, then dispatch tokens through the buffer.

        Returns a namespace with the received payload, top-k indices and
        weights, the receive counts, and the handle that ``combine`` needs.
        """
        (
            num_tokens_per_rank,
            _,
            num_tokens_per_expert,
            is_token_in_rank,
            _,
        ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
        recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
            p.x,
            topk_idx=p.topk_idx,
            topk_weights=p.topk_weights,
            num_tokens_per_rank=num_tokens_per_rank,
            is_token_in_rank=is_token_in_rank,
            num_tokens_per_expert=num_tokens_per_expert,
        )
        return types.SimpleNamespace(
            recv_x=recv_x,
            recv_topk_idx=recv_topk_idx,
            recv_topk_weights=recv_topk_weights,
            recv_counts=recv_counts,
            handle=handle,
        )

    def stage(self, p, h):
        """Use the received tokens unchanged as the combine input."""
        h.combine_input = h.recv_x

    def combine(self, p, h):
        """Combine ``h.combine_input`` through the buffer and return the result tensor."""
        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle)
        return combined_x

    def inspect_dispatch(self, p, h):
        """Expose the received payload and routing metadata for verification.

        Non-negative received expert ids are shifted by
        ``rank * (experts // world_size)``; negative ids are kept as-is and
        their weights are zeroed.
        """
        valid = h.recv_topk_idx >= 0
        expert_ids = torch.where(
            valid,
            h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
            h.recv_topk_idx,
        )
        return types.SimpleNamespace(
            payload=h.recv_x,
            expert_ids=expert_ids,
            weights=h.recv_topk_weights.masked_fill(~valid, 0),
            # presumably recv_counts is a per-local-expert list of counts — verify against uccl_deepep
            local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
            ordering_contract="source-rank-major-stable-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine caller-supplied expert outputs, cast to the received payload's dtype."""
        combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle)
        return combined

    def recv_tokens(self, h):
        """Return the number of rows in the received payload."""
        return int(h.recv_x.shape[0])

    def finalize(self, rc):
        """Barrier best-effort, flush stdio, and hard-exit with ``rc`` (1 if outside 0..255)."""
        # UCCL's proxy teardown can crash after results are written; preserve the real rc.
        try:
            dist.barrier()
        except Exception:
            pass
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(rc if 0 <= rc <= 255 else 1)
def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
    """Build the EPLB replication and placement plan for a per-expert load vector.

    Args:
        logical_load: list[float] of length num_logical (token-copies per
            logical expert).
        num_physical: total physical expert slots; must be >= num_logical and a
            multiple of ep_size.
        ep_size: number of expert-parallel ranks; must be positive and divide
            both num_logical and num_physical.

    Returns:
        A dict of pure-Python values: the replica count per logical expert,
        the rank-major ``phys2log`` / ``rank_of_phys`` / ``log2phys`` mappings,
        and the per-rank load and max/mean imbalance before and after.

    Raises:
        AssertionError: if the arguments violate the constraints above.
    """
    num_logical = len(logical_load)
    # Explicit raises instead of ``assert`` statements so the checks survive
    # ``python -O``; AssertionError is kept so existing callers see the same type.
    if ep_size <= 0:
        raise AssertionError("ep_size must be positive")
    if num_logical == 0:
        raise AssertionError("logical_load must not be empty")
    if num_physical < num_logical:
        raise AssertionError("num_physical must be >= num_logical")
    if num_physical % ep_size != 0:
        raise AssertionError("num_physical must be divisible by ep_size")
    if num_logical % ep_size != 0:
        raise AssertionError("num_logical must be divisible by ep_size")
    spp = num_physical // ep_size  # physical slots per rank (fixed)

    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
    #    Ties go to the lowest expert id; the plan (and its mapping hash) depends on that.
    replicas = [1] * num_logical
    for _ in range(num_physical - num_logical):
        best, best_lps = 0, -1.0
        for e in range(num_logical):
            lps = logical_load[e] / replicas[e]
            if lps > best_lps:
                best, best_lps = e, lps
        replicas[best] += 1

    # 2) Slots = (per-replica load, logical expert), one per replica.
    slots = []
    for e in range(num_logical):
        lps = logical_load[e] / replicas[e]
        slots.extend((lps, e) for _ in range(replicas[e]))

    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
    slots.sort(reverse=True)
    rank_slots = [[] for _ in range(ep_size)]
    rank_load = [0.0] * ep_size
    for lps, e in slots:
        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
                key=lambda r: rank_load[r])
        rank_slots[r].append(e)
        rank_load[r] += lps

    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
    phys2log, rank_of_phys = [], []
    for r in range(ep_size):
        for e in rank_slots[r]:
            phys2log.append(e)
            rank_of_phys.append(r)
    log2phys = [[] for _ in range(num_logical)]
    for pid, e in enumerate(phys2log):
        log2phys[e].append(pid)

    before = _contiguous_rank_load(logical_load, ep_size)
    total = sum(logical_load) or 1.0  # avoid dividing by zero on an all-zero load
    mean = total / ep_size
    return {
        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
        "rank_load_after": rank_load, "rank_load_before": before,
        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
        "replicated_experts": sum(1 for r in replicas if r > 1),
    }
if __name__ == "__main__":
    # Self-test on a synthetic zipf load (popularity ∝ 1/(e+1)). The planner gates
    # run without torch; the remap gates run only when torch can be imported.
    import sys
    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
    plan = build_plan(load, nphys, EP)
    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
          f"(hottest expert 0 replicas={plan['replicas'][0]})")
    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
    print(f"per-rank load AFTER  (EPLB):       {[round(x,3) for x in plan['rank_load_after']]}")
    print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")

    # Planner gates: every logical expert placed, slot totals consistent.
    for expert in range(NUM_LOGICAL):
        assert plan["replicas"][expert] >= 1
    assert sum(plan["replicas"]) == nphys
    assert len(plan["phys2log"]) == nphys
    for expert in range(NUM_LOGICAL):
        assert len(plan["log2phys"][expert]) == plan["replicas"][expert]
    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"

    # Remap gates (torch only): distinct physical top-k and balanced receive load
    # on a sampled zipf trace.
    try:
        import torch
        g = torch.Generator().manual_seed(0)
        p = torch.tensor(load)
        p = (p / p.sum()).expand(4096, NUM_LOGICAL)
        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
        idx_p = remap_idx(idx_l, plan)
        assert idx_p.shape == idx_l.shape
        # top-k physical ids distinct per token
        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
        spp = plan["slots_per_rank"]
        per_log = NUM_LOGICAL // EP
        recv_before = [0] * EP
        recv_after = [0] * EP
        for logical_row in idx_l.tolist():
            for expert in logical_row:
                recv_before[expert // per_log] += 1
        for physical_row in idx_p.tolist():
            for pid in physical_row:
                recv_after[pid // spp] += 1
        ib = max(recv_before) / (sum(recv_before) / EP)
        ia = max(recv_after) / (sum(recv_after) / EP)
        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
        print("remap self-test: OK")
    except ImportError:
        print("(torch absent — skipped remap self-test; planner gates passed)")
    print("EPLB self-test: PASS")
    sys.exit(0)
def resolve_manifest(name):
    """Look a workload name up in configs/workloads.yaml and return (hidden, topk, experts).

    Searches the ``synthetic`` section, then ``model_derived``; the expert count
    is ``experts`` or, when that key is absent, ``routed_experts``.
    Pure PyYAML + stdlib.

    Raises:
        SystemExit: if the name is in neither section (the message lists the
            known names), if the entry lacks hidden/topk/experts, or if the
            file's top level is not a mapping.
        OSError: if configs/workloads.yaml cannot be opened.
    """
    import yaml
    path = os.path.join(_REPO, "configs", "workloads.yaml")
    with open(path, encoding="utf-8") as handle:
        cfg = yaml.safe_load(handle)
    # An empty or comment-only YAML document loads as None; treat it as "no
    # workloads" so the caller gets the normal unknown-name message.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        raise SystemExit(f"{path} must contain a mapping of workload sections")
    known = []
    for section in ("synthetic", "model_derived"):
        sec = cfg.get(section) or {}
        known += list(sec)
        m = sec.get(name)
        if m is None:
            continue
        experts = m.get("experts", m.get("routed_experts"))
        if m.get("hidden") is None or m.get("topk") is None or experts is None:
            raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
        return int(m["hidden"]), int(m["topk"]), int(experts)
    raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+ base_h, base_t, base_e = (7168, 8, 256) + if a.workload: + base_h, base_t, base_e = resolve_manifest(a.workload) + hidden = a.hidden if a.hidden is not None else base_h + topk = a.topk if a.topk is not None else base_t + experts = a.experts if a.experts is not None else base_e + + if not a.id_only and not a.out_dir: + ap.error("--out-dir is required unless --id-only") + + raw_ladder = [int(token) for token in a.tokens_ladder.replace(",", " ").split()] + if (a.ep <= 0 or min(hidden, topk, experts) <= 0 or topk > experts or experts % a.ep + or not raw_ladder or any(token <= 0 for token in raw_ladder) + or len(raw_ladder) != len(set(raw_ladder))): + ap.error("shape, EP, and token ladder must be positive, divisible, and unique") + ladder = sorted(raw_ladder) + epr = experts // a.ep + label = f"workload={a.workload} " if a.workload else "" + + if a.id_only: + # The stdlib counter generator derives the same content-bound ID on every runtime. + made = [] + for T in ladder: + gt = T * a.ep + wid = wl.compute_workload_id(a.routing, hidden, topk, experts, a.ep, gt, a.seed) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}") + print(f"{label}id-only: {len(made)} workload_id(s) " + f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})") + return 0 + + os.makedirs(a.out_dir, exist_ok=True) + made = [] + for T in ladder: + gt = T * a.ep + idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr) + wid = wl.save_workload(a.out_dir, idx, w, man) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} " + f"(trace sha {man['checksums']['trace'][:12]})") + print(f"{label}wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py new file mode 100644 
index 0000000000..6065a06e43 --- /dev/null +++ b/experimental/CollectiveX/tests/routing.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +"""CollectiveX — deterministic, platform-independent MoE routing trace. + +Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated +ONCE from a fixed seed over the *global* token batch, indexed by global token id, and +is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k). +Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations +are per-rank (same rank ⇒ same x on any platform), so a given global token id has +identical activation everywhere without materializing a global activation tensor. + +The v1 suite keeps two routing distributions: + + * uniform — top-k distinct experts drawn uniformly per token. The DEFAULT. + Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈ + 8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson. + * zipf — expert popularity proportional to 1/rank, producing expert/rank load skew. + +Always publish the realized fan-out so the workload is never misread again +(`routing_stats`). 
def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
    """Return one rank's rows of the global routing tensors.

    Rows ``[rank * tokens_per_rank, (rank + 1) * tokens_per_rank)`` are taken
    from both ``idx`` and ``weights`` and returned as contiguous tensors, in
    that order.
    """
    start = rank * tokens_per_rank
    # One slice object keeps the index and weight windows in lockstep.
    window = slice(start, start + tokens_per_rank)
    local_idx = idx[window].contiguous()
    local_weights = weights[window].contiguous()
    return local_idx, local_weights
def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
                     gpus_per_node: int, scale_up_domain: int | None = None) -> dict:
    """Locality of rank-deduplicated payload copies under packed placement.

    Each token contributes one payload copy per distinct destination rank, so
    two experts on the same rank count once.

    Args:
        idx: 2-D integer tensor of expert IDs, one row per global token.
        experts_per_rank: Divisor that maps an expert ID to its destination rank.
        ep_size: Number of EP ranks; destination and source ranks are clamped
            to ``ep_size - 1``.
        tokens_per_rank: Divisor that maps a global token row to its source
            rank (values below 1 are treated as 1).
        gpus_per_node: Ranks per node, used to group ranks into nodes.
        scale_up_domain: Ranks per scale-up domain. ``None`` (or 0) means a
            single domain of ``gpus_per_node * ep_size`` ranks.

    Returns:
        A dict with the placement label, the five locality fractions, the
        ``gpus_per_node`` and ``scale_up_domain`` values used, and ``copies``,
        the number of (token, destination-rank) pairs. All fractions are 0.0
        when there are no copies, so the result stays JSON-serializable.
    """
    global_tokens = idx.shape[0]
    assignments = (idx // experts_per_rank).clamp(max=ep_size - 1)
    # Allocate the one-hot scratch on idx's device so scatter_ never mixes devices.
    destinations = torch.zeros(
        (global_tokens, ep_size), dtype=torch.bool, device=idx.device
    )
    destinations.scatter_(1, assignments, True)
    token, dest = destinations.nonzero(as_tuple=True)
    src = (token // max(1, tokens_per_rank)).clamp(max=ep_size - 1)
    sud = scale_up_domain or (gpus_per_node * ep_size)  # default: all one domain
    # Packed placement: logical rank r is physical rank r, so no remap is needed.
    local = dest == src
    same_node = (dest // gpus_per_node) == (src // gpus_per_node)
    same_dom = (dest // sud) == (src // sud)
    copies = dest.numel()

    def _fraction(mask) -> float:
        # The mean of an empty tensor is NaN; report 0.0 when nothing was routed.
        return float(mask.float().mean()) if copies else 0.0

    return {
        "placement": "packed",
        "local_rank_fraction": _fraction(local),
        "same_node_fraction": _fraction(same_node),
        "same_scaleup_domain_fraction": _fraction(same_dom),
        "cross_node_fraction": _fraction(~same_node),
        "cross_domain_fraction": _fraction(~same_dom),
        "gpus_per_node": gpus_per_node,
        "scale_up_domain": sud, "copies": int(copies),
    }
+ empty_expert_count = int((load == 0).sum()) + empty_rank_count = int((payload_load == 0).sum()) + # SHA-256 workload identity over both topk_idx and gate weights: a chart + # point's routing is provably identical across SKUs only if both hashes match. + idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes() + idx_hash = hashlib.sha256(idx_bytes).hexdigest() + if weights is not None: + w_bytes = weights.to(torch.float32).cpu().numpy().tobytes() + w_hash = hashlib.sha256(w_bytes).hexdigest() + routing_hash = hashlib.sha256(idx_bytes + w_bytes).hexdigest() + else: + w_hash, routing_hash = None, idx_hash + return { + "fanout_mean": float(fanout.float().mean()), + "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()), + "fanout_hist": hist, # index k-1 = #tokens with fan-out k + "expert_assignments_per_rank": [int(x) for x in assignment_load.tolist()], + "payload_copies_per_rank": [int(x) for x in payload_load.tolist()], + "routed_copies": int(fanout.sum()), # total (token, dest-rank) pairs + "expert_load_min": int(load.min()), "expert_load_max": int(load.max()), + "expert_load_mean": float(load.mean()), "expert_load_cv": expert_load_cv, + "expert_assignment_rank_cv": assignment_rank_cv, + "payload_rank_cv": payload_rank_cv, "hotspot_ratio": hotspot_ratio, + "empty_expert_count": empty_expert_count, "empty_rank_count": empty_rank_count, + "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash, + } + + +# --------------------------------------------------------------------------- self-test +if __name__ == "__main__": + import sys + E, TOPK, EPR, GT = 256, 8, 32, 4096 + ui, _ = build_global_routing(GT, E, TOPK, "uniform", 67) + zi, _ = build_global_routing(GT, E, TOPK, "zipf", 67) + assert all(len(set(row.tolist())) == TOPK for row in ui[:16]) + uniform, zipf = routing_stats(ui, E, EPR), routing_stats(zi, E, EPR) + assert uniform["hotspot_ratio"] < zipf["hotspot_ratio"] + dev = torch.device("cpu") + first = rank_activations(8, 256, 67, 0, 
dev, dtype=torch.float32) + second = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32) + assert torch.equal(first, second) and torch.isfinite(first).all() + print("routing self-test: PASS") + sys.exit(0) diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py new file mode 100644 index 0000000000..cf019af28f --- /dev/null +++ b/experimental/CollectiveX/tests/run_ep.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +"""CollectiveX v1 EP benchmark entrypoint for torchrun or rank environments.""" + +from __future__ import annotations + +import argparse +import ctypes +import json +import os +import platform +import re +import shlex +import socket +import subprocess +import sys + +# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under +# torchrun (it executes the file as __main__, not as a package). +HERE = os.path.dirname(os.path.abspath(__file__)) +sys.path[:0] = [HERE, os.path.dirname(HERE)] + +import ep_harness # noqa: E402 (stdlib-only; safe before torch) +import identity # noqa: E402 + + +def _numeric_version(command: list[str]) -> str | None: + try: + result = subprocess.run( + command, capture_output=True, check=False, text=True, timeout=10 + ) + except (OSError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + match = re.search(r"\b[0-9]+(?:\.[0-9]+){1,3}\b", result.stdout) + return match.group(0) if match else None + + +def _loaded_collective_version() -> str | None: + try: + with open("/proc/self/maps", encoding="utf-8") as handle: + paths = { + os.path.realpath(line.rstrip().split()[-1]) + for line in handle + if any(name in line for name in ("libnccl.so", "librccl.so")) + and os.path.isfile(line.rstrip().split()[-1]) + } + if len(paths) != 1: + return None + version = ctypes.c_int() + library = ctypes.CDLL(paths.pop()) + if library.ncclGetVersion(ctypes.byref(version)) != 0: + return None + return ep_harness.format_collective_version(version.value) + 
def _runtime_fingerprint(
    torch, device, *, machine: str, vendor: str, arch: str
) -> dict:
    """Return strict runtime facts without hosts, addresses, UUIDs, or paths.

    ``vendor == "nvidia"`` selects the CUDA/NCCL description and queries the
    driver through ``nvidia-smi``; any other vendor selects HIP/RCCL and
    queries ``rocm-smi``. Device facts are read from
    ``torch.cuda.get_device_properties(device)``; ``machine``, ``vendor`` and
    ``arch`` are recorded as passed in.
    """
    properties = torch.cuda.get_device_properties(device)
    if vendor == "nvidia":
        driver = _numeric_version(
            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
        )
        runtime_kind, collective_kind = "cuda", "nccl"
        runtime_version = torch.version.cuda
    else:
        driver = _numeric_version(["rocm-smi", "--showdriverversion"])
        runtime_kind, collective_kind = "hip", "rccl"
        runtime_version = torch.version.hip

    accelerator_runtime = {"kind": runtime_kind, "version": runtime_version}
    collective_library = {
        "kind": collective_kind,
        "version": _loaded_collective_version(),
    }
    device_facts = {
        "arch": arch,
        "compute_units": int(properties.multi_processor_count),
        "memory_bytes": int(properties.total_memory),
        "product": torch.cuda.get_device_name(device),
        "warp_size": int(properties.warp_size),
    }
    framework = {"kind": "torch", "version": str(torch.__version__)}
    return {
        "accelerator_runtime": accelerator_runtime,
        "collective_library": collective_library,
        "device": device_facts,
        "driver_version": driver,
        "framework": framework,
        "machine": machine,
        "python_version": platform.python_version(),
        "vendor": vendor,
    }
records: + if not isinstance(host, str) or not host or type(local_rank) is not int: + raise ValueError("realized placement record has invalid types") + by_host.setdefault(host, []).append(local_rank) + + counts = sorted(len(local_ranks) for local_ranks in by_host.values()) + complete_local_ranks = all( + sorted(local_ranks) == list(range(expected_gpus_per_node)) + for local_ranks in by_host.values() + ) + unique_pairs = len(set(records)) == len(records) + if len(by_host) != expected_nodes: + raise ValueError( + f"realized node count {len(by_host)} differs from requested {expected_nodes}" + ) + if counts != [expected_gpus_per_node] * expected_nodes: + raise ValueError("realized ranks per node differ from requested GPUs per node") + if not complete_local_ranks or not unique_pairs: + raise ValueError("realized local ranks are incomplete or duplicated") + return { + "gpus_per_node": expected_gpus_per_node, + "nodes": expected_nodes, + "ranks_per_node": expected_gpus_per_node, + "unique_local_ranks": True, + "valid": True, + } + + +def _common_runtime_fingerprint(records: list[dict]) -> dict: + """Return the shared sanitized fingerprint, rejecting heterogeneous ranks.""" + if not records: + raise ValueError("runtime fingerprint evidence is empty") + canonical = { + json.dumps(record, allow_nan=False, sort_keys=True, separators=(",", ":")) + for record in records + } + if len(canonical) != 1: + raise ValueError("runtime fingerprint differs across distributed ranks") + return records[0] + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep") + ap.add_argument( + "--backend", + required=True, + choices=[ + "deepep", + "deepep-v2", + "deepep-hybrid", + "mori", + "uccl", + "nccl-ep", + ], + ) + ep_harness.add_common_args(ap) + args = ap.parse_args() + + if args.case_id and not identity.is_typed_id(args.case_id, "case"): + print(f"ERROR: invalid native case ID {args.case_id!r}", file=sys.stderr) + return 2 + if args.case_id 
and args.seed != ep_harness.ROUTING_SEED: + print( + f"ERROR: scheduled v1 cases require seed={ep_harness.ROUTING_SEED}; got {args.seed}", + file=sys.stderr, + ) + return 2 + + sampling_error = ep_harness.sampling_contract_error( + args.iters, args.trials, args.warmup + ) + if sampling_error: + print(f"ERROR: {sampling_error}", file=sys.stderr) + return 2 + + try: + import torch + import torch.distributed as dist + except Exception as exc: # pragma: no cover + print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr) + return 3 + + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + torch.cuda.set_device(local_rank) + device = torch.device(f"cuda:{local_rank}") + os.environ.setdefault("MASTER_ADDR", "localhost") + os.environ.setdefault("MASTER_PORT", "12355") + + import capability + + sku = capability.PLATFORMS.get(args.runner) + if sku is None: + print(f"ERROR: unknown runner identity {args.runner!r}", file=sys.stderr) + return 5 + machine = {"x86_64": "amd64", "aarch64": "arm64"}.get( + platform.machine(), platform.machine() + ) + props = torch.cuda.get_device_properties(device) + if torch.version.hip: + vendor = "amd" + accelerator = str(getattr(props, "gcnArchName", "")).split(":", 1)[0] + else: + vendor = "nvidia" + major, minor = torch.cuda.get_device_capability(device) + accelerator = f"sm{major}{minor}" + device_name = torch.cuda.get_device_name(device) + device_count = torch.cuda.device_count() + identity_issues = capability.runtime_identity_issues( + args.runner, + vendor=vendor, + arch=accelerator, + machine=machine, + device_name=device_name, + device_count=device_count, + world_size=world_size, + ) + if identity_issues: + print( + f"ERROR: runtime identity does not match {args.runner}: " + + "; ".join(identity_issues), + file=sys.stderr, + ) + return 5 + if args.gpus_per_node and args.gpus_per_node != sku["gpus_per_node"]: + print( + f"ERROR: 
{args.runner} requires {sku['gpus_per_node']} GPUs per node", + file=sys.stderr, + ) + return 5 + args.runtime_device_product = device_name + args.runtime_device_count = device_count + args.allocation_execution_id = os.environ.get("COLLECTIVEX_EXECUTION_ID") + + # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction + # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL + # routing trace and remaps it to the balanced physical placement (a pure routing transform, + # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count. + if getattr(args, "eplb", False): + import eplb + + args.num_logical_experts = args.experts + args.experts = eplb.physical_count( + args.experts, ep_harness.EPLB_REDUNDANT_EXPERTS, world_size + ) + + # Reproduction provenance (recorded in the artifact). Rack launchers provide ranks directly + # through srun, while single-node launchers use torchrun; do not claim torchrun for both. + if os.environ.get("TORCHELASTIC_RUN_ID"): + args.distributed_launcher = "torchrun" + prefix = f"torchrun --nproc_per_node={world_size}" + else: + args.distributed_launcher = "rank-environment" + prefix = f"RANK={rank} WORLD_SIZE={world_size} LOCAL_RANK={local_rank} python3" + args.reproduction_command = f"{prefix} tests/run_ep.py {shlex.join(sys.argv[1:])}" + args.image = os.environ.get("COLLECTIVEX_IMAGE", "") + args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "") + args.image_digest_verified = ( + os.environ.get("COLLECTIVEX_IMAGE_DIGEST_VERIFIED") == "1" + ) + # Container architecture and local squash hash for Enroot/Pyxis. + args.image_arch = machine + args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256") + # GitHub provenance: repo, run ID, attempt, ref, source SHA, job, + # artifact. A result is only publication-'official' when these are present (validity gate). 
+ _run = { + "run_id": os.environ.get("GITHUB_RUN_ID"), + "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"), + "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") + or os.environ.get("GITHUB_SHA"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "job": os.environ.get("GITHUB_JOB"), + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + } + args.git_run = _run if any(_run.values()) else None + + # Import the backend class only after torch initializes. Every adapter implements + # the same fixed v1 profile; the CLI has no precision/mode/contract fallbacks. + if args.backend == "mori": + from ep_mori import MoRIBackend as Backend + elif args.backend == "nccl-ep": + from ep_nccl import NCCLBackend as Backend + elif args.backend == "uccl": + from ep_uccl import UCCLBackend as Backend + elif args.backend == "deepep-hybrid": + from ep_deepep_hybrid import DeepEPHybridBackend as Backend + elif args.backend == "deepep-v2": + from ep_deepep_v2 import DeepEPV2Backend as Backend + else: + from ep_deepep import DeepEPBackend as Backend + + # MoRI uses the gloo+NCCL group shape from its reference; other adapters use NCCL/RCCL. + if not dist.is_initialized(): + if args.backend == "mori": + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", + rank=rank, + world_size=world_size, + device_id=device, + ) + elif args.backend == "deepep-v2": + # PR #605 reuses PyTorch's NCCL communicator through ``_comm_ptr``. Supplying + # device_id eagerly forms it before ElasticBuffer construction. 
+ dist.init_process_group("nccl", device_id=device) + else: + dist.init_process_group("nccl") + + args.runtime_fingerprint = _runtime_fingerprint( + torch, device, machine=machine, vendor=vendor, arch=accelerator + ) + + gpus_per_node = args.gpus_per_node or sku["gpus_per_node"] + try: + expected_nodes = int( + os.environ.get("SLURM_NNODES", str(world_size // gpus_per_node)) + ) + except ValueError as exc: + raise ValueError("SLURM_NNODES must be a positive integer") from exc + realized_records: list[tuple[str, int, dict] | None] = [None] * world_size + dist.all_gather_object( + realized_records, + (socket.gethostname(), local_rank, args.runtime_fingerprint), + ) + complete_records = [record for record in realized_records if record is not None] + args.realized_placement = _summarize_realized_placement( + [(record[0], record[1]) for record in complete_records], + expected_nodes=expected_nodes, + expected_gpus_per_node=gpus_per_node, + expected_world_size=world_size, + ) + args.runtime_fingerprint = _common_runtime_fingerprint( + [record[2] for record in complete_records] + ) + + # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its + # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an + # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise. 
def deepep_v2_jit_provenance() -> list[dict[str, str]]:
    """Build one synthetic JIT provenance record per DeepEP V2 kernel name.

    Kernel names come from ``contracts.DEEPEP_V2_JIT_KERNELS`` in sorted
    order. Each record's cache key embeds the kernel name and its position,
    and the three digest fields repeat the hex form of position + 1, + 2 and
    + 3 sixty-four times.
    """
    records: list[dict[str, str]] = []
    for position, kernel in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)):
        records.append(
            {
                "cache_key": f"kernel.{kernel}.{position:032x}",
                "cubin_sha256": f"{position + 1:x}" * 64,
                "sass_sha256": f"{position + 2:x}" * 64,
                "source_sha256": f"{position + 3:x}" * 64,
            }
        )
    return records
DeepEPV2ContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.path = HERE / "ep_deepep_v2.py" + cls.tree = ast.parse(cls.path.read_text(), str(cls.path)) + + def test_capability_is_explicit_for_every_sku(self) -> None: + backend = capability.BACKENDS["deepep-v2"] + self.assertEqual( + (backend["implementation"], backend["commit"], backend["torch"], backend["nccl"]), + ("deep_ep.ElasticBuffer", COMMIT, "2.10.0+cu130", "2.30.4"), + ) + self.assertEqual(backend["source"], "deepseek-ai/DeepEP#605+#630") + self.assertEqual(backend["communication_backend"], "nccl-device-lsa") + self.assertEqual(set(backend["sku_capabilities"]), set(capability.PLATFORMS)) + for sku, platform in capability.PLATFORMS.items(): + ok, _ = capability.resolve(sku, "deepep-v2") + self.assertEqual(ok, platform["vendor"] == "nvidia") + self.assertEqual( + set(backend["sku_capabilities"][sku]), {"basis", "schedulable"} + ) + + def test_adapter_ast_pins_elastic_api_and_weight_semantics(self) -> None: + imports = { + alias.name + for node in ast.walk(self.tree) + if isinstance(node, ast.ImportFrom) and node.module == "deep_ep" + for alias in node.names + } + self.assertEqual(imports, {"ElasticBuffer"}) + constants = { + node.targets[0].id: ast.literal_eval(node.value) + for node in self.tree.body + if isinstance(node, ast.Assign) + and len(node.targets) == 1 + and isinstance(node.targets[0], ast.Name) + and isinstance(node.value, ast.Constant) + } + self.assertEqual(constants["DEEPEP_V2_COMMIT"], COMMIT) + self.assertEqual(constants["DEEPEP_V2_TREE"], TREE) + self.assertEqual(constants["DEEPEP_V2_FMT_COMMIT"], FMT_COMMIT) + self.assertEqual(constants["DEEPEP_V2_PR"], 605) + self.assertEqual(constants["DEEPEP_V2_FIX_PR"], 630) + self.assertEqual( + constants["DEEPEP_V2_JIT_RANDOM_SEED"], + "collectivex-deepep-v2-fa8a9b1", + ) + self.assertEqual(constants["NCCL_VERSION"], "2.30.4") + self.assertEqual(constants["NVSHMEM_VERSION"], "3.3.9") + backend = next( + node for node 
in self.tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend" + ) + assignments = { + node.targets[0].id: ast.literal_eval(node.value) + for node in backend.body + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Name) + and isinstance(node.value, ast.Constant) + } + self.assertEqual(assignments["combine_weight_semantics"], "unweighted-rank-sum") + methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)} + self.assertTrue({ + "dispatch", "inspect_dispatch", "combine_transformed", "capture_deferred_provenance", + "finalize", + } <= methods) + self.assertNotIn("expected", methods) + constructor = next( + node for node in ast.walk(backend) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "ElasticBuffer" + ) + deterministic = next( + keyword for keyword in constructor.keywords if keyword.arg == "deterministic" + ) + self.assertIs(ast.literal_eval(deterministic.value), False) + self.assertIn("deterministic", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("tuning_num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("jit_random_seed", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("gin_enabled", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("communication_backend", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("deepep_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("deepep_fix_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + source = self.path.read_text() + self.assertIn('getattr(args, "num_logical_experts", args.experts)', source) + self.assertIn('"use_expanded_layout": False', source) + self.assertIn("allow_hybrid_mode = _configure_gin_mode(args, world_size)", source) + 
self.assertIn("get_theoretical_num_sms(tuning_num_experts, args.topk)", source) + + jit_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_jit_cache_key" + ) + namespace = {"hashlib": __import__("hashlib"), "json": json} + exec(compile(ast.Module(body=[jit_function], type_ignores=[]), str(self.path), "exec"), namespace) + key = namespace["_jit_cache_key"] + baseline = types.SimpleNamespace( + runner="h100-dgxc", hidden=7168, topk=8, experts=256, + routing="uniform", eplb=False, case_id="uniform", + ) + zipf = types.SimpleNamespace(**{**vars(baseline), "routing": "zipf", "case_id": "zipf"}) + eplb = types.SimpleNamespace( + **{**vars(zipf), "experts": 288, "num_logical_experts": 256, "eplb": True} + ) + realized = { + "num_sms": 24, + "num_qps": 9, + "allocated_qps": 17, + "logical_scaleout_ranks": 1, + "logical_scaleup_ranks": 8, + "physical_rdma_ranks": 2, + "physical_nvlink_ranks": 4, + "is_scaleup_nvlink": False, + "device_arch_major": 9, + "device_arch_minor": 0, + "device_sms": 132, + "device_smem_bytes": 232448, + "gpu_timeout_cycles": 198000000000, + } + direct = key(baseline, 8, 128, False, realized) + self.assertTrue(direct.startswith("jitcfg-v3-")) + self.assertEqual(direct, key(zipf, 8, 128, False, realized)) + self.assertNotEqual(direct, key(zipf, 8, 128, True, realized)) + self.assertNotEqual(direct, key(eplb, 8, 128, False, realized)) + for field, value in realized.items(): + changed = not value if type(value) is bool else value + 1 + self.assertNotEqual( + direct, + key(baseline, 8, 128, False, {**realized, field: changed}), + field, + ) + init = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) and node.name == "__init__" + ) + buffer_call = next( + node for node in ast.walk(init) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "ElasticBuffer" + ) + jit_config_check = next( + node for node in ast.walk(init) + if 
isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "_require_cross_rank_equal" + and ast.literal_eval(node.args[1]) == "JIT configuration" + ) + cache_assignment = next( + node for node in ast.walk(init) + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Subscript) + and ast.unparse(node.targets[0].value) == "os.environ" + and ast.literal_eval(node.targets[0].slice) == "EP_JIT_CACHE_DIR" + ) + self.assertLess(buffer_call.lineno, jit_config_check.lineno) + self.assertLess(jit_config_check.lineno, cache_assignment.lineno) + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) + and node.name == "capture_deferred_provenance" + ) + calls = [node for node in ast.walk(capture) if isinstance(node, ast.Call)] + barrier = next( + node for node in calls + if isinstance(node.func, ast.Attribute) and node.func.attr == "barrier" + ) + self.assertEqual( + {keyword.arg: ast.literal_eval(keyword.value) for keyword in barrier.keywords}, + {"use_comm_stream": True, "with_cpu_sync": True}, + ) + scan = next( + node for node in calls + if isinstance(node.func, ast.Name) and node.func.id == "_jit_artifact_evidence" + ) + self.assertLess(barrier.lineno, scan.lineno) + realized_check = next( + node for node in ast.walk(backend) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "_require_cross_rank_equal" + and len(node.args) > 1 + and isinstance(node.args[1], ast.Constant) + and node.args[1].value == "realized tuning/topology" + ) + self.assertIsInstance(realized_check, ast.Call) + self.assertEqual( + (ROOT / "tests" / "ep_harness.py").read_text().count( + "capture_deferred_provenance()" + ), + 2, + ) + schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text()) + provenance = schema["properties"]["implementation"]["properties"]["provenance"] + self.assertEqual(provenance["properties"]["deterministic"], {"type": "boolean"}) + 
self.assertEqual( + provenance["properties"]["num_experts"], + {"minimum": 1, "type": "integer"}, + ) + self.assertEqual( + provenance["properties"]["tuning_num_experts"], + {"minimum": 1, "type": "integer"}, + ) + self.assertEqual( + provenance["properties"]["jit_cubins"]["items"], + {"$ref": "#/$defs/deepep_v2_jit_cubin"}, + ) + self.assertEqual( + ( + provenance["properties"]["jit_cubins"]["minItems"], + provenance["properties"]["jit_cubins"]["maxItems"], + ), + (5, 5), + ) + self.assertEqual( + provenance["properties"]["jit_random_seed"], + {"const": "collectivex-deepep-v2-fa8a9b1"}, + ) + self.assertEqual(provenance["properties"]["allow_hybrid_mode"], {"const": False}) + self.assertEqual(provenance["properties"]["gin_enabled"], {"const": False}) + self.assertEqual(provenance["properties"]["deepep_pr"], {"const": 605}) + self.assertEqual(provenance["properties"]["deepep_fix_pr"], {"const": 630}) + self.assertEqual( + provenance["properties"]["communication_backend"], + {"const": "nccl-device-lsa"}, + ) + for field, value in ( + ("num_experts", "288"), + ("tuning_num_experts", "not-an-integer"), + ("tuning_num_experts", 0), + ): + with self.subTest(provenance_field=field, value=value): + self.assertIn( + field, + contracts.backend_provenance_issues( + "deepep-v2", {field: value} + ), + ) + + def test_v2_gin_mode_uses_the_scale_up_domain_and_safe_fallbacks(self) -> None: + functions = { + node.name: node for node in self.tree.body if isinstance(node, ast.FunctionDef) + } + namespace = {"os": os} + exec( + compile( + ast.Module( + body=[ + functions["_configure_gin_mode"], + functions["_lsa_topology_is_valid"], + ], + type_ignores=[], + ), + str(self.path), + "exec", + ), + namespace, + ) + configure = namespace["_configure_gin_mode"] + topology_is_valid = namespace["_lsa_topology_is_valid"] + original = os.environ.get("EP_DISABLE_GIN") + try: + args = types.SimpleNamespace(scale_up_domain=72, gpus_per_node=4) + self.assertFalse(configure(args, 8)) + 
self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1") + + os.environ["EP_DISABLE_GIN"] = "stale" + args = types.SimpleNamespace(scale_up_domain=8, gpus_per_node=4) + self.assertTrue(configure(args, 16)) + self.assertNotIn("EP_DISABLE_GIN", os.environ) + + args = types.SimpleNamespace(gpus_per_node=4) + self.assertTrue(configure(args, 8)) + self.assertNotIn("EP_DISABLE_GIN", os.environ) + + self.assertFalse(configure(types.SimpleNamespace(), 8)) + self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1") + + topology = { + "physical_rdma_ranks": 1, + "physical_nvlink_ranks": 8, + "logical_scaleout_ranks": 1, + "logical_scaleup_ranks": 8, + "is_scaleup_nvlink": True, + } + self.assertTrue(topology_is_valid(False, 8, topology)) + self.assertTrue(topology_is_valid(True, 16, topology)) + topology["physical_nvlink_ranks"] = 4 + self.assertFalse(topology_is_valid(False, 8, topology)) + finally: + if original is None: + os.environ.pop("EP_DISABLE_GIN", None) + else: + os.environ["EP_DISABLE_GIN"] = original + + def test_ep_adapters_declare_unweighted_rank_sum(self) -> None: + adapters = { + "ep_deepep.py": "DeepEPBackend", + "ep_deepep_v2.py": "DeepEPV2Backend", + "ep_deepep_hybrid.py": "DeepEPHybridBackend", + "ep_mori.py": "MoRIBackend", + "ep_nccl.py": "NCCLBackend", + "ep_uccl.py": "UCCLBackend", + } + for filename, class_name in adapters.items(): + with self.subTest(adapter=filename): + tree = ast.parse((HERE / filename).read_text()) + backend = next( + node for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == class_name + ) + assignment = next( + node for node in backend.body + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Name) + and node.targets[0].id == "combine_weight_semantics" + ) + self.assertEqual(ast.literal_eval(assignment.value), "unweighted-rank-sum") + combine_methods = [ + item for item in backend.body + if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)) + and item.name in {"combine", 
"combine_transformed"} + ] + self.assertEqual(len(combine_methods), 2) + for method in combine_methods: + self.assertNotIn("topk_weights", ast.unparse(method)) + self.assertNotIn("combine_topk_weights", ast.unparse(method)) + + def test_deepep_v2_jit_evidence_is_strict_and_stable(self) -> None: + valid = deepep_v2_jit_provenance() + self.assertTrue(contracts._deepep_v2_jit_cubins_are_valid(valid)) + for invalid in ( + [], + [{**valid[0], "path": "/private/kernel.cubin"}], + [{**item, "cache_key": "dispatch"} for item in valid], + [{**item, "cubin_sha256": "invalid"} for item in valid], + valid[:-1], + [*valid, valid[0]], + [ + *valid, + { + **valid[0], + "cache_key": valid[0]["cache_key"][:-32] + "f" * 32, + }, + ], + ): + with self.subTest(invalid=invalid): + self.assertFalse(contracts._deepep_v2_jit_cubins_are_valid(invalid)) + + backend = next( + node for node in self.tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend" + ) + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) + and node.name == "capture_deferred_provenance" + ) + artifacts = copy.deepcopy(valid) + + class FakeBuffer: + @staticmethod + def barrier(*, use_comm_stream: bool, with_cpu_sync: bool) -> None: + self.assertTrue(use_comm_stream) + self.assertTrue(with_cpu_sync) + + namespace = { + "torch": types.SimpleNamespace( + cuda=types.SimpleNamespace(synchronize=lambda: None) + ), + "_jit_artifact_evidence": lambda: copy.deepcopy(artifacts), + "_require_cross_rank_equal": lambda _value, _label: None, + } + exec( + compile(ast.Module(body=[capture], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + state = types.SimpleNamespace( + buffer=FakeBuffer(), + _deferred_jit_snapshot=None, + backend_provenance={"jit_cubins": []}, + ) + namespace["capture_deferred_provenance"](state) + namespace["capture_deferred_provenance"](state) + artifacts[0]["cubin_sha256"] = "f" * 64 + with self.assertRaisesRegex(RuntimeError, "changed after 
measurement"): + namespace["capture_deferred_provenance"](state) + + def test_deepep_v2_jit_files_are_complete_regular_and_content_bound(self) -> None: + functions = [ + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) + and node.name in {"_sha256", "_jit_artifact_evidence"} + ] + namespace = { + "hashlib": hashlib, + "os": os, + "Path": Path, + "re": __import__("re"), + "DEEPEP_V2_JIT_KERNELS": contracts.DEEPEP_V2_JIT_KERNELS, + } + exec(compile(ast.Module(body=functions, type_ignores=[]), str(self.path), "exec"), namespace) + with tempfile.TemporaryDirectory() as temporary: + cache = Path(temporary) / "cache" + cache.mkdir() + for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)): + kernel = cache / f"kernel.{name}.{index:032x}" + kernel.mkdir() + for suffix in ("cu", "cubin", "sass"): + (kernel / f"kernel.{suffix}").write_bytes(f"{name}-{suffix}".encode()) + old_cache = os.environ.get("EP_JIT_CACHE_DIR") + os.environ["EP_JIT_CACHE_DIR"] = temporary + try: + evidence = namespace["_jit_artifact_evidence"]() + self.assertEqual(len(evidence), len(contracts.DEEPEP_V2_JIT_KERNELS)) + self.assertEqual( + set(evidence[0]), + {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"}, + ) + first = cache / evidence[0]["cache_key"] + duplicate = cache / (evidence[0]["cache_key"][:-32] + "f" * 32) + duplicate.mkdir() + for suffix in ("cu", "cubin", "sass"): + (duplicate / f"kernel.{suffix}").write_bytes(b"duplicate") + with self.assertRaisesRegex(RuntimeError, "kernel set"): + namespace["_jit_artifact_evidence"]() + shutil.rmtree(duplicate) + (first / "kernel.sass").unlink() + with self.assertRaisesRegex(RuntimeError, "incomplete"): + namespace["_jit_artifact_evidence"]() + (first / "kernel.sass").symlink_to(first / "kernel.cubin") + with self.assertRaisesRegex(RuntimeError, "regular file"): + namespace["_jit_artifact_evidence"]() + finally: + if old_cache is None: + os.environ.pop("EP_JIT_CACHE_DIR", None) + else: + 
os.environ["EP_JIT_CACHE_DIR"] = old_cache + + def test_runtime_and_shared_version_formatter_are_valid(self) -> None: + subprocess.run( + ["bash", "-n", str(ROOT / "runtime" / "run_in_container.sh")], + check=True, + ) + self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4") + self.assertEqual(ep_harness.format_collective_version((2, 30, 4)), "2.30.4") + source = self.path.read_text() + version_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_loaded_nccl_version" + ) + + class FakeNccl: + @staticmethod + def ncclGetVersion(pointer) -> int: + pointer._obj.value = 23004 + return 0 + + namespace = { + "ctypes": types.SimpleNamespace( + CDLL=lambda _path: FakeNccl(), byref=ctypes.byref, c_int=ctypes.c_int, + ), + "ep_harness": ep_harness, + "os": os, + "_loaded_library_paths": lambda: {"/safe/libnccl.so.2"}, + } + exec( + compile(ast.Module(body=[version_function], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + self.assertEqual(namespace["_loaded_nccl_version"](), "2.30.4") + for paths in (set(), {"/safe/libnccl.so.2", "/other/libnccl.so.2"}): + namespace["_loaded_library_paths"] = lambda paths=paths: paths + with self.assertRaisesRegex(RuntimeError, "exactly one"): + namespace["_loaded_nccl_version"]() + evidence_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_loaded_library_evidence" + ) + paths = { + "/safe/_C.cpython-310-x86_64-linux-gnu.so", + "/safe/libnccl.so.2", + "/safe/libnvshmem_host.so.3", + } + namespace.update( + _loaded_library_paths=lambda: paths, + _sha256=lambda _path: "a" * 64, + ) + exec( + compile(ast.Module(body=[evidence_function], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + evidence = namespace["_loaded_library_evidence"]() + self.assertIn( + {"name": "deep_ep._C", "role": "deepep-extension", "sha256": "a" * 64}, + evidence, + ) + self.assertTrue( + 
contracts._content_evidence_is_valid( + evidence, {"deepep-extension", "nccl", "nvshmem"} + ) + ) + self.assertNotIn("torch.cuda.nccl.version()", source) + fingerprint = {"runtime": "cuda", "version": "13.0"} + self.assertIs( + run_ep._common_runtime_fingerprint([fingerprint, dict(fingerprint)]), + fingerprint, + ) + with self.assertRaises(ValueError): + run_ep._common_runtime_fingerprint([fingerprint, {"runtime": "cuda", "version": "12.8"}]) + + def test_conditioning_contract_is_exact_for_each_phase(self) -> None: + expected = { + "decode": [1, 2, 4, 8, 16, 32, 64, 128], + "prefill": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512], + } + for phase, ladder in expected.items(): + valid = { + "contract": "fixed-phase-ramp-8-roundtrips-v1", + "ladder": ladder, + "roundtrips_per_shape": 8, + } + self.assertIs(contracts.validate_conditioning_contract(valid, phase), valid) + for mutate in ( + lambda item: item["ladder"].reverse(), + lambda item: item["ladder"].pop(), + lambda item: item.update(ladder=[1.0, *item["ladder"][1:]]), + lambda item: item.update(roundtrips_per_shape=7), + lambda item: item.update(roundtrips_per_shape=8.0), + ): + changed = copy.deepcopy(valid) + mutate(changed) + with self.assertRaises(contracts.ContractError): + contracts.validate_conditioning_contract(changed, phase) + other = "prefill" if phase == "decode" else "decode" + with self.assertRaises(contracts.ContractError): + contracts.validate_conditioning_contract(valid, other) + + def test_content_manifest_evidence_is_stable_and_content_sensitive(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + first, second = root / "first", root / "second" + first.write_bytes(b"first") + second.write_bytes(b"second") + files = [("pkg/first", first), ("pkg/second", second)] + evidence = contracts.content_manifest_evidence( + role="test-content", name="test-build", files=files, + ) + self.assertNotIn(temporary, json.dumps(evidence)) + self.assertEqual( + evidence, + 
contracts.content_manifest_evidence( + role="test-content", name="test-build", files=reversed(files), + ), + ) + self.assertRegex(evidence["sha256"], r"^[0-9a-f]{64}$") + second.write_bytes(b"changed") + self.assertNotEqual( + evidence, + contracts.content_manifest_evidence( + role="test-content", name="test-build", files=files, + ), + ) + for invalid in ( + [("../first", first)], + [("same", first), ("same", second)], + [("missing", root / "missing")], + ): + with self.assertRaises(contracts.ContractError): + contracts.content_manifest_evidence( + role="test-content", name="test-build", files=invalid, + ) + + def test_hybrid_realized_config_and_jit_evidence_are_path_free(self) -> None: + path = HERE / "ep_deepep_hybrid.py" + tree = ast.parse(path.read_text(), str(path)) + selected = [ + node for node in tree.body + if ( + isinstance(node, ast.Assign) + and any( + isinstance(target, ast.Name) and target.id == "HYBRID_CONFIG_FIELDS" + for target in node.targets + ) + ) + or isinstance(node, ast.FunctionDef) + and node.name in { + "_hybrid_realized_config", "_sha256_with_size", "_hybrid_jit_evidence", + } + ] + namespace = {"Path": Path, "hashlib": hashlib, "re": __import__("re")} + exec(compile(ast.Module(body=selected, type_ignores=[]), str(path), "exec"), namespace) + fields = namespace["HYBRID_CONFIG_FIELDS"] + self.assertEqual(set(fields), contracts.HYBRID_REALIZED_CONFIG_FIELDS) + + class TokenType: + def __init__(self, label: str, name: str | None = None) -> None: + self.label = label + if name is not None: + self.name = name + + def __str__(self) -> str: + return self.label + + values = {field: 1 for field in fields} + values.update({field: True for field in contracts.HYBRID_REALIZED_BOOL_FIELDS}) + for raw, expected in (("uint16_t", "UINT16"), ("uint8_t", "UINT8")): + values["token_data_type"] = TokenType(raw) + config = types.SimpleNamespace(**values) + realized = namespace["_hybrid_realized_config"](config) + self.assertEqual(realized["token_data_type"], 
expected) + self.assertEqual(set(realized), contracts.HYBRID_REALIZED_CONFIG_FIELDS) + values["token_data_type"] = TokenType("opaque-enum", "UINT16") + self.assertEqual( + namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))[ + "token_data_type" + ], + "UINT16", + ) + values["token_data_type"] = TokenType("UINT16") + with self.assertRaisesRegex(RuntimeError, "token_data_type is invalid"): + namespace["_hybrid_realized_config"](types.SimpleNamespace(**values)) + values["token_data_type"] = TokenType("uint16_t") + config = types.SimpleNamespace(**values) + delattr(config, "hidden_dim") + with self.assertRaisesRegex(RuntimeError, "omits hidden_dim"): + namespace["_hybrid_realized_config"](config) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + for key, payload in ( + ("preprocess-key", b"pre"), + ("combine-key", b"combine"), + ("dispatch-key", b"dispatch"), + ): + (root / f"{key}.so").write_bytes(payload) + evidence = namespace["_hybrid_jit_evidence"](root) + self.assertEqual( + [item["kernel_key"] for item in evidence], + ["combine-key", "dispatch-key", "preprocess-key"], + ) + self.assertNotIn(temporary, json.dumps(evidence)) + (root / "dispatch-key.so").write_bytes(b"changed") + self.assertNotEqual(evidence, namespace["_hybrid_jit_evidence"](root)) + (root / "extra-key.so").write_bytes(b"extra") + with self.assertRaisesRegex(RuntimeError, "expected 3"): + namespace["_hybrid_jit_evidence"](root) + (root / "extra-key.so").unlink() + (root / "bad key.so").write_bytes(b"bad") + with self.assertRaisesRegex(RuntimeError, "kernel key"): + namespace["_hybrid_jit_evidence"](root) + (root / "bad key.so").unlink() + (root / "combine-key.so").unlink() + (root / "combine-key.so").symlink_to(root / "dispatch-key.so") + with self.assertRaisesRegex(RuntimeError, "regular file"): + namespace["_hybrid_jit_evidence"](root) + empty = root / "empty" + empty.mkdir() + with self.assertRaisesRegex(RuntimeError, "expected 3"): + 
namespace["_hybrid_jit_evidence"](empty) + + def test_hybrid_deferred_provenance_wraps_before_conditioning_and_recaptures(self) -> None: + path = HERE / "ep_deepep_hybrid.py" + source = path.read_text() + tree = ast.parse(source, str(path)) + backend = next( + node for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPHybridBackend" + ) + methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)} + self.assertIn("capture_deferred_provenance", methods) + constructor = next(node for node in backend.body if isinstance(node, ast.FunctionDef) and node.name == "__init__") + buffer_call = next( + node for node in ast.walk(constructor) + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) + and node.func.id == "HybridEPBuffer" + ) + wrapper_install = next( + node for node in ast.walk(constructor) + if isinstance(node, ast.Assign) + and any( + isinstance(target, ast.Attribute) + and target.attr == "update_template_config" + for target in node.targets + ) + ) + cache_line = source[:source.index('os.environ["HYBRID_EP_CACHE_DIR"]')].count("\n") + 1 + self.assertLess(cache_line, buffer_call.lineno) + self.assertLess(buffer_call.lineno, wrapper_install.lineno) + + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) and node.name == "capture_deferred_provenance" + ) + called = { + node.func.id if isinstance(node.func, ast.Name) else node.func.attr + for node in ast.walk(capture) if isinstance(node, ast.Call) + and isinstance(node.func, (ast.Name, ast.Attribute)) + } + self.assertTrue({"_hybrid_jit_evidence", "_require_cross_rank_equal", "all_gather_object"} <= called) + self.assertIn("changed after measurement", ast.get_source_segment(source, capture)) + + artifacts = [[ + {"bytes": 1, "kernel_key": key, "sha256": digit * 64} + for key, digit in (("a", "1"), ("b", "2"), ("c", "3")) + ]] + + class FakeCuda: + @staticmethod + def synchronize() -> None: + return None + + class 
FakeDist: + @staticmethod + def barrier() -> None: + return None + + @staticmethod + def get_world_size() -> int: + return 2 + + @staticmethod + def all_gather_object(output, value) -> None: + output[:] = [copy.deepcopy(value), copy.deepcopy(value)] + + namespace = { + "torch": types.SimpleNamespace(cuda=FakeCuda), + "dist": FakeDist, + "_hybrid_jit_evidence": lambda _root: copy.deepcopy(artifacts[0]), + "_require_cross_rank_equal": lambda _value, _label: None, + } + exec(compile(ast.Module(body=[capture], type_ignores=[]), str(path), "exec"), namespace) + state = types.SimpleNamespace( + _deferred_jit_diagnostics=None, + _deferred_semantic_snapshot=None, + _jit_root=Path("private-cache"), + _realized_config=hybrid_realized_config(), + backend_provenance={}, + ) + namespace["capture_deferred_provenance"](state) + artifacts[0][0]["kernel_key"] = "changed" + with self.assertRaisesRegex(RuntimeError, "kernel set changed"): + namespace["capture_deferred_provenance"](state) + artifacts[0][0]["kernel_key"] = "a" + artifacts[0][0]["sha256"] = "f" * 64 + with self.assertRaisesRegex(RuntimeError, "artifacts changed"): + namespace["capture_deferred_provenance"](state) + + harness = (HERE / "ep_harness.py").read_text() + captures = [ + index for index in range(len(harness)) + if harness.startswith("capture_deferred_provenance()", index) + ] + self.assertEqual(len(captures), 2) + self.assertLess(harness.index("for wt in conditioning_ladder:"), captures[0]) + self.assertLess(captures[0], harness.index("oracle = _run_expert_oracle(")) + self.assertLess(harness.index("trace_sig = hashlib.sha256"), captures[1]) + + def test_hybrid_diagnostic_hashes_do_not_split_series_identity(self) -> None: + keys, artifacts = hybrid_jit_provenance() + provenance = { + "deepep_tree": "b" * 40, + "jit_kernel_keys": keys, + "jit_shared_objects": artifacts, + "loaded_libraries": [{ + "name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", + "sha256": "a" * 64, + }], + "realized_config": 
hybrid_realized_config(), + } + baseline = ep_harness._series_provenance(provenance) + changed = copy.deepcopy(provenance) + changed["jit_shared_objects"][0]["rank_artifacts"][0]["sha256"] = "f" * 64 + self.assertEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["loaded_libraries"][0]["sha256"] = "f" * 64 + self.assertEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["jit_kernel_keys"][0] = "changed-key" + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["realized_config"]["num_of_blocks_dispatch_api"] += 1 + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["deepep_tree"] = "c" * 40 + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + + def test_v2_series_identity_uses_source_and_sass_not_container_metadata(self) -> None: + provenance = { + "deepep_tree": "a" * 40, + "loaded_libraries": [ + {"name": "deep_ep._C.so", "role": "deepep-extension", "sha256": "1" * 64}, + {"name": "libnccl.so.2", "role": "nccl", "sha256": "2" * 64}, + ], + "jit_cubins": deepep_v2_jit_provenance(), + "jit_random_seed": "collectivex-deepep-v2-fa8a9b1", + } + baseline = contracts.series_provenance(provenance) + changed = copy.deepcopy(provenance) + changed["loaded_libraries"][0]["sha256"] = "f" * 64 + changed["jit_cubins"][0]["cubin_sha256"] = "e" * 64 + self.assertEqual(contracts.series_provenance(changed), baseline) + for mutate in ( + lambda item: item["loaded_libraries"][1].update(sha256="f" * 64), + lambda item: item["jit_cubins"][0].update(source_sha256="f" * 64), + lambda item: item["jit_cubins"][0].update(sass_sha256="f" * 64), + lambda item: item.update(deepep_tree="f" * 40), + ): + changed = copy.deepcopy(provenance) + mutate(changed) + self.assertNotEqual(contracts.series_provenance(changed), baseline) + + def 
test_mnnvl_resolution_has_no_ambiguous_signature_fallback(self) -> None: + self.assertEqual( + contracts.resolve_deepep_mnnvl( + requested=False, signature_parameters=(), deepep_commit=None, + ), + ({}, "not-requested"), + ) + self.assertEqual( + contracts.resolve_deepep_mnnvl( + requested=True, signature_parameters=("allow_mnnvl",), + deepep_commit="a" * 40, + ), + ({"allow_mnnvl": True}, "explicit-allow-mnnvl"), + ) + with self.assertRaises(contracts.ContractError): + contracts.resolve_deepep_mnnvl( + requested=True, signature_parameters=(), + deepep_commit="814e508537c6ffc775d59f6f1b9ba43f3a65968c", + ) + + def test_backend_provenance_requires_lineage_and_content_hashes(self) -> None: + def record(role: str, name: str, digit: str) -> dict[str, str]: + return {"role": role, "name": name, "sha256": digit * 64} + + hybrid_keys, hybrid_artifacts = hybrid_jit_provenance() + v2 = { + **contracts.DEEPEP_V2_V1_PROVENANCE, + "api_signature_sha256": "c" * 64, + "loaded_libraries": [ + record("deepep-extension", "deep_ep._C", "1"), + record("nccl", "libnccl.so.2", "2"), + record("nvshmem", "libnvshmem_host.so.3", "3"), + ], + "jit_cubins": deepep_v2_jit_provenance(), + "jit_random_seed": "collectivex-deepep-v2-fa8a9b1", + "deterministic": False, + "num_experts": 256, + "tuning_num_experts": 256, + } + deepep = { + "deepep_version": "1.1.0", "deepep_commit": "a" * 40, + "backend_lineage": "deepep-v1", "allow_mnnvl": False, + "mnnvl_comm": "not-requested", + } + hybrid = { + "deepep_commit": "a" * 40, "deepep_tree": "b" * 40, + "branch": "hybrid-ep", "backend_lineage": "deepep-hybrid", + "loaded_libraries": [ + record("deepep-extension", "deep_ep_cpp", "1"), + record("deepep-hybrid-extension", "hybrid_ep_cpp", "2"), + ], + "jit_kernel_keys": hybrid_keys, + "jit_shared_objects": hybrid_artifacts, + "realized_config": hybrid_realized_config(), + } + uccl = { + "uccl_version": "0.1.1", "uccl_commit": "pkg-0.1.1", + "uccl_wrapper_commit": "c" * 40, "backend_lineage": "uccl", + 
"uccl_dependency_versions": dict(contracts.UCCL_DEPENDENCY_VERSIONS), + "loaded_libraries": [ + record("uccl-distribution", "uccl-0.1.1", "3"), + record("uccl-wrapper", "uccl-deepep-wrapper", "4"), + record("intervaltree-distribution", "intervaltree-3.1.0", "5"), + record("sortedcontainers-distribution", "sortedcontainers-2.4.0", "6"), + record("cuda-runtime", "nvidia-cuda-runtime-cu12-12.9.79", "7"), + ], + } + reference = { + "nccl_version": "2.30.4", "collective_library": "nccl", + "backend_lineage": "nccl", + } + for backend, provenance in ( + ("deepep", deepep), ("deepep-v2", v2), ("deepep-hybrid", hybrid), + ("uccl", uccl), ("nccl-ep", reference), + ): + self.assertEqual(contracts.backend_provenance_issues(backend, provenance), []) + changed = copy.deepcopy(provenance) + if "loaded_libraries" in changed: + changed["loaded_libraries"][0]["sha256"] = "invalid" + expected = "loaded_libraries" + else: + changed["backend_lineage"] = "wrong" + expected = "backend_lineage" + self.assertIn(expected, contracts.backend_provenance_issues(backend, changed)) + + changed = copy.deepcopy(uccl) + changed["uccl_dependency_versions"]["intervaltree"] = "3.2.0" + self.assertIn( + "uccl_dependency_versions", + contracts.backend_provenance_issues("uccl", changed), + ) + changed = copy.deepcopy(uccl) + changed["loaded_libraries"] = [ + item + for item in changed["loaded_libraries"] + if item["role"] != "sortedcontainers-distribution" + ] + self.assertIn( + "loaded_libraries", contracts.backend_provenance_issues("uccl", changed) + ) + + for field, mutate in ( + ("realized_config", lambda item: item["realized_config"].pop("hidden_dim")), + ("jit_kernel_keys", lambda item: item["jit_kernel_keys"].reverse()), + ( + "jit_shared_objects", + lambda item: item["jit_shared_objects"][0]["rank_artifacts"][0].update( + sha256="invalid" + ), + ), + ): + with self.subTest(hybrid_field=field): + changed = copy.deepcopy(hybrid) + mutate(changed) + self.assertIn( + field, + 
contracts.backend_provenance_issues("deepep-hybrid", changed), + ) + + for field, value in ( + ("jit_cubins", [{"cache_key": "invalid", "cubin_sha256": "4" * 64}]), + ("jit_random_seed", "different-seed"), + ): + with self.subTest(v2_field=field): + changed = copy.deepcopy(v2) + changed[field] = value + self.assertIn( + field, + contracts.backend_provenance_issues("deepep-v2", changed), + ) + + changed = copy.deepcopy(v2) + changed["gin_enabled"] = True + self.assertIn("gin_enabled", contracts.backend_provenance_issues("deepep-v2", changed)) + changed = copy.deepcopy(v2) + changed["communication_backend"] = "nccl-gin" + self.assertIn( + "communication_backend", contracts.backend_provenance_issues("deepep-v2", changed) + ) + changed = copy.deepcopy(v2) + changed.update( + allow_hybrid_mode=True, + gin_enabled=True, + communication_backend="nccl-gin", + ) + self.assertEqual( + contracts.backend_provenance_issues("deepep-v2", changed), + ["allow_hybrid_mode", "communication_backend", "gin_enabled"], + ) + for field, expected in contracts.DEEPEP_V2_V1_PROVENANCE.items(): + with self.subTest(v2_pin_field=field): + changed = copy.deepcopy(v2) + changed[field] = not expected if type(expected) is bool else "wrong" + self.assertIn( + field, + contracts.backend_provenance_issues("deepep-v2", changed), + ) + + schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text()) + provenance_schema = schema["properties"]["implementation"]["properties"]["provenance"] + self.assertEqual( + provenance_schema["properties"]["realized_config"], + {"$ref": "#/$defs/hybrid_realized_config"}, + ) + self.assertFalse(schema["$defs"]["hybrid_realized_config"]["additionalProperties"]) + self.assertEqual(provenance_schema["properties"]["jit_kernel_keys"]["minItems"], 3) + self.assertEqual(provenance_schema["properties"]["jit_shared_objects"]["minItems"], 3) + + self.assertEqual(contracts.collective_kernel_generation("nccl"), "nccl") + 
self.assertEqual(contracts.collective_kernel_generation("rccl"), "rccl") + with self.assertRaises(contracts.ContractError): + contracts.collective_kernel_generation("unknown") + + def test_routing_control_binds_binary_but_allows_treatment_configuration(self) -> None: + hybrid_keys, hybrid_artifacts = hybrid_jit_provenance() + implementation = { + "kernel_generation": "hybrid", + "name": "deepep-hybrid", + "provenance": { + "deepep_tree": "a" * 40, + "loaded_libraries": [{ + "role": "deepep-extension", "name": "deep_ep_cpp", "sha256": "1" * 64, + }], + "local_experts": 32, + "num_experts": 256, + "num_sms": 24, + "jit_cache_key": "case-one", + "jit_cubins": [{"cache_key": "one", "cubin_sha256": "2" * 64}], + "jit_kernel_keys": hybrid_keys, + "jit_shared_objects": hybrid_artifacts, + "realized_config": hybrid_realized_config(), + }, + "resource_profile": {"configured_units": 24}, + } + baseline = contracts.routing_implementation_control_sha256(implementation) + treatment = copy.deepcopy(implementation) + treatment["provenance"].update({ + "local_experts": 36, + "num_experts": 288, + "jit_cache_key": "case-two", + "jit_cubins": [{"cache_key": "two", "cubin_sha256": "3" * 64}], + "jit_kernel_keys": ["changed-a", "changed-b", "changed-c"], + "jit_shared_objects": hybrid_jit_provenance(3)[1], + "realized_config": { + **hybrid_realized_config(), + "num_of_experts_per_rank": 36, + }, + }) + self.assertEqual( + contracts.routing_implementation_control_sha256(treatment), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["loaded_libraries"][0]["sha256"] = "4" * 64 + self.assertEqual( + contracts.routing_implementation_control_sha256(changed), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["deepep_tree"] = "b" * 40 + self.assertNotEqual( + contracts.routing_implementation_control_sha256(changed), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["num_sms"] = 20 + self.assertNotEqual( + 
contracts.routing_implementation_control_sha256(changed), baseline + ) + + def test_runtime_pins_uccl_wheel_and_hybrid_source_tree(self) -> None: + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + common = (ROOT / "runtime" / "common.sh").read_text() + self.assertIn("cd /ix/experimental/CollectiveX", runtime) + for launcher_name in ("launch_single-slurm.sh", "launch_gb-nv.sh"): + launcher = (ROOT / "launchers" / launcher_name).read_text() + self.assertIn("$MOUNT_SRC:/ix", launcher) + self.assertIn("cx_prepare_backend_cache", launcher) + self.assertNotIn('$(cx_prepare_backend_cache', launcher) + self.assertIn('BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE"', launcher) + self.assertIn("$BACKEND_CACHE:/cx-cache", launcher) + self.assertIn("CX_BACKEND_CACHE_ROOT=/cx-cache", launcher) + self.assertIn("CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources", launcher) + self.assertIn('|| [ "$CX_BENCH" = deepep-hybrid ]', launcher) + self.assertIn("cx_prepare_backend_source", launcher) + cache_block = launcher[launcher.index('if [ "$CX_BENCH" = deepep-v2 ]'):] + self.assertLess( + cache_block.index("cx_set_failure_stage backend-setup"), + cache_block.index("cx_prepare_backend_cache"), + ) + self.assertLess( + cache_block.index("cx_prepare_backend_source"), + cache_block.index("cx_set_failure_stage scheduler-allocation"), + ) + self.assertIn("--frandom-seed=$seed", runtime) + self.assertIn("DEEPEP_V2_JIT_RANDOM_SEED", runtime) + persisted = runtime[runtime.index("cx_persist_backend_env()") :] + self.assertIn("CUDA_HOME CPATH NVCC_PREPEND_FLAGS", persisted) + self.assertIn( + "390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec", + runtime, + ) + self.assertIn("--require-hashes", runtime) + self.assertIn("d77aeab7f1bb52b615666fe178d26ced41fae08e", common) + self.assertIn("HEAD^{tree}", runtime) + self.assertIn("$PWD/.cx_backend/deepep-hybrid-", runtime) + self.assertIn("cx_materialize_backend_source deepep-hybrid", runtime) + 
self.assertIn("cx_materialize_backend_source deepep-v2", runtime) + self.assertIn("cx_deepep_hybrid_marker_content_sha256", runtime) + self.assertIn("cx_deepep_hybrid_cache_is_valid", runtime) + self.assertIn("cx_extension_pair_sha256", runtime) + self.assertIn(".collectivex-complete.tmp.", runtime) + self.assertNotIn("cx_fetch_revision", runtime) + self.assertIn("cx_fetch_revision", common) + self.assertIn("third-party/fmt", common) + hybrid = runtime[ + runtime.index("cx_build_deepep_hybrid()"): + runtime.index("# UCCL EP") + ] + self.assertIn("cx_prepare_cuda_cccl", hybrid) + self.assertIn("unset NVSHMEM_DIR HYBRID_EP_MULTINODE USE_NIXL", hybrid) + self.assertNotIn("cx_prepare_deepep_toolchain", hybrid) + toolchain = runtime[ + runtime.index("cx_prepare_deepep_toolchain()"): + runtime.index("cx_probe_deepep()") + ] + self.assertIn('overlay="$root/nvshmem-overlay"', toolchain) + self.assertIn("flock 8 || exit 1", toolchain) + self.assertIn('mv "$temporary" "$overlay" || exit 1', toolchain) + self.assertNotIn("/tmp/collectivex-nvshmem", toolchain) + jit = runtime[ + runtime.index("cx_enable_deepep_v2_jit_reproducibility()"): + runtime.index("cx_probe_deepep_v2()") + ] + self.assertIn('cccl="${CX_CUDA_CCCL:-}"', jit) + self.assertNotIn("/usr/local/cuda*", jit) + self.assertIn("deepep-v2-cache-v2|$cpu|sm${arch/./}", runtime) + self.assertNotIn("deepep-v2-cache-v1|", runtime) + self.assertIn('base="${CX_BACKEND_CACHE_ROOT:-}"', runtime) + self.assertNotIn("${CX_BACKEND_CACHE_ROOT:-$PWD/.cx_backend}", runtime) + self.assertIn( + "recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2", runtime + ) + self.assertNotIn("recipe=aot-source-date-epoch-arch-maxjobs16-v1", runtime) + self.assertNotIn("recipe=$source_sha", runtime) + self.assertIn("pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0", runtime) + self.assertIn("manual-unverified", runtime) + self.assertIn("cx_deepep_v2_content_sha256", runtime) + self.assertIn("DeepEP V2 cache validation failed", runtime) + 
probe = runtime[ + runtime.index("cx_probe_deepep_v2()"): + runtime.index("cx_deepep_v2_content_sha256()") + ] + self.assertNotIn("torch.cuda.nccl.version", probe) + self.assertIn("ncclGetVersion", probe) + self.assertIn("runtime_version.value == 23004", probe) + self.assertIn("cx_nvidia_package_root nvidia-nccl-cu13 nccl", runtime) + self.assertIn("cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem", runtime) + self.assertNotIn("import os,nvidia.nccl", runtime) + self.assertNotIn("import os,nvidia.nvshmem", runtime) + self.assertIn( + 'export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"', runtime + ) + self.assertIn('stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}"', runtime) + self.assertNotIn('export EP_JIT_CACHE_DIR="$root/jit"', runtime) + self.assertIn('EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"', runtime) + reference = (HERE / "ep_nccl.py").read_text() + self.assertIn("self.kernel_generation = contracts.collective_kernel_generation", reference) + + def test_deepep_v2_cache_recovers_from_an_unpublished_partial_build(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + content_hash = "b" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + stale = root / "stale-partial-build" + stale.write_text("partial\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2"; expected_revision="$3"; expected_tree="$4"; expected_fmt="$5" + expected_content="$6" + cx_log() { :; } + cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_activate_deepep_v2() { export DEEPEP_V2_COMMIT="$expected_revision"; } + cx_prepare_deepep_toolchain() { export NVSHMEM_DIR=/tmp/cx-test-nvshmem; } + cx_probe_deepep_v2() { return 0; } + cx_deepep_v2_content_sha256() { printf '%s' 
"$expected_content"; } + cx_deepep_v2_cache_is_valid() { + test -f "$2" && test "$(wc -l < "$2" | tr -d ' ')" = 5 + } + cx_enable_deepep_v2_jit_reproducibility() { return 0; } + cx_materialize_backend_source() { mkdir -p "$2/third-party/fmt"; } + flock() { return 0; } + python3() { + if [ "${1:-}" = -m ] && [ "${2:-}" = venv ]; then + mkdir -p "$3/bin" + printf '#!/bin/sh\nexit 0\n' > "$3/bin/python" + chmod 700 "$3/bin/python" + fi + return 0 + } + git() { + case " $* " in + *' third-party/fmt rev-parse HEAD '*) printf '%s\n' "$expected_fmt" ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' "$expected_tree" ;; + *' show -s --format=%ct HEAD '*) printf '1\n' ;; + *) return 0 ;; + esac + } + cx_git_in_tree() { shift; git "$@"; } + cx_build_deepep_v2 + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(runtime), str(root), + COMMIT, TREE, FMT_COMMIT, content_hash, + ], + check=True, + ) + self.assertFalse(stale.exists()) + self.assertEqual( + marker.read_text(), + f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n", + ) + self.assertEqual(list(root.glob(".collectivex-complete.tmp.*")), []) + + def test_deepep_v2_published_cache_is_never_deleted_after_probe_failure(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + marker.write_text("published\n") + sentinel = root / "active-reader" + sentinel.write_text("active\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2" + cx_log() { :; } + cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_deepep_v2_cache_is_valid() { return 0; } + cx_activate_deepep_v2() { return 0; } + cx_prepare_deepep_toolchain() { return 0; } + 
cx_enable_deepep_v2_jit_reproducibility() { return 0; } + cx_probe_deepep_v2() { return 1; } + ! cx_build_deepep_v2 + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(root)], + check=True, + ) + self.assertEqual(sentinel.read_text(), "active\n") + self.assertEqual(marker.read_text(), "published\n") + + def test_deepep_v2_corrupt_published_cache_fails_without_reset(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + marker.write_text("corrupt\n") + sentinel = root / "active-reader" + sentinel.write_text("active\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2" + cx_log() { :; } + cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_deepep_v2_cache_is_valid() { return 1; } + flock() { return 0; } + ! 
cx_build_deepep_v2 + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(root)], + check=True, + ) + self.assertEqual(sentinel.read_text(), "active\n") + self.assertEqual(marker.read_text(), "corrupt\n") + + def test_deepep_v2_marker_requires_private_owned_cache_objects(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) / "cache" + root.mkdir(mode=0o700) + (root / "source").mkdir(mode=0o700) + (root / "venv").mkdir(mode=0o700) + marker = root / ".collectivex-complete" + cache_key = "a" * 64 + content_hash = "b" * 64 + marker.write_text( + f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n" + ) + root.chmod(0o2700) + marker.chmod(0o600) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_v2_marker_content_sha256()/,/^}/p' "$1")" + cx_deepep_v2_marker_content_sha256 "$2" "$3" "$4" "$5" "$6" "$7" + ''' + args = [ + "bash", "-c", command, "_", str(runtime), str(root), str(marker), + COMMIT, TREE, FMT_COMMIT, cache_key, + ] + valid = subprocess.run(args, text=True, capture_output=True, check=True) + self.assertEqual(valid.stdout, content_hash) + marker.chmod(0o644) + self.assertNotEqual(subprocess.run(args).returncode, 0) + + def test_deepep_hybrid_marker_requires_a_private_regular_file(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) / "cache" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + content_hash = "b" * 64 + marker.write_text(f"{COMMIT}\n{TREE}\n{content_hash}\n") + root.chmod(0o2700) + marker.chmod(0o600) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$1")" + cx_deepep_hybrid_marker_content_sha256 "$2" "$3" "$4" "$5" + ''' + args = [ + "bash", "-c", command, "_", str(runtime), str(root), str(marker), + COMMIT, TREE, + ] + valid = subprocess.run(args, 
text=True, capture_output=True, check=True) + self.assertEqual(valid.stdout, content_hash) + marker_contract = runtime.read_text() + marker_contract = marker_contract[ + marker_contract.index("cx_deepep_hybrid_marker_content_sha256()"): + marker_contract.index("cx_deepep_hybrid_cache_is_valid()") + ] + self.assertIn("marker_item.st_uid != root_item.st_uid", marker_contract) + self.assertNotIn("st_uid != os.getuid()", marker_contract) + marker.chmod(0o644) + self.assertNotEqual(subprocess.run(args).returncode, 0) + + def test_deepep_v2_installed_content_digest_binds_every_distribution_file(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + site = Path(temporary) / "venv" / "lib" / "python3.11" / "site-packages" + package = site / "deep_ep" + info = site / "deep_ep-2.0.0.dist-info" + package.mkdir(parents=True) + info.mkdir() + (package / "__init__.py").write_text("__version__ = '2.0.0'\n") + extension = package / "_C.so" + extension.write_bytes(b"extension-one") + (info / "METADATA").write_text( + "Metadata-Version: 2.1\nName: deep_ep\nVersion: 2.0.0\n" + ) + (info / "RECORD").write_text( + "deep_ep/__init__.py,,\n" + "deep_ep/_C.so,,\n" + "deep_ep-2.0.0.dist-info/METADATA,,\n" + "deep_ep-2.0.0.dist-info/RECORD,,\n" + ) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_v2_content_sha256()/,/^}/p' "$1")" + cx_deepep_v2_content_sha256 + ''' + env = { + **os.environ, + "PYTHONPATH": str(site), + "VIRTUAL_ENV": str(Path(temporary) / "venv"), + } + first = subprocess.run( + ["bash", "-c", command, "_", str(runtime)], + text=True, capture_output=True, check=True, env=env, + ).stdout + extension.write_bytes(b"extension-two") + second = subprocess.run( + ["bash", "-c", command, "_", str(runtime)], + text=True, capture_output=True, check=True, env=env, + ).stdout + self.assertRegex(first, r"^[0-9a-f]{64}$") + self.assertRegex(second, r"^[0-9a-f]{64}$") + self.assertNotEqual(first, second) 
+ extension.unlink() + outside = Path(temporary) / "outside.so" + outside.write_bytes(b"outside") + extension.symlink_to(outside) + self.assertNotEqual( + subprocess.run( + ["bash", "-c", command, "_", str(runtime)], env=env, + ).returncode, + 0, + ) + + def test_uccl_content_identity_excludes_install_generated_files(self) -> None: + keep = load_uccl_function( + "_is_uccl_runtime_payload", {"PurePosixPath": PurePosixPath} + ) + self.assertTrue(keep("uccl/ep.abi3.so")) + self.assertTrue(keep("uccl.libs/libnuma.so")) + self.assertFalse(keep("uccl/__pycache__/collective.cpython-312.pyc")) + self.assertFalse(keep("uccl-0.1.1.dist-info/RECORD")) + + def test_uccl_dependency_versions_are_exact(self) -> None: + installed = dict(contracts.UCCL_DEPENDENCY_VERSIONS) + dependency_versions = load_uccl_function( + "_uccl_dependency_versions", + { + "contracts": contracts, + "metadata": types.SimpleNamespace( + version=lambda package: installed[package] + ), + }, + ) + self.assertEqual(dependency_versions(), contracts.UCCL_DEPENDENCY_VERSIONS) + installed["intervaltree"] = "3.2.0" + with self.assertRaisesRegex(RuntimeError, "differ from the v1 contract"): + dependency_versions() + + schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text()) + dependency_schema = schema["properties"]["implementation"]["properties"][ + "provenance" + ]["properties"]["uccl_dependency_versions"] + self.assertFalse(dependency_schema["additionalProperties"]) + self.assertEqual( + { + package: definition["const"] + for package, definition in dependency_schema["properties"].items() + }, + contracts.UCCL_DEPENDENCY_VERSIONS, + ) + + def test_uccl_support_dependency_content_is_path_free(self) -> None: + with tempfile.TemporaryDirectory() as directory: + root = Path(directory) + source_entry = PurePosixPath("intervaltree/__init__.py") + cache_entry = PurePosixPath("intervaltree/__pycache__/__init__.pyc") + metadata_entry = PurePosixPath("intervaltree-3.1.0.dist-info/RECORD") + for 
entry in (source_entry, cache_entry, metadata_entry): + path = root / entry + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(entry.as_posix().encode()) + distribution = types.SimpleNamespace( + files=[source_entry, cache_entry, metadata_entry], + locate_file=lambda item: root / item, + ) + evidence_for = load_uccl_function( + "_python_dependency_evidence", + { + "Path": Path, + "PurePosixPath": PurePosixPath, + "contracts": contracts, + "metadata": types.SimpleNamespace( + distribution=lambda package: distribution + ), + }, + ) + evidence = evidence_for("intervaltree", "3.1.0") + self.assertEqual( + evidence, + contracts.content_manifest_evidence( + role="intervaltree-distribution", + name="intervaltree-3.1.0", + files=[(source_entry.as_posix(), root / source_entry)], + ), + ) + self.assertNotIn(str(root), json.dumps(evidence)) + + def test_uccl_hashes_the_mapped_pinned_libcudart_without_exposing_paths( + self, + ) -> None: + with tempfile.TemporaryDirectory() as directory: + root = Path(directory) + entry = PurePosixPath("nvidia/cuda_runtime/lib/libcudart.so.12") + library = root / entry + library.parent.mkdir(parents=True) + library.write_bytes(b"pinned CUDA 12 runtime") + distribution = types.SimpleNamespace( + files=[entry], + locate_file=lambda item: root / item, + ) + evidence_for = load_uccl_function( + "_loaded_libcudart_evidence", + { + "Path": Path, + "PurePosixPath": PurePosixPath, + "contracts": contracts, + "metadata": types.SimpleNamespace( + distribution=lambda package: distribution + ), + }, + ) + maps = root / "maps" + maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {library}\n") + evidence = evidence_for("12.9.79", maps) + self.assertEqual( + evidence, + contracts.content_manifest_evidence( + role="cuda-runtime", + name="nvidia-cuda-runtime-cu12-12.9.79", + files=[("libcudart.so", library)], + ), + ) + self.assertNotIn(str(root), json.dumps(evidence)) + + unowned = root / "unowned" / library.name + unowned.parent.mkdir() + 
unowned.write_bytes(library.read_bytes()) + maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {unowned}\n") + with self.assertRaisesRegex(RuntimeError, "not owned") as raised: + evidence_for("12.9.79", maps) + self.assertNotIn(str(root), str(raised.exception)) + + def test_private_runtime_logs_are_not_public_artifacts(self) -> None: + path = subprocess.check_output( + [ + "bash", "-c", 'source "$1"; cx_private_log_path test', "_", + str(ROOT / "runtime" / "common.sh"), + ], + text=True, + env={**os.environ, "COLLECTIVEX_EXECUTION_ID": "contract-test"}, + ).strip() + try: + log = Path(path) + self.assertEqual(stat.S_IMODE(log.stat().st_mode), 0o600) + self.assertEqual(stat.S_IMODE(log.parent.stat().st_mode), 0o700) + self.assertFalse(log.is_relative_to(ROOT)) + finally: + shutil.rmtree(Path(path).parent, ignore_errors=True) + + def test_private_runtime_logs_reject_traversal_and_symlinks(self) -> None: + common = str(ROOT / "runtime" / "common.sh") + for variable, value in ( + ("COLLECTIVEX_EXECUTION_ID", ".."), + ("CX_TEST_LABEL", ".."), + ): + environment = { + **os.environ, + "COLLECTIVEX_EXECUTION_ID": "contract-adversarial", + "CX_TEST_LABEL": "test", + variable: value, + } + result = subprocess.run( + ["bash", "-c", 'source "$1"; cx_private_log_path "$CX_TEST_LABEL"', "_", common], + text=True, + capture_output=True, + env=environment, + ) + self.assertNotEqual(result.returncode, 0) + self.assertNotIn(value, result.stderr) + + private_root = Path(f"/tmp/inferencex-collectivex-{os.getuid()}") + private_root.mkdir(mode=0o700, exist_ok=True) + self.assertFalse(private_root.is_symlink()) + os.chmod(private_root, 0o700) + with tempfile.TemporaryDirectory() as temporary: + target = Path(temporary) + tag = f"contract-symlink-{os.getpid()}" + link = private_root / tag + link.symlink_to(target, target_is_directory=True) + try: + result = subprocess.run( + ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common], + text=True, + capture_output=True, + 
env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag}, + ) + self.assertNotEqual(result.returncode, 0) + self.assertEqual(list(target.iterdir()), []) + finally: + link.unlink(missing_ok=True) + + tag = f"contract-log-symlink-{os.getpid()}" + directory = private_root / tag + directory.mkdir(mode=0o700) + target_file = target / "target" + target_file.write_text("unchanged") + log_link = directory / "test.log" + log_link.symlink_to(target_file) + try: + result = subprocess.run( + ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common], + text=True, + capture_output=True, + env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag}, + ) + self.assertNotEqual(result.returncode, 0) + self.assertEqual(target_file.read_text(), "unchanged") + finally: + log_link.unlink(missing_ok=True) + directory.rmdir() + + def test_operator_config_failure_is_value_free(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + config = Path(temporary) / "operator.env" + config.write_text("printf 'private-config-token\\n' >&2\nfalse\n") + config.chmod(0o600) + result = subprocess.run( + ["bash", "-c", + 'export COLLECTIVEX_EXECUTION_ID="operator-failure-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; source \"$1\"; " + "cx_load_operator_config", "_", + str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("runner-local configuration failed", result.stderr) + self.assertNotIn("private-config-token", result.stderr) + + def test_ephemeral_operator_config_is_removed_after_source(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + config = Path(temporary) / "operator.env" + decoy = Path(temporary) / "decoy" + decoy.write_text("keep") + config.write_text(json.dumps(operator_config(Path(temporary) / "storage"))) + config.chmod(0o600) + result = subprocess.run( + [ + "bash", "-c", + 
'export COLLECTIVEX_EXECUTION_ID="operator-ephemeral-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; " + 'config="$COLLECTIVEX_OPERATOR_CONFIG"; source "$1"; ' + 'cx_load_operator_config; test ! -e "$config"; ' + 'test "$CX_PARTITION" = test; ' + 'test -z "${COLLECTIVEX_OPERATOR_CONFIG+x}"', + "_", str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + "COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL": "1", + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertFalse(config.exists()) + self.assertEqual(decoy.read_text(), "keep") + + def test_operator_config_is_strict_per_runner_json(self) -> None: + command = ( + 'source "$1"; export COLLECTIVEX_EXECUTION_ID="operator-config-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; " + 'test "$CX_PARTITION" = test; ' + 'test -z "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT+x}"; ' + 'test -z "${ENROOT_CACHE_PATH+x}"' + ) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + document = operator_config(root / "storage") + config = root / "operator.json" + config.write_text(json.dumps(document)) + config.chmod(0o600) + for runner in capability.PLATFORMS: + with self.subTest(runner=runner): + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": runner, + "ENROOT_CACHE_PATH": "/private/stale-enroot-cache", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + + lock_dir = root / "amd-locks" + document["runners"]["mi355x"]["lock_dir"] = str(lock_dir) + config.write_text(json.dumps(document)) + config.chmod(0o600) + canonical = subprocess.run( + [ + "bash", + "-c", + 'source "$1"; export COLLECTIVEX_EXECUTION_ID="canonical-lock-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; 
cx_load_operator_config; " + 'cx_lock_canonical_gha_env mi355x; test "$CX_LOCK_DIR" = "$2"', + "_", + str(ROOT / "runtime" / "common.sh"), + str(lock_dir), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "mi355x", + "CX_SHARD_FILE": ".shards/test.json", + "CX_SHARD_SKU": "mi355x", + "CX_NODES": "1", + "CX_GPUS_PER_NODE": "8", + "COLLECTIVEX_CANONICAL_GHA": "1", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + "COLLECTIVEX_SOURCE_SHA": "a" * 40, + "GITHUB_ACTIONS": "true", + "GITHUB_RUN_ATTEMPT": "1", + "GITHUB_RUN_ID": "1", + "GITHUB_WORKSPACE": str(root.resolve()), + }, + ) + self.assertEqual(canonical.returncode, 0, canonical.stderr) + + selected_only = { + "schema_version": 1, + "runners": {"h100-dgxc": document["runners"]["h100-dgxc"]}, + } + result = subprocess.run( + [ + "bash", "-c", command + '; test -z "${CX_STAGE_DIR+x}"', "_", + str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "CX_STAGE_DIR": "/private/stale-stage", + "ENROOT_CACHE_PATH": "/private/stale-enroot-cache", + "COLLECTIVEX_OPERATOR_CONFIG_LOADED": "1", + "COLLECTIVEX_OPERATOR_CONFIG_CONTENT": json.dumps(selected_only), + "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1", + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + + rejected = json.loads(json.dumps(document)) + rejected["runners"]["h100-dgxc"]["shell"] = "private-command" + boolean_version = {**document, "schema_version": True} + for invalid in (rejected, boolean_version): + config.write_text(json.dumps(invalid)) + config.chmod(0o600) + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertNotIn("private-command", result.stderr) + + config.write_text(json.dumps(document)) + 
config.chmod(0o644) + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(result.returncode, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_publisher.py b/experimental/CollectiveX/tests/test_publisher.py new file mode 100644 index 0000000000..86b1e9607d --- /dev/null +++ b/experimental/CollectiveX/tests/test_publisher.py @@ -0,0 +1,2334 @@ +#!/usr/bin/env python3 +"""Focused end-to-end tests for the isolated CollectiveX publisher.""" +from __future__ import annotations + +import copy +import hashlib +import itertools +import json +import os +from pathlib import Path +import subprocess +import sys +import tempfile +import types +import unittest +from unittest import mock +import zipfile + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(ROOT), str(HERE)] + +import contracts # noqa: E402 +import identity # noqa: E402 +import publisher # noqa: E402 +import summarize # noqa: E402 +import sweep_matrix # noqa: E402 + + +RUN = { + "repository": "SemiAnalysisAI/InferenceX", + "run_id": "12345", + "run_attempt": 1, + "source_sha": "a" * 40, +} + + +def _unsupported_delivery( + root: Path, ordinals: tuple[int, ...] 
= (1,), run: dict = RUN, +) -> tuple[Path, Path]: + matrix = sweep_matrix.resolve_matrix(backends="all") + wrapper = next(item for item in matrix["requested_cases"] if item["disposition"] == "unsupported") + matrix = { + "format": "collectivex.matrix.v1", + "schema_version": 1, + "requested_cases": [wrapper], + "include": [], + } + case = {key: value for key, value in wrapper["case"].items() if key != "case_id"} + artifact_name = f"cxunsupported-{run['run_id']}-{run['run_attempt']}" + git_run = { + "artifact": artifact_name, + "job": "setup", + "ref": "collectivex", + "repo": run["repository"], + "run_attempt": str(run["run_attempt"]), + "run_id": run["run_id"], + "source_sha": run["source_sha"], + } + allocation = { + "artifact": artifact_name, + "execution_id": f"{run['run_id']}_{run['run_attempt']}_unsupported", + "job": "setup", + "repo": run["repository"], + "run_attempt": str(run["run_attempt"]), + "run_id": run["run_id"], + "runner": "capability-resolver", + "source_sha": run["source_sha"], + } + matrix_path = root / "matrix.json" + artifact = root / artifact_name + artifact.mkdir() + matrix_path.write_text(json.dumps(matrix)) + control_sha256 = hashlib.sha256(matrix_path.read_bytes()).hexdigest() + for ordinal in ordinals: + terminal = contracts.make_terminal_document( + allocation_factors=allocation, attempt_ordinal=ordinal, case=case, + case_factors={"case": case, "profile": identity.V1_CASE_PROFILE, + "sku": wrapper["sku"]}, + control_sha256=control_sha256, failure_mode="capability", + generated_at="2026-07-04T00:00:00Z", git_run=git_run, + reason=wrapper["reason"], return_code=5, source="matrix-capability-resolver", + status="unsupported", expected_case_id=wrapper["case"]["case_id"], + ) + (artifact / f"unsupported-{ordinal}.json").write_text(json.dumps(terminal)) + return matrix_path, artifact + + +def _args( + store: Path, matrix: Path, artifact: Path, run: dict = RUN +) -> types.SimpleNamespace: + return types.SimpleNamespace( + 
store_root=str(store), + matrix=str(matrix), + artifact=[str(artifact)], + repository=run["repository"], + run_id=run["run_id"], + run_attempt=run["run_attempt"], + source_sha=run["source_sha"], + ) + + +def _ids(seed: str) -> tuple[str, str, str, str, str, str]: + case = identity.digest("case", {"seed": seed}) + allocation = identity.allocation_id({"seed": seed}) + attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1) + series = identity.series_id({"seed": seed}) + point = identity.point_id(series=series, tokens_per_rank=8) + evidence = identity.evidence_id( + point=point, allocation=allocation, attempt=attempt, sample_sha256="b" * 64 + ) + return case, allocation, attempt, series, point, evidence + + +def _component(scale: float = 1.0) -> dict: + latency = {"p50": 10.0 * scale, "p90": 12.0 * scale, + "p95": 14.0 * scale, "p99": 20.0 * scale} + logical_bytes = 100_000 + return { + "origin": "measured", + "latency_us": latency, + "logical_bytes": logical_bytes, + "logical_payload_rate_gbps_at_latency_percentile": { + name: logical_bytes / (value * 1000.0) for name, value in latency.items() + }, + "sample_count": 512, + } + + +def _hybrid_provenance(ep_size: int = 1) -> dict: + realized = {field: 1 for field in contracts.HYBRID_REALIZED_CONFIG_FIELDS} + for field in contracts.HYBRID_REALIZED_BOOL_FIELDS: + realized[field] = True + realized.update({ + "num_of_experts_per_rank": 1, + "num_of_nodes": 1, + "num_of_ranks_per_node": ep_size, + "token_data_type": "UINT16", + }) + kernel_keys = ["combine-key", "dispatch-key", "preprocess-key"] + return { + "backend_lineage": "deepep-hybrid", "branch": "hybrid-ep", + "deepep_commit": "a" * 40, "deepep_tree": "b" * 40, + "device_sms": 1, + "jit_kernel_keys": kernel_keys, + "jit_shared_objects": [ + { + "kernel_key": key, + "rank_artifacts": [ + {"bytes": 1, "rank": rank, "sha256": f"{index + 1:x}" * 64} + for rank in range(ep_size) + ], + } + for index, key in enumerate(kernel_keys) + ], + 
"loaded_libraries": [ + {"name": "deep_ep_cpp", "role": "deepep-extension", "sha256": "4" * 64}, + {"name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", "sha256": "5" * 64}, + ], + "realized_config": realized, + "resource_mode": "tuned", + "tuned_source": "deepep-hybrid-configurer-autotune-v1", + } + + +def _native_fixture(backend: str = "nccl-ep") -> tuple[dict, dict]: + def digest(value: object) -> str: + return hashlib.sha256(contracts.canonical_json_bytes(value)).hexdigest() + + scheduled = { + "backend": backend, "canonical": True, "eplb": False, "ep": 1, + "experts": 1, "gpus_per_node": 1, "hidden": 1, "ladder": "1", "nodes": 1, + "phase": "decode", "required_publication": "official", "routing": "uniform", + "samples_per_point": 512, "scale_up_domain": 1, "suite": "ep-core-v1", + "timing": "8:64:32", "topk": 1, + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + "workload": "deepseek-v3-v1", + } + case_factors = {"case": scheduled, "profile": identity.V1_CASE_PROFILE, "sku": "fixture"} + case_id = identity.digest("case", case_factors) + git_run = { + "artifact": "cxshard-fixture-999-1", "job": "sweep", "ref": "collectivex", + "repo": RUN["repository"], "run_attempt": "1", "run_id": "999", + "source_sha": RUN["source_sha"], + } + allocation_factors = { + "artifact": git_run["artifact"], "execution_id": "999_1_fixture", + "job": git_run["job"], "repo": git_run["repo"], "run_attempt": "1", + "run_id": "999", "runner": "fixture", "source_sha": git_run["source_sha"], + } + allocation_id = identity.allocation_id(allocation_factors) + attempt_id = identity.attempt_id(allocation=allocation_id, case=case_id, ordinal=1) + member_id, member_checksums, routing_hash, routing_rows, routing_weights = ( + contracts._expected_canonical_trace( + "uniform", hidden=1, topk=1, logical_experts=1, physical_experts=1, + ep_size=1, tokens_per_rank=1, seed=67, eplb_enabled=False, + reference_tokens_per_rank=2048, + ) + ) + workload_id = 
identity.workload_id({ + "members": [{"checksums": member_checksums, "workload_id": member_id}] + }) + runtime = { + "accelerator_runtime": {"kind": "cuda", "version": "13.0"}, + "collective_library": {"kind": "nccl", "version": "2.30.4"}, + "device": { + "arch": "sm100", "compute_units": 1, "memory_bytes": 1, + "product": "Fixture GPU", "warp_size": 32, + }, + "driver_version": "1", "framework": {"kind": "torch", "version": "2.10.0"}, + "machine": "fixture", "python_version": "3.12", "vendor": "nvidia", + } + implementation_provenance = ( + { + "backend": "nccl-ep", "backend_lineage": "nccl", + "collective_library": "nccl", "nccl_version": "2.30.4", + "reference_semantics": "fixture-v1", + } + if backend == "nccl-ep" + else _hybrid_provenance() + ) + kernel_generation = "nccl" if backend == "nccl-ep" else "hybrid" + implementation = { + "kernel_generation": kernel_generation, + "name": backend, + "provenance": implementation_provenance, + "resource_profile": contracts.project_resource_profile(implementation_provenance), + } + public_config = contracts.public_series_config( + kernel_generation=implementation["kernel_generation"], + provenance=implementation_provenance, + resource_profile=implementation["resource_profile"], + resource_mode="tuned", + device_product=runtime["device"]["product"], + ) + series_factors = { + "backend": backend, "case_id": case_id, + "image_digest": "sha256:" + "d" * 64, + "implementation_contract_sha256": digest({ + **implementation, + "provenance": contracts.series_provenance(implementation_provenance), + }), + "public_config_sha256": contracts.public_series_config_sha256(public_config), + "routing_control_sha256": contracts.routing_implementation_control_sha256( + implementation + ), + "runtime_fingerprint_sha256": digest(runtime), + "source_sha": RUN["source_sha"], "squash_sha256": "e" * 64, + "workload_id": workload_id, + } + series_id = identity.series_id(series_factors) + point_id = identity.point_id(series=series_id, 
tokens_per_rank=1) + sample_components = { + name: { + "availability": "measured", "sample_count": 512, + "trials": [[latency] * 8 for _ in range(64)], + } + for name, latency in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0)) + } + sample_sha = digest({"components": sample_components, "tokens_per_rank": 1}) + evidence_id = identity.evidence_id( + point=point_id, allocation=allocation_id, attempt=attempt_id, + sample_sha256=sample_sha, + ) + samples = { + "allocation_id": allocation_id, "attempt_id": attempt_id, "case_id": case_id, + "format": contracts.SAMPLES_FORMAT, + "points": [{ + "components": sample_components, "evidence_id": evidence_id, + "point_id": point_id, "sample_sha256": sample_sha, "tokens_per_rank": 1, + }], + "sampling": { + "iterations_per_trial": 8, "reduction": "cross-rank-max-per-iteration", + "trials": 64, + }, + "schema_version": 1, "series_id": series_id, + } + sample_bytes = contracts.canonical_json_bytes(samples) + oracle = { + "atol": 0.02, + "checks": {name: True for name in ( + "combine_values", "counts", "metadata", "multiplicity", "payload", + "source_set", "weights", + )}, + "combine_weight_semantics": "unweighted-rank-sum", + "contract": "expert-specific-transform-v1", "dispatch_sha256": "1" * 64, + "max_absolute_error": 0.0, "max_elementwise_relative_error": 0.0, + "max_relative_error": 0.0, "max_weight_error": 0.0, + "order_sha256": "2" * 64, "ordering_contract": "fixture-order-v1", + "passed": True, "receive_count": 1, "rtol": 0.05, + } + def pct(value: float) -> dict[str, float]: + return {name: value for name in ("p50", "p90", "p95", "p99")} + + def measured(value: float) -> dict: + return { + "availability": "measured", "origin": "measured", + "percentiles_us": pct(value), "sample_count": 512, + } + row = { + "anomalies": [], + "components": { + "combine": measured(20.0), "dispatch": measured(10.0), + "isolated_sum": { + "availability": "derived", "origin": "derived-percentile-sum", + "percentiles_us": pct(30.0), 
"sample_count": 0, + }, + "roundtrip": measured(40.0), + }, + "correctness": { + "contract": "expert-specific-transform-v1", "max_relative_error": 0.0, + "passed": True, + "rank_evidence": [{ + "input_unchanged": True, "order_stable": True, + "post_timing": copy.deepcopy(oracle), "pre_timing": copy.deepcopy(oracle), + "rank": 0, + }], + "scope": "dispatch-metadata-and-transformed-combine", + }, + "evidence_id": evidence_id, "global_tokens": 1, + "logical_bytes": {"combine": 2, "dispatch": 2, "roundtrip": 4}, + "point_id": point_id, + "receive": {"max": 1, "mean": 1.0, "min": 1, "total": 1}, + "routing": contracts._expected_routing_summary( + routing_rows, + routing_weights, + physical_experts=1, + ep_size=1, + tokens_per_rank=1, + gpus_per_node=1, + scale_up_domain=1, + ), + "sample_histograms": { + name: contracts._expected_histogram([value] * 512) + for name, value in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0)) + }, + "sample_sha256": sample_sha, + "token_rate_at_latency_percentile": pct(25_000.0), "tokens_per_rank": 1, + } + raw = { + "case": { + "attempt_ordinal": 1, "backend": backend, + "eplb": { + "enabled": False, "imbalance_after": None, "imbalance_before": None, + "mapping_hash": None, "max_replicas": None, "num_logical_experts": 1, + "num_physical_experts": 1, "num_redundant": 0, "planner": None, + "reference_tokens_per_rank": None, "replicated_experts": 0, + }, + "ep_size": 1, "mode": "normal", "phase": "decode", + "required_publication": "official", "resource_mode": "tuned", "runner": "fixture", + "shape": { + "activation_profile": "canonical-counter-source-v3", "dispatch_dtype": "bf16", + "eplb": False, "experts": 1, "experts_per_rank": 1, "hidden": 1, + "kernel_gen": kernel_generation, "num_logical_experts": 1, + "quant": { + "combine_accum_dtype": "fp32", "combine_input_dtype": "bf16", + "combine_output_dtype": "bf16", "combine_quant_mode": "none", + "scale_layout": None, + }, + "routing": "uniform", "topk": 1, + }, + "suite": 
"ep-core-v1", "workload_name": "deepseek-v3-v1", + }, + "format": contracts.RAW_FORMAT, "generated_at": "2026-07-04T00:00:00Z", + "identity": { + "allocation_factors": allocation_factors, "allocation_id": allocation_id, + "attempt_id": attempt_id, "attempt_ordinal": 1, "case_factors": case_factors, + "case_id": case_id, "series_factors": series_factors, "series_id": series_id, + }, + "implementation": implementation, + "measurement": { + "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2", + "conditioning": { + "contract": "fixed-phase-ramp-8-roundtrips-v1", + "ladder": [1, 2, 4, 8, 16, 32, 64, 128], + "roundtrips_per_shape": 8, + }, + "contract": "layout-and-dispatch-v1", + "rows": [row], + "sampling": { + "contract": "fixed-512-v1", "iterations_per_trial": 8, + "percentile_method": "nearest-rank", + "reduction": "cross-rank-max-per-iteration", "samples_per_component": 512, + "trials": 64, "warmup_iterations": 32, + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + }, + "source_allocation": "even", + }, + "outcome": { + "publication_status": "diagnostic", "reasons": [], "status": "success", + "validity": { + "anomaly_free": True, "execution_status": "complete", + "measurement_conformance": "conformant", "provenance_complete": True, + "resource_conformance": implementation["resource_profile"]["conformance_class"], + "sampling_conformance": "conformant", + "semantic_correctness": "pass", + "workload_identity": "consistent-across-ranks", + "workload_source": "canonical-serialized", + }, + }, + "provenance": { + "command": "run_ep", "distributed_launcher": "torchrun", "git_run": git_run, + "image": { + "arch": "amd64", "digest": "sha256:" + "d" * 64, + "digest_verified": True, "reference": "fixture:1", "squash_sha256": "e" * 64, + }, + "redaction": "sanitized-v1", + }, + "record_type": "case-attempt", + "runtime_fingerprint": runtime, + "sample_artifact": { + "bytes": len(sample_bytes), "format": 
contracts.SAMPLES_FORMAT, + "path": "samples.json", "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }, + "schema_version": 1, + "topology": { + "device_count": 1, "device_product": "Fixture GPU", "gpus_per_node": 1, + "nodes": 1, "placement": "packed", + "realized_placement": { + "gpus_per_node": 1, "nodes": 1, "ranks_per_node": 1, + "unique_local_ranks": True, "valid": True, + }, + "scale_up_domain": 1, "topology_class": "fixture", "transport": "nvlink", + "world_size": 1, + }, + "workload": { + "activation_generator": "collectivex-activation-counter-v3", + "activation_identity": hashlib.sha256( + b"counter|seed=67|hidden=1|gen=collectivex-activation-counter-v3" + ).hexdigest(), + "activation_profile": "canonical-counter-source-v3", "cross_rank_consistent": True, + "manifest_checksums": {member_id: member_checksums}, "members": [member_id], + "routing_generator": "collectivex-routing-counter-v3", "source": "canonical-serialized", + "trace_hashes": [routing_hash], + "trace_signature": hashlib.sha256(routing_hash.encode()).hexdigest(), + "workload_id": workload_id, + }, + } + return raw, samples + + +def _series(seed: str, backend: str, *, decision_grade: bool = False) -> tuple[dict, dict]: + case, allocation, attempt, series_id, point_id, evidence = _ids(seed) + allocations = [identity.allocation_id({"seed": seed, "run": run}) for run in range(3)] + eligibility = publisher._eligibility_record( + allocations if decision_grade else [allocation], + complete=decision_grade, + correct=True, + measured=True, + stable_ordering=True, + p50_ratio=1.01 if decision_grade else None, + p99_ratio=1.02 if decision_grade else None, + ) + component = _component(1.0 if backend == "deepep" else 1.2) + item = { + "series_id": series_id, + "label": f"H100 / {backend}", + "status": "decision-grade" if decision_grade else "diagnostic", + "case_ids": [case], + "allocation_ids": allocations if decision_grade else [allocation], + "model": "deepseek-v3-v1", + "suite": "ep-core-v1", + 
"phase": "decode", + "publication_tier": "official", + "backend": { + "id": backend, "label": publisher.BACKEND_LABELS[backend], + "role": "reference" if backend == "nccl-ep" else "library", + "generation": "nccl" if backend == "nccl-ep" else None, + "version": "1.0"}, + "build": { + "implementation_contract_sha256": hashlib.sha256(backend.encode()).hexdigest(), + "public_config_sha256": "0" * 64, + "routing_control_sha256": hashlib.sha256(backend.encode()).hexdigest(), + "runtime_fingerprint_sha256": "3" * 64, + "image_digest": "sha256:" + "1" * 64, + "source_sha": "a" * 40, + "squash_sha256": "2" * 64, + }, + "system": { + "sku": "h100-dgxc", "label": "NVIDIA H100", "vendor": "nvidia", + "topology_class": "h100-nvlink-island", "transport": "nvlink", + "world_size": 8, "ep_size": 8, "placement": "packed", + }, + "workload": { + "workload_id": identity.workload_id({"shape": "shared"}), + "hidden": 7168, "top_k": 8, "experts": 256, + "routing": "uniform", "eplb": False, + "dispatch_dtype": "bf16", "combine_dtype": "bf16", + "activation_profile": "canonical-counter-source-v3", + }, + "eplb": { + "enabled": False, "planner": None, "mapping_sha256": None, + "logical_experts": 256, "physical_experts": 256, + "redundant_experts": 0, "reference_tokens_per_rank": None, + "replicated_experts": 0, "max_replicas": None, + "imbalance_before": None, "imbalance_after": None, + }, + "resource": {"mode": "tuned", "profile": "profile-1", "comm_units_kind": "sm", "configured_units": 24}, + "measurement": { + "contract": "layout-and-dispatch-v1", "sampling_contract": "fixed-512-v1", + "iters": 8, "trials": 64, "warmups": 32, "samples_per_component": 512, + "headline_component": "roundtrip", "headline_percentile": "p99", + }, + "points": [{ + "point_id": point_id, "tokens_per_rank": 8, "global_tokens": 64, + "correct": True, + "routing": { + "fanout_mean": 4.0, "recv_tokens_max": 64, + "expert_load_cv": 0.5, "payload_rank_cv": 0.25, + "hotspot_ratio": 2.0, "empty_expert_count": 0, + 
"empty_rank_count": 0, "routed_copies": 256, + }, + "components": {"dispatch": None, "combine": None, + "roundtrip": component, "isolated_sum": None}, + "roundtrip_token_rate_at_latency_percentile": { + name: 64 / (latency * 1e-6) + for name, latency in component["latency_us"].items() + }, + "evidence_ids": [evidence], + }], + "eligibility": eligibility, + } + item["build"]["public_config_sha256"] = contracts.public_series_config_sha256( + publisher._public_series_config(item) + ) + case = identity.digest("case", publisher._public_case_factors(item)) + item["case_ids"] = [case] + build = item["build"] + series_id = identity.series_id({ + "backend": item["backend"]["id"], + "case_id": case, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build["implementation_contract_sha256"], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": item["workload"]["workload_id"], + }) + item["series_id"] = series_id + point_id = identity.point_id(series=series_id, tokens_per_rank=8) + item["points"][0]["point_id"] = point_id + attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1) + evidence = identity.evidence_id( + point=point_id, allocation=allocation, attempt=attempt, + sample_sha256=hashlib.sha256(seed.encode()).hexdigest(), + ) + item["points"][0]["evidence_ids"] = [evidence] + runs = { + str(run): {8: { + "latency_us": { + statistic: component["latency_us"][statistic] * (1 + run / 100) + for statistic in ("p50", "p99") + }, + "logical_payload_rate_gbps_at_latency_percentile": { + statistic: component["logical_payload_rate_gbps_at_latency_percentile"][statistic] / (1 + run / 100) + for statistic in ("p50", "p99") + }, + }} + for run in range(3) + } + internal = {"run_metrics": runs} + return item, 
def _dataset() -> dict:
    """Return a diagnostic ``collectivex.public.v1`` dataset with one series.

    The series is ``_series("one", "deepep")``. Its first case id, first
    allocation id, first point and that point's first evidence id are reused
    in the single coverage row and the single selected attempt row.
    """
    series_item, _ = _series("one", "deepep")
    case_id = series_item["case_ids"][0]
    allocation_id = series_item["allocation_ids"][0]
    attempt_id = identity.attempt_id(
        allocation=allocation_id, case=case_id, ordinal=1
    )
    first_point = series_item["points"][0]
    evidence_id = first_point["evidence_ids"][0]
    # One fixed timestamp is shared by the dataset and its attempt.
    timestamp = "2026-07-04T00:00:00Z"

    promotion = {
        "status": "diagnostic",
        "reason": None,
        "matrix_id": "d" * 64,
        "allocation_ids": [allocation_id],
        "required_allocations": 3,
        "requested_cases": 1,
        "terminal_cases": 1,
        "policy": "collectivex-decision-grade-v1",
    }
    coverage_row = {
        "case_id": case_id,
        "label": "case",
        "required": True,
        "sku": "h100-dgxc",
        "backend": "deepep",
        "phase": "decode",
        "disposition": "runnable",
        "selected_attempt_id": attempt_id,
        "outcome": "success",
        "failure_mode": None,
        "reason": None,
        "attempt_ids": [attempt_id],
    }
    attempt_row = {
        "attempt_id": attempt_id,
        "evidence": [{
            "evidence_id": evidence_id,
            "point_id": first_point["point_id"],
        }],
        "case_id": case_id,
        "allocation_id": allocation_id,
        "run_id": "1",
        "run_attempt": 1,
        "attempt_index": 1,
        "selected": True,
        "outcome": "success",
        "failure_mode": None,
        "reason": None,
        "series_id": series_item["series_id"],
        "completed_at": timestamp,
    }
    return {
        "format": "collectivex.public.v1",
        "schema_version": 1,
        "generated_at": timestamp,
        "source_bundle_ids": ["c" * 64],
        "promotion": promotion,
        "coverage": [coverage_row],
        "attempts": [attempt_row],
        "series": [series_item],
        # No decisions are derived for the diagnostic fixture.
        "cohorts": [],
        "rankings": [],
        "recommendations": [],
        "sensitivities": [],
    }
decision_grade=True) + if peer_sku: + platform = publisher.capability.PLATFORMS[peer_sku] + item["system"].update({ + "sku": peer_sku, + "label": f"NVIDIA {platform['product'].upper()}", + "topology_class": platform["topology_class"], + "transport": platform["transport"], + }) + if reference: + item["backend"]["role"] = "reference" + if seed.startswith("routing-zipf"): + item["suite"] = "ep-routing-v1" + item["publication_tier"] = "comparable-experimental" + item["workload"]["routing"] = "zipf" + if seed == "routing-zipf-eplb": + item["workload"]["eplb"] = True + plan = contracts._expected_eplb_plan( + "zipf", 8, 256, 288, item["system"]["ep_size"], 67, 2048 + ) + item["eplb"] = { + "enabled": True, "planner": "greedy-rank-major-v1", + "mapping_sha256": contracts.eplb_contract.mapping_hash(plan), + "logical_experts": 256, "physical_experts": 288, + "redundant_experts": 32, "reference_tokens_per_rank": 2048, + "replicated_experts": plan["replicated_experts"], + "max_replicas": plan["max_replicas"], + "imbalance_before": plan["imbalance_before"], + "imbalance_after": plan["imbalance_after"], + } + item["build"]["implementation_contract_sha256"] = "8" * 64 + case_id = identity.digest("case", publisher._public_case_factors(item)) + item["case_ids"] = [case_id] + build = item["build"] + build["public_config_sha256"] = contracts.public_series_config_sha256( + publisher._public_series_config(item) + ) + item["series_id"] = identity.series_id({ + "backend": item["backend"]["id"], + "case_id": case_id, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build["implementation_contract_sha256"], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": item["workload"]["workload_id"], + }) + point = item["points"][0] + 
point["point_id"] = identity.point_id( + series=item["series_id"], tokens_per_rank=point["tokens_per_rank"] + ) + case_attempts = [] + evidence_ids = [] + for run_id, allocation_id in enumerate(item["allocation_ids"], 1): + attempt_id = identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=1 + ) + evidence_id = identity.evidence_id( + point=point["point_id"], allocation=allocation_id, + attempt=attempt_id, + sample_sha256=hashlib.sha256(f"{seed}-{run_id}".encode()).hexdigest(), + ) + attempts.append({ + "attempt_id": attempt_id, + "evidence": [{"evidence_id": evidence_id, "point_id": point["point_id"]}], + "case_id": case_id, "allocation_id": allocation_id, + "run_id": str(run_id), "run_attempt": 1, + "attempt_index": 1, "selected": True, + "outcome": "success", "failure_mode": None, "reason": None, + "series_id": item["series_id"], + "completed_at": "2026-07-04T00:00:00Z", + }) + case_attempts.append(attempt_id) + evidence_ids.append(evidence_id) + point["evidence_ids"] = evidence_ids + coverage.append({ + "case_id": case_id, "label": seed, "required": True, + "sku": item["system"]["sku"], "backend": backend, + "phase": item["phase"], "disposition": "runnable", + "selected_attempt_id": case_attempts[-1], "outcome": "success", + "failure_mode": None, "reason": None, "attempt_ids": case_attempts, + }) + series.append(item) + internals[item["series_id"]] = internal + + unsupported_case = identity.digest("case", {"seed": "planned-unsupported"}) + unsupported_attempts = [] + for run_id in range(1, 4): + allocation_id = identity.allocation_id( + {"seed": "planned-unsupported", "run": run_id} + ) + attempt_id = identity.attempt_id( + allocation=allocation_id, case=unsupported_case, ordinal=1 + ) + attempts.append({ + "attempt_id": attempt_id, "evidence": [], "case_id": unsupported_case, + "allocation_id": allocation_id, "run_id": str(run_id), + "run_attempt": 1, + "attempt_index": 1, "selected": True, "outcome": "unsupported", + "failure_mode": 
"capability", "reason": "backend-platform-unsupported", + "series_id": None, "completed_at": "2026-07-04T00:00:00Z", + }) + unsupported_attempts.append(attempt_id) + coverage.append({ + "case_id": unsupported_case, "label": "planned unsupported", "required": True, + "sku": "mi355x", "backend": "deepep", "phase": "decode", + "disposition": "unsupported", "selected_attempt_id": unsupported_attempts[-1], + "outcome": "unsupported", "failure_mode": "capability", + "reason": "backend-platform-unsupported", "attempt_ids": unsupported_attempts, + }) + cohorts, rankings, recommendations, sensitivities = publisher.build_decisions( + series, internals + ) + return { + "format": "collectivex.public.v1", "schema_version": 1, + "generated_at": "2026-07-04T00:00:00Z", + "source_bundle_ids": ["a" * 64, "b" * 64, "c" * 64], + "promotion": { + "status": "promoted", "reason": None, + "matrix_id": publisher.CANONICAL_FULL_V1_MATRIX_SHA256, + "allocation_ids": sorted({item["allocation_id"] for item in attempts}), + "required_allocations": 3, "requested_cases": len(coverage), + "terminal_cases": len(coverage), "policy": "collectivex-decision-grade-v1", + }, + "coverage": sorted(coverage, key=lambda item: item["case_id"]), + "attempts": sorted(attempts, key=lambda item: item["attempt_id"]), + "series": sorted(series, key=lambda item: item["series_id"]), + "cohorts": cohorts, "rankings": rankings, + "recommendations": recommendations, "sensitivities": sensitivities, + } + + +def _cohort_counts(dataset: dict) -> dict[str, int]: + return { + kind: sum(item["kind"] == kind for item in dataset["cohorts"]) + for kind in ("library", "system", "routing") + } + + +class PublisherTest(unittest.TestCase): + def test_terminal_allocation_and_source_status_are_bound(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + path = next(artifact.glob("*.json")) + terminal = contracts.strict_load(path) + 
self.assertIs(contracts.validate_terminal_document(terminal), terminal) + self.assertEqual( + contracts.validate_delivery( + [str(path)], str(matrix), disposition="unsupported" + ), + 1, + ) + + for control_sha256 in (None, "0" * 64): + broken = copy.deepcopy(terminal) + broken["provenance"]["control_sha256"] = control_sha256 + path.write_text(json.dumps(broken)) + with self.assertRaisesRegex(contracts.ContractError, "exact control document"): + contracts.validate_delivery( + [str(path)], str(matrix), disposition="unsupported" + ) + path.write_text(json.dumps(terminal)) + + for field in ( + "artifact", "job", "repo", "run_attempt", "run_id", "source_sha", "runner" + ): + broken = copy.deepcopy(terminal) + broken["identity"]["allocation_factors"][field] = f"forged-{field}" + allocation_id = identity.allocation_id( + broken["identity"]["allocation_factors"] + ) + broken["identity"]["allocation_id"] = allocation_id + broken["identity"]["attempt_id"] = identity.attempt_id( + allocation=allocation_id, + case=broken["identity"]["case_id"], + ordinal=broken["identity"]["attempt_ordinal"], + ) + with self.assertRaisesRegex( + contracts.ContractError, "allocation factors differ" + ): + contracts.validate_terminal_document(broken) + + broken = copy.deepcopy(terminal) + broken["outcome"]["status"] = "failed" + with self.assertRaisesRegex(contracts.ContractError, "source and outcome"): + contracts.validate_terminal_document(broken) + broken = copy.deepcopy(terminal) + broken["provenance"]["source"] = "runtime-emitter" + with self.assertRaisesRegex(contracts.ContractError, "source and outcome"): + contracts.validate_terminal_document(broken) + + for path_parts, replacement in ( + (("provenance", "source"), "unregistered-producer"), + (("outcome", "failure_mode"), "unsupported-capability"), + (("outcome", "reason"), "unregistered-capability"), + ): + with self.subTest(path=path_parts): + broken = copy.deepcopy(terminal) + broken[path_parts[0]][path_parts[1]] = replacement + with 
self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", broken) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(broken) + + runtime_allocation = copy.deepcopy( + terminal["identity"]["allocation_factors"] + ) + runtime_allocation["runner"] = terminal["identity"]["case_factors"]["sku"] + runtime = contracts.make_terminal_document( + allocation_factors=runtime_allocation, + attempt_ordinal=1, + case=terminal["case"], + case_factors=terminal["identity"]["case_factors"], + control_sha256=terminal["provenance"]["control_sha256"], + failure_mode="setup", + generated_at=terminal["generated_at"], + git_run=terminal["provenance"]["git_run"], + reason="launcher-setup-failed", + return_code=1, + source="runtime-emitter", + status="failed", + expected_case_id=terminal["identity"]["case_id"], + ) + publisher._schema("terminal-outcome-v1.schema.json", runtime) + broken = copy.deepcopy(runtime) + broken["outcome"]["reason"] = "backend-setup-failed" + with self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", broken) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(broken) + + def test_post_emit_demotion_uses_closed_failure_taxonomy(self) -> None: + raw, _ = _native_fixture() + expected = { + 5: "runtime-identity", + 6: "execution", + 124: "timeout", + 137: "execution", + 134: "execution", + 9: "execution", + } + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + for return_code, failure_mode in expected.items(): + with self.subTest(return_code=return_code): + path = root / f"attempt-{return_code}.json" + path.write_text(json.dumps(raw)) + terminal = contracts.demote_raw_attempt(path, return_code) + self.assertEqual( + terminal["outcome"], + { + "failure_mode": failure_mode, + "reason": "post-emit-distributed-command-failed", + "return_code": return_code, + "status": "failed", + }, + ) 
def test_artifact_safety_accepts_current_v1_fixtures(self) -> None:
    # The resolved all-backend matrix, the native raw/sample pair and both
    # dataset fixtures are handed to the publication-safety check in one
    # call; the test passes when that call returns without raising.
    raw, samples = _native_fixture()
    assert_safe = publisher.artifact_safety.assert_publication_safe
    documents = [sweep_matrix.resolve_matrix(backends="all"), raw, samples]
    documents.append(_dataset())
    documents.append(_promoted_dataset())
    assert_safe(documents)
def test_hybrid_raw_binds_realized_config_and_every_rank_artifact(self) -> None:
    # Baseline: the untouched deepep-hybrid fixture passes both the JSON
    # schema and the semantic validator.
    raw, samples = _native_fixture("deepep-hybrid")
    publisher._schema("raw-case-v1.schema.json", raw)
    self.assertIs(contracts.validate_raw_document(raw, samples), raw)

    def override_config(**fields):
        # Build a mutation that overwrites realized_config entries in place.
        def apply(provenance):
            provenance["realized_config"].update(**fields)
        return apply

    def add_extra_rank_artifact(provenance):
        # Append a rank-1 artifact entry to every JIT shared object.
        for shared_object in provenance["jit_shared_objects"]:
            shared_object["rank_artifacts"].append({
                "bytes": 1, "rank": 1, "sha256": "9" * 64,
            })

    mutations = {
        "hidden_dim": override_config(hidden_dim=2),
        "experts_per_rank": override_config(num_of_experts_per_rank=2),
        "ranks_per_node": override_config(num_of_ranks_per_node=2),
        "num_nodes": override_config(num_of_nodes=2),
        "token_data_type": override_config(token_data_type="UINT8"),
        "rank_coverage": add_extra_rank_artifact,
    }
    expected_error = "DeepEP Hybrid realized config/JIT evidence differs"
    for name, mutate in mutations.items():
        with self.subTest(name=name):
            # Each mutation runs against its own deep copy of the fixture.
            changed = copy.deepcopy(raw)
            mutate(changed["implementation"]["provenance"])
            with self.assertRaisesRegex(contracts.ContractError, expected_error):
                contracts.validate_raw_document(changed, samples)
"tokens_per_rank": sample_point["tokens_per_rank"], + } + sample_sha = hashlib.sha256( + contracts.canonical_json_bytes(sample_core) + ).hexdigest() + point_id = sample_point["point_id"] + evidence_id = identity.evidence_id( + point=point_id, + allocation=anomalous_raw["identity"]["allocation_id"], + attempt=anomalous_raw["identity"]["attempt_id"], + sample_sha256=sample_sha, + ) + sample_point.update({"sample_sha256": sample_sha, "evidence_id": evidence_id}) + changed = anomalous_raw["measurement"]["rows"][0] + changed["sample_sha256"] = sample_sha + changed["evidence_id"] = evidence_id + changed["components"]["roundtrip"]["percentiles_us"] = { + name: 100.0 for name in ("p50", "p90", "p95", "p99") + } + changed["token_rate_at_latency_percentile"] = { + name: 10_000.0 for name in ("p50", "p90", "p95", "p99") + } + changed["sample_histograms"]["roundtrip"] = contracts._expected_histogram( + [100.0] * 512 + ) + changed["anomalies"] = contracts._expected_anomalies(1, changed["components"]) + anomalous_raw["outcome"]["validity"]["anomaly_free"] = False + sample_bytes = contracts.canonical_json_bytes(anomalous_samples) + anomalous_raw["sample_artifact"].update({ + "bytes": len(sample_bytes), + "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }) + self.assertIs( + contracts.validate_raw_document(anomalous_raw, anomalous_samples), + anomalous_raw, + ) + changed["anomalies"] = [] + anomalous_raw["outcome"]["validity"]["anomaly_free"] = True + with self.assertRaisesRegex(contracts.ContractError, "anomalies"): + contracts.validate_raw_document(anomalous_raw, anomalous_samples) + + def test_native_contract_rejects_every_schema_only_nested_mutation(self) -> None: + raw, samples = _native_fixture() + self.assertIs(contracts.validate_raw_document(raw, samples), raw) + + def locate(document: object, path: tuple[object, ...]) -> object: + value = document + for part in path: + value = value[part] # type: ignore[index] + return value + + def reject_raw(document: dict) -> 
None: + with self.assertRaises(publisher.PublisherError): + publisher._schema("raw-case-v1.schema.json", document) + with self.assertRaises(contracts.ContractError): + contracts.validate_raw_document(document, samples) + + required_fields = ( + (("measurement", "rows", 0, "receive"), "total"), + (("measurement", "rows", 0, "routing"), "fanout_mean"), + (("measurement", "rows", 0, "routing", "source_token_stats"), "ranks"), + (("measurement", "rows", 0, "sample_histograms"), "roundtrip"), + (("measurement", "rows", 0, "sample_histograms", "roundtrip"), "n"), + (("runtime_fingerprint", "accelerator_runtime"), "kind"), + (("runtime_fingerprint", "collective_library"), "kind"), + (("runtime_fingerprint", "framework"), "kind"), + ) + for path, required in required_fields: + with self.subTest(path=path, mutation="missing"): + broken = copy.deepcopy(raw) + del locate(broken, path)[required] # type: ignore[index] + reject_raw(broken) + with self.subTest(path=path, mutation="extra"): + broken = copy.deepcopy(raw) + locate(broken, path)["unexpected"] = None # type: ignore[index] + reject_raw(broken) + + invalid_values = ( + (("measurement", "rows", 0, "receive", "mean"), "one"), + (("measurement", "rows", 0, "routing", "fanout_mean"), "one"), + (("measurement", "rows", 0, "sample_histograms", "roundtrip", "bins"), 0), + (("provenance", "image", "arch"), "AMD64"), + (("runtime_fingerprint", "accelerator_runtime", "kind"), "rocm"), + ) + for path, invalid in invalid_values: + with self.subTest(path=path, mutation="value"): + broken = copy.deepcopy(raw) + parent = locate(broken, path[:-1]) + parent[path[-1]] = invalid # type: ignore[index] + reject_raw(broken) + + def reject_samples(document: dict) -> None: + with self.assertRaises(publisher.PublisherError): + publisher._schema("samples-v1.schema.json", document) + with self.assertRaises(contracts.ContractError): + contracts.validate_samples_document(document) + + for path, required in ( + (("points", 0), "evidence_id"), + 
def test_terminal_contract_and_schema_reject_the_same_shape_gaps(self) -> None:
    with tempfile.TemporaryDirectory() as temporary:
        # Load the first JSON document written by the unsupported-delivery
        # fixture and confirm it is valid before mutating copies of it.
        _, artifact = _unsupported_delivery(Path(temporary).resolve())
        terminal = contracts.strict_load(next(artifact.glob("*.json")))
        publisher._schema("terminal-outcome-v1.schema.json", terminal)
        self.assertIs(contracts.validate_terminal_document(terminal), terminal)

        cases = (
            (("outcome", "failure_mode"), "Not Safe"),
            (("outcome", "reason"), "x" * 241),
            (("provenance", "source"), "Not Safe"),
            (("provenance", "git_run", "ref"), ""),
        )
        for path, invalid in cases:
            with self.subTest(path=path):
                broken = copy.deepcopy(terminal)
                # Walk to the container holding the final key, then
                # overwrite that key with the invalid value.
                *parents, leaf = path
                container = broken
                for key in parents:
                    container = container[key]
                container[leaf] = invalid
                # Both the schema check and the contract validator must
                # refuse the same mutated document.
                with self.assertRaises(publisher.PublisherError):
                    publisher._schema("terminal-outcome-v1.schema.json", broken)
                with self.assertRaises(contracts.ContractError):
                    contracts.validate_terminal_document(broken)
self.assertTrue(contracts.quarantine_invalid_attempt(root / "a01.json")) + valid = copy.deepcopy(raw) + valid["sample_artifact"].update({ + "path": "a02.samples.json", "bytes": len(sample_bytes), + "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }) + (root / "a02.samples.json").write_bytes(sample_bytes) + (root / "a02.json").write_bytes(contracts.canonical_json_bytes(valid)) + paths = sorted(str(path) for path in root.glob("*.json")) + self.assertEqual(contracts.validate_attempt_paths(paths), 1) + self.assertTrue((root / "a01.json.quarantine").is_file()) + self.assertTrue((root / "a01.samples.json.quarantine").is_file()) + + def test_ingest_archives_first_and_publishes_latest_attempt(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + self.assertEqual(len(summarize.load_results(str(artifact), None, None)), 1) + result = publisher.ingest_command(_args(root / "store", matrix, artifact)) + store = publisher.Store(root / "store") + pointer = store.verify_channel("latest-attempt") + self.assertEqual(result["status"], "accepted") + self.assertEqual(pointer["dataset"]["sha256"], result["dataset_sha256"]) + self.assertTrue((store.incoming / result["incoming_id"] / "COMPLETE").is_file()) + self.assertTrue((store.bundles / result["bundle_id"] / "COMPLETE").is_file()) + self.assertFalse((store.channels / "dev-latest.json").exists()) + self.assertEqual(os.stat(store.private).st_mode & 0o777, 0o700) + self.assertEqual(os.stat(store.public).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(store.bundles / result["bundle_id"]).st_mode & 0o777, 0o500) + dataset_dir = store.datasets / result["dataset_sha256"] + self.assertEqual(os.stat(dataset_dir).st_mode & 0o777, 0o555) + self.assertEqual(os.stat(dataset_dir / "dataset.json").st_mode & 0o777, 0o444) + + def test_repeated_ingest_is_content_idempotent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = 
Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + args = _args(root / "store", matrix, artifact) + first = publisher.ingest_command(args) + store = publisher.Store(root / "store") + pointer_before = (store.channels / "latest-attempt.json").read_bytes() + second = publisher.ingest_command(args) + self.assertEqual(second, first) + self.assertEqual( + (store.channels / "latest-attempt.json").read_bytes(), pointer_before + ) + self.assertEqual(len(list(store.incoming.iterdir())), 1) + self.assertEqual(len(list(store.bundles.iterdir())), 1) + self.assertEqual(len(list(store.datasets.iterdir())), 1) + bundle = publisher.strict_load( + store.bundles / first["bundle_id"] / "bundle.json" + ) + terminal = publisher.strict_load(next(artifact.glob("*.json"))) + self.assertEqual(bundle["created_at"], terminal["generated_at"]) + + def test_dataset_is_invariant_to_bundle_argument_order(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + store_root = root / "store" + bundle_ids = [] + for run_id in (9, 11, 10): + run = {**RUN, "run_id": str(run_id)} + delivery = root / f"run-{run_id}" + delivery.mkdir() + matrix, artifact = _unsupported_delivery(delivery, run=run) + result = publisher.ingest_command( + _args(store_root, matrix, artifact, run=run) + ) + bundle_ids.append(result["bundle_id"]) + datasets = [ + publisher.build_dataset( + publisher.Store(store_root), order, promote=False, + ) + for order in itertools.permutations(bundle_ids) + ] + self.assertTrue(all(dataset == datasets[0] for dataset in datasets[1:])) + self.assertEqual(datasets[0]["generated_at"], "2026-07-04T00:00:00Z") + selected = datasets[0]["coverage"][0]["selected_attempt_id"] + selected_attempt = next( + item for item in datasets[0]["attempts"] + if item["attempt_id"] == selected + ) + self.assertEqual(selected_attempt["run_id"], "11") + + def test_diagnostic_dataset_orders_reruns_by_run_attempt(self) -> None: + with 
tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + store_root = root / "store" + bundle_ids = [] + for run_attempt in (1, 2): + run = {**RUN, "run_attempt": run_attempt} + delivery = root / f"attempt-{run_attempt}" + delivery.mkdir() + matrix, artifact = _unsupported_delivery(delivery, run=run) + result = publisher.ingest_command( + _args(store_root, matrix, artifact, run=run) + ) + bundle_ids.append(result["bundle_id"]) + dataset = publisher.build_dataset( + publisher.Store(store_root), bundle_ids, promote=False + ) + selected_id = dataset["coverage"][0]["selected_attempt_id"] + selected = next( + item for item in dataset["attempts"] + if item["attempt_id"] == selected_id + ) + self.assertEqual(selected["run_attempt"], 2) + + def test_promotion_requires_every_runnable_case_to_succeed_in_every_bundle(self) -> None: + cases = { + "runnable": {"_disposition": "runnable"}, + "planned-unsupported": {"_disposition": "unsupported"}, + } + bundles = [] + for _ in range(3): + runnable = { + "identity": {"case_id": "runnable"}, + "outcome": {"status": "success"}, + } + unsupported = { + "identity": {"case_id": "planned-unsupported"}, + "outcome": {"status": "unsupported"}, + } + bundles.append({ + "selected": {"runnable": runnable, "planned-unsupported": unsupported}, + "documents": {"runnable": runnable, "planned-unsupported": unsupported}, + }) + publisher._require_runnable_promotion_success(bundles, cases) + + for status in ("failed", "invalid", "unsupported", "diagnostic"): + with self.subTest(status=status): + broken = copy.deepcopy(bundles) + broken[1]["selected"]["runnable"]["outcome"]["status"] = status + with self.assertRaisesRegex( + publisher.PublisherError, "every runnable matrix case" + ): + publisher._require_runnable_promotion_success(broken, cases) + + broken = copy.deepcopy(bundles) + broken[1]["documents"]["retry"] = { + "identity": {"case_id": "runnable"}, + "outcome": {"status": "failed"}, + } + with 
self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"): + publisher._require_runnable_promotion_success(broken, cases) + + def test_promoted_public_dataset_rejects_failed_retry_history(self) -> None: + dataset = _promoted_dataset() + successful = next( + item for item in dataset["attempts"] + if item["outcome"] == "success" + ) + failed = copy.deepcopy(successful) + old_attempt_id = successful["attempt_id"] + successful["attempt_index"] = 2 + successful["attempt_id"] = identity.attempt_id( + allocation=successful["allocation_id"], case=successful["case_id"], ordinal=2 + ) + failed.update({ + "attempt_id": old_attempt_id, + "attempt_index": 1, + "outcome": "failed", + "failure_mode": "execution", + "reason": "execution-failed", + "series_id": None, + "selected": False, + "evidence": [], + }) + dataset["attempts"].append(failed) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + coverage = next( + item for item in dataset["coverage"] + if item["case_id"] == failed["case_id"] + ) + coverage["attempt_ids"] = [ + successful["attempt_id"] if value == old_attempt_id else value + for value in coverage["attempt_ids"] + ] + coverage["attempt_ids"].append(failed["attempt_id"]) + coverage["attempt_ids"].sort() + if coverage["selected_attempt_id"] == old_attempt_id: + coverage["selected_attempt_id"] = successful["attempt_id"] + + fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"]) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"): + publisher.validate_public_dataset(dataset) + + def test_unselected_success_does_not_reference_an_unpublished_series(self) -> None: + raw, _ = _native_fixture() + retained = publisher._public_attempt(raw, selected=False) + selected = publisher._public_attempt(raw, selected=True) + self.assertEqual(retained["outcome"], "success") + 
self.assertIsNone(retained["series_id"]) + self.assertEqual(selected["series_id"], raw["identity"]["series_id"]) + + def test_public_dataset_selects_latest_derived_retry(self) -> None: + dataset = _dataset() + first = dataset["attempts"][0] + second = copy.deepcopy(first) + second.update({ + "attempt_id": identity.attempt_id( + allocation=first["allocation_id"], case=first["case_id"], ordinal=2 + ), + "attempt_index": 2, + "selected": False, + "series_id": None, + "evidence": [], + }) + dataset["attempts"].append(second) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + dataset["coverage"][0]["attempt_ids"].append(second["attempt_id"]) + dataset["coverage"][0]["attempt_ids"].sort() + with self.assertRaisesRegex(publisher.PublisherError, "select the latest retry"): + publisher.validate_public_dataset(dataset) + + second["attempt_id"] = identity.digest("attempt", {"not": "derived"}) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + dataset["coverage"][0]["attempt_ids"] = [ + item["attempt_id"] for item in dataset["attempts"] + ] + with self.assertRaisesRegex(publisher.PublisherError, "retry identity differs"): + publisher.validate_public_dataset(dataset) + + def test_promotion_requires_an_eligible_cohort_for_every_comparison_kind(self) -> None: + stable_fast, stable_fast_internal = _series( + "stable-fast", "deepep", decision_grade=True + ) + stable_slow, stable_slow_internal = _series( + "stable-slow", "uccl", decision_grade=True + ) + unstable_fast, unstable_fast_internal = _series( + "unstable-fast", "deepep", decision_grade=True + ) + unstable_slow, unstable_slow_internal = _series( + "unstable-slow", "uccl", decision_grade=True + ) + unstable_fast["phase"] = unstable_slow["phase"] = "prefill" + unstable_fast["series_id"] = identity.series_id({"test": "unstable-fast"}) + unstable_slow["series_id"] = identity.series_id({"test": "unstable-slow"}) + for statistic in ("p50", "p99"): + 
unstable_slow_internal["run_metrics"]["1"][8]["latency_us"][statistic] = ( + unstable_fast_internal["run_metrics"]["1"][8]["latency_us"][statistic] + / 2 + ) + unstable_slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = ( + unstable_fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] + * 2 + ) + series = [stable_fast, stable_slow, unstable_fast, unstable_slow] + internals = { + stable_fast["series_id"]: stable_fast_internal, + stable_slow["series_id"]: stable_slow_internal, + unstable_fast["series_id"]: unstable_fast_internal, + unstable_slow["series_id"]: unstable_slow_internal, + } + cohorts, _, _, _ = publisher.build_decisions(series, internals) + eligible = [item for item in cohorts if item["eligibility"]["decision_grade"]] + ineligible = [item for item in cohorts if not item["eligibility"]["decision_grade"]] + self.assertEqual({item["kind"] for item in eligible}, {"library"}) + self.assertTrue(ineligible) + anchor_series = [ + { + "series_id": name, + "workload": {"routing": routing, "eplb": eplb}, + "build": {"implementation_contract_sha256": "1" * 64}, + } + for name, routing, eplb in ( + ("uniform", "uniform", False), + ("zipf", "zipf", False), + ("zipf-eplb", "zipf", True), + ) + ] + required = eligible + [ + { + "kind": kind, + "eligibility": {"decision_grade": True}, + **({"series_ids": [item["series_id"] for item in anchor_series]} + if kind == "routing" else {}), + } + for kind in publisher.REQUIRED_COHORT_KINDS + if kind != "library" + ] + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", {} + ), mock.patch.object( + publisher, "_expected_chip_cohort_count", return_value=1 + ): + publisher._require_promotion_cohorts( + required + ineligible, anchor_series + ) + for kind in publisher.REQUIRED_COHORT_KINDS: + with self.subTest(missing_kind=kind), self.assertRaisesRegex( + publisher.PublisherError, rf"cohort kinds:.*{kind}" + ): + 
publisher._require_promotion_cohorts([ + item for item in required + ineligible + if item["kind"] != kind or not item["eligibility"]["decision_grade"] + ], anchor_series) + + def test_promotion_requires_exact_counts_and_routing_anchors(self) -> None: + dataset = _promoted_dataset() + counts = _cohort_counts(dataset) + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + routing = next( + item for item in dataset["cohorts"] if item["kind"] == "routing" + ) + eplb = next( + item for item in dataset["series"] + if item["series_id"] in routing["series_ids"] + and item["workload"]["eplb"] + ) + eplb["workload"]["eplb"] = False + with self.assertRaisesRegex(publisher.PublisherError, "exact uniform"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + dataset = _promoted_dataset() + routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing") + zipf = next( + item for item in dataset["series"] + if item["series_id"] in routing["series_ids"] + and item["workload"]["routing"] == "zipf" + and not item["workload"]["eplb"] + ) + zipf["build"]["implementation_contract_sha256"] = "f" * 64 + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ), self.assertRaisesRegex(publisher.PublisherError, "identical off-EPLB"): + publisher._require_promotion_cohorts(dataset["cohorts"], dataset["series"]) + + wrong_counts = {**counts, "library": counts["library"] + 1} + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", wrong_counts + ), self.assertRaisesRegex(publisher.PublisherError, "exactly"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + def test_promotion_requires_every_derived_chip_cohort_to_be_stable(self) -> None: + dataset = _promoted_dataset() + chip = next(item for item in dataset["cohorts"] if item["kind"] == 
"chip") + self.assertEqual( + publisher._expected_chip_cohort_count(dataset["series"]), + sum(item["kind"] == "chip" for item in dataset["cohorts"]), + ) + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset) + ): + missing = [item for item in dataset["cohorts"] if item is not chip] + with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"): + publisher._require_promotion_cohorts(missing, dataset["series"]) + + chip["eligibility"]["decision_grade"] = False + with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + def test_promotion_rejects_more_than_three_bundles(self) -> None: + bundles = { + str(run_id): { + "id": str(run_id), "cases": [], + "manifest": { + "matrix": {"sha256": publisher.CANONICAL_FULL_V1_MATRIX_SHA256}, + "run": {"run_id": str(run_id), "run_attempt": 1}, + }, + } + for run_id in range(1, 5) + } + with mock.patch.object( + publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id] + ), self.assertRaisesRegex(publisher.PublisherError, "three independent"): + publisher.build_dataset(object(), list(bundles), promote=True) + + dataset = _promoted_dataset() + dataset["source_bundle_ids"].append("d" * 64) + counts = _cohort_counts(dataset) + with mock.patch.object( + publisher, + "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(dataset["coverage"]), + ), mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ), self.assertRaisesRegex(publisher.PublisherError, "complete coverage"): + publisher.validate_public_dataset(dataset) + + def test_standalone_promotion_binds_matrix_and_requested_dispositions(self) -> None: + dataset = _promoted_dataset() + fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"]) + with self.assertRaisesRegex( + publisher.PublisherError, "canonical 
case/disposition catalog" + ): + publisher.validate_public_dataset(dataset) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), mock.patch.object( + publisher, + "REQUIRED_PROMOTION_COHORT_COUNTS", + _cohort_counts(dataset), + ): + publisher.validate_public_dataset(dataset) + + diagnostic = copy.deepcopy(dataset) + item = diagnostic["series"][0] + item["status"] = "diagnostic" + item["eligibility"].update({ + "decision_grade": False, + "stable_p50": False, + "p50_max_min_ratio": 1.20, + "reasons": ["unstable-p50"], + }) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), mock.patch.object( + publisher, + "REQUIRED_PROMOTION_COHORT_COUNTS", + _cohort_counts(dataset), + ), self.assertRaisesRegex( + publisher.PublisherError, "unstable or incomplete required series" + ): + publisher.validate_public_dataset(diagnostic) + + broken = copy.deepcopy(dataset) + broken["promotion"]["matrix_id"] = "d" * 64 + with self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"): + publisher.validate_public_dataset(broken) + + for original, replacement in (("runnable", "unsupported"), + ("unsupported", "runnable")): + with self.subTest(original=original): + broken = copy.deepcopy(dataset) + item = next( + coverage for coverage in broken["coverage"] + if coverage["disposition"] == original + ) + item["disposition"] = replacement + with mock.patch.object( + publisher, + "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(broken["coverage"]), + ), self.assertRaisesRegex( + publisher.PublisherError, "requested dispositions" + ): + publisher.validate_public_dataset(broken) + + def test_workflow_matrix_and_catalog_digests_do_not_drift(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + matrix_path = Path(temporary) / "matrix_full.json" + result = subprocess.run( + [ + sys.executable, str(ROOT / "sweep_matrix.py"), + "--suites", 
"all", "--max-cases", "128", + "--backends", "all", "--out", str(matrix_path), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual( + hashlib.sha256(matrix_path.read_bytes()).hexdigest(), + publisher.CANONICAL_FULL_V1_MATRIX_SHA256, + ) + matrix = contracts.strict_load(matrix_path) + coverage = [ + { + "case_id": item["case"]["case_id"], + "disposition": item["disposition"], + } + for item in matrix["requested_cases"] + ] + self.assertEqual( + publisher._case_disposition_catalog_sha256(coverage), + publisher.CANONICAL_FULL_V1_CASE_CATALOG_SHA256, + ) + self.assertEqual( + ( + len(matrix["include"]), len(coverage), + sum(item["disposition"] == "runnable" for item in coverage), + sum(item["disposition"] == "unsupported" for item in coverage), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + ), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + if item["disposition"] == "runnable" + ), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + if item["disposition"] == "unsupported" + ), + ), + (38, 360, 228, 132, 840, 532, 308), + ) + library: dict[tuple, set[str]] = {} + system: dict[tuple, set[str]] = {} + routing: dict[tuple, list[tuple[str, bool]]] = {} + for requested in matrix["requested_cases"]: + if requested["disposition"] != "runnable": + continue + case = requested["case"] + shape = tuple( + case[field] + for field in ("workload", "hidden", "topk", "experts", "ep", "phase") + ) + route = (case["routing"], case["eplb"]) + if case["backend"] != "nccl-ep": + library.setdefault((requested["sku"], shape, route), set()).add( + case["backend"] + ) + else: + system.setdefault((shape, route), set()).add(requested["sku"]) + routing.setdefault( + (requested["sku"], case["backend"], shape), [] + ).append(route) + anchors = {("uniform", False), ("zipf", False), ("zipf", True)} + self.assertEqual( + { + 
"library": sum(len(variants) >= 2 for variants in library.values()), + "system": sum(len(variants) >= 2 for variants in system.values()), + "routing": sum( + len(variants) == 3 and set(variants) == anchors + for variants in routing.values() + ), + }, + publisher.REQUIRED_PROMOTION_COHORT_COUNTS, + ) + + def test_build_promotion_requires_canonical_full_matrix(self) -> None: + bundles = { + str(run_id): { + "id": str(run_id), "cases": [], + "manifest": { + "matrix": {"sha256": "d" * 64}, + "run": {"run_id": str(run_id), "run_attempt": 1}, + }, + } + for run_id in range(1, 4) + } + with mock.patch.object( + publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id] + ), self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"): + publisher.build_dataset(object(), list(bundles), promote=True) + + def test_rejection_updates_latest_but_never_dev_latest(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + sentinel = b"existing-promoted-pointer\n" + (store.channels / "dev-latest.json").write_bytes(sentinel) + (artifact / "unknown.json").write_text('{"format":"unknown"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + self.assertEqual((store.channels / "dev-latest.json").read_bytes(), sentinel) + pointer = store.verify_channel("latest-attempt") + dataset = publisher.strict_load(store.public / pointer["dataset"]["path"]) + self.assertEqual(dataset["promotion"]["status"], "quarantined") + self.assertTrue(any(store.quarantine.iterdir())) + + def test_repeated_rejection_is_content_idempotent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + (artifact / 
"unknown.json").write_text('{"format":"unknown"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + pointer = (store.channels / "latest-attempt.json").read_bytes() + counts = tuple( + len(list(path.iterdir())) + for path in (store.incoming, store.quarantine, store.datasets) + ) + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + self.assertEqual((store.channels / "latest-attempt.json").read_bytes(), pointer) + self.assertEqual( + tuple( + len(list(path.iterdir())) + for path in (store.incoming, store.quarantine, store.datasets) + ), + counts, + ) + + def test_distinct_rejections_advance_latest_attempt(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + unknown = artifact / "unknown.json" + unknown.write_text('{"format":"unknown-one"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + first = (store.channels / "latest-attempt.json").read_bytes() + unknown.write_text('{"format":"unknown-two"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + second = (store.channels / "latest-attempt.json").read_bytes() + self.assertNotEqual(second, first) + self.assertEqual(len(list(store.datasets.iterdir())), 2) + + def test_zip_traversal_is_rejected(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "bad.zip" + with zipfile.ZipFile(archive, "w") as handle: + handle.writestr("../escape.json", "{}") + with self.assertRaisesRegex(publisher.PublisherError, "escapes"): + publisher.extract_archive(archive, root / "out") + + def test_store_and_directory_archive_reject_symlinks(self) -> None: + with 
tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + real = root / "real" + real.mkdir() + alias = root / "alias" + alias.symlink_to(real, target_is_directory=True) + with self.assertRaisesRegex(publisher.PublisherError, "symlinked parent"): + publisher.Store(alias / "store") + self.assertFalse((real / "store").exists()) + artifact = root / f"cxunsupported-{RUN['run_id']}-{RUN['run_attempt']}" + artifact.mkdir() + target = root / "target.json" + target.write_text("{}") + (artifact / "linked.json").symlink_to(target) + with self.assertRaisesRegex(publisher.PublisherError, "symlink"): + publisher._archive_download_directory(artifact, root / "artifact.zip") + + def test_offline_caller_metadata_is_validated_before_store_creation(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store_root = root / "store" + args = _args(store_root, matrix, artifact) + args.run_id = "0" + with self.assertRaisesRegex(publisher.PublisherError, "run-id"): + publisher.ingest_command(args) + self.assertFalse(store_root.exists()) + + promote = types.SimpleNamespace( + store_root=str(store_root), bundle=["not-a-digest"] + ) + with self.assertRaisesRegex(publisher.PublisherError, "bundle IDs"): + publisher.promote_command(promote) + self.assertFalse(store_root.exists()) + with self.assertRaisesRegex(publisher.PublisherError, "absolute path"): + publisher._store_from_args(types.SimpleNamespace(store_root="relative-store")) + + def test_store_rejects_group_or_world_writable_root(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() / "unsafe-store" + root.mkdir() + root.chmod(0o772) + with self.assertRaisesRegex(publisher.PublisherError, "group/world writable"): + publisher.Store(root) + + def test_retry_ordinals_must_be_contiguous_from_one(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = 
Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root, (1, 3)) + with self.assertRaisesRegex(publisher.PublisherError, "contiguous ordinals"): + publisher.ingest_command(_args(root / "store", matrix, artifact)) + + def test_delivery_rejects_extra_archive_and_non_native_member(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + extra = root / f"cxshard-extra-{RUN['run_id']}-{RUN['run_attempt']}" + extra.mkdir() + (extra / "extra.json").write_text("{}") + args = _args(root / "store-extra", matrix, artifact) + args.artifact.append(str(extra)) + with self.assertRaisesRegex(publisher.PublisherError, "archive set"): + publisher.ingest_command(args) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + (artifact / "notes.txt").write_text("not native evidence") + with self.assertRaisesRegex(publisher.PublisherError, "unconsumed"): + publisher.ingest_command(_args(root / "store-member", matrix, artifact)) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + path = next(artifact.glob("*.json")) + terminal = json.loads(path.read_text()) + terminal["outcome"]["reason"] = next( + reason for reason in contracts.CAPABILITY_FAILURE_REASONS + if reason != terminal["outcome"]["reason"] + ) + path.write_text(json.dumps(terminal)) + with self.assertRaisesRegex(publisher.PublisherError, "reason differs"): + publisher.ingest_command(_args(root / "store-reason", matrix, artifact)) + + def test_rates_invert_latency_and_global_tokens_use_ep_size(self) -> None: + dataset = _dataset() + publisher.validate_public_dataset(dataset) + rates = dataset["series"][0]["points"][0]["components"]["roundtrip"]["logical_payload_rate_gbps_at_latency_percentile"] + self.assertGreater(rates["p50"], rates["p99"]) + broken = 
copy.deepcopy(dataset) + broken["series"][0]["points"][0]["global_tokens"] = 128 + with self.assertRaisesRegex(publisher.PublisherError, "EP size"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["series"][0]["points"][0]["roundtrip_token_rate_at_latency_percentile"]["p99"] *= 2 + with self.assertRaisesRegex(publisher.PublisherError, "token throughput"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["attempts"][0]["evidence"][0]["point_id"] = identity.point_id( + series=broken["series"][0]["series_id"], tokens_per_rank=16 + ) + with self.assertRaisesRegex(publisher.PublisherError, "point evidence"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["attempts"][0]["series_id"] = None + with self.assertRaisesRegex(publisher.PublisherError, "present exactly for selected success"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + component = broken["series"][0]["points"][0]["components"]["roundtrip"] + component["logical_bytes"] = None + component["logical_payload_rate_gbps_at_latency_percentile"] = None + with self.assertRaisesRegex(publisher.PublisherError, "logical bandwidth is missing"): + publisher.validate_public_dataset(broken) + + for mutate in ( + lambda item: item.update({"model": "different-model"}), + lambda item: item["workload"].update({"hidden": 4096}), + lambda item: item["workload"].update({"top_k": 4}), + lambda item: item["workload"].update({"experts": 128}), + ): + broken = copy.deepcopy(dataset) + mutate(broken["series"][0]) + with self.assertRaisesRegex(publisher.PublisherError, "frozen v1"): + publisher.validate_public_dataset(broken) + + broken = copy.deepcopy(dataset) + broken["series"][0]["eplb"]["mapping_sha256"] = "f" * 64 + with self.assertRaisesRegex(publisher.PublisherError, "claims a plan"): + publisher.validate_public_dataset(broken) + + broken = copy.deepcopy(dataset) + 
broken["series"][0]["backend"].update({ + "id": "nccl-ep", "label": publisher.BACKEND_LABELS["nccl-ep"], + "role": "reference", "generation": "rccl", + }) + broken["coverage"][0]["backend"] = "nccl-ep" + with self.assertRaisesRegex(publisher.PublisherError, "configuration"): + publisher.validate_public_dataset(broken) + + def test_routing_and_eplb_facts_must_match_across_repeats(self) -> None: + raw, _ = _native_fixture() + descriptor = publisher._eplb_descriptor(raw) + facts = publisher._routing_facts(raw["measurement"]["rows"][0]) + self.assertEqual( + publisher._exact_repeat_value([descriptor, copy.deepcopy(descriptor)], "EPLB"), + descriptor, + ) + self.assertEqual( + publisher._exact_repeat_value([facts, copy.deepcopy(facts)], "routing"), + facts, + ) + changed = copy.deepcopy(facts) + changed["hotspot_ratio"] += 0.1 + with self.assertRaisesRegex(publisher.PublisherError, "routing differs"): + publisher._exact_repeat_value([facts, changed], "routing") + + dataset = _promoted_dataset() + dataset["promotion"]["status"] = "diagnostic" + eplb = next(item for item in dataset["series"] if item["eplb"]["enabled"]) + eplb["points"][0]["routing"]["empty_expert_count"] = 280 + publisher.validate_public_dataset(dataset) + eplb["points"][0]["routing"]["empty_expert_count"] = 288 + with self.assertRaisesRegex(publisher.PublisherError, "routing/load facts"): + publisher.validate_public_dataset(dataset) + + for field, value in ( + ("mapping_sha256", "0" * 64), + ("redundant_experts", 31), + ("replicated_experts", 1), + ("max_replicas", 2), + ("replicated_experts", 257), + ("max_replicas", 999), + ("imbalance_after", 0.4), + ("planner", "different-planner"), + ("reference_tokens_per_rank", 1024), + ): + broken = _promoted_dataset() + broken["promotion"]["status"] = "diagnostic" + descriptor = next( + item["eplb"] for item in broken["series"] if item["eplb"]["enabled"] + ) + descriptor[field] = value + with self.subTest(eplb_field=field), self.assertRaisesRegex( + 
publisher.PublisherError, "EPLB descriptor" + ): + publisher.validate_public_dataset(broken) + + def test_publisher_owns_stable_rankings_and_recommendations(self) -> None: + fast, fast_internal = _series("fast", "deepep", decision_grade=True) + slow, slow_internal = _series("slow", "uccl", decision_grade=True) + reference, reference_internal = _series("reference", "nccl-ep", decision_grade=True) + reference_peer, reference_peer_internal = _series( + "reference-peer", "nccl-ep", decision_grade=True + ) + reference["backend"]["role"] = "reference" + reference_peer["backend"]["role"] = "reference" + reference_peer["system"].update({"sku": "h200-dgxc", "label": "NVIDIA H200"}) + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, slow, reference, reference_peer], { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + reference["series_id"]: reference_internal, + reference_peer["series_id"]: reference_peer_internal, + } + ) + library = next(item for item in cohorts if item["kind"] == "library") + ranking = next(item for item in rankings if item["cohort_id"] == library["cohort_id"] + and item["metric"]["measure"] == "latency_us" + and item["metric"]["statistic"] == "p99") + self.assertTrue(library["eligibility"]["decision_grade"]) + self.assertEqual(ranking["entries"][0]["series_id"], fast["series_id"]) + self.assertTrue(any(item["series_id"] == fast["series_id"] for item in recommendations)) + self.assertFalse(any( + entry["series_id"] == reference["series_id"] + for item in rankings if item["cohort_id"] == library["cohort_id"] + for entry in item["entries"] + )) + self.assertTrue(any( + item["kind"] == "system" and reference["series_id"] in item["series_ids"] + for item in cohorts + )) + + def test_routing_evidence_is_experimental_and_not_a_configuration_recommendation(self) -> None: + dataset = _promoted_dataset() + routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing") + members = [ + item for 
item in dataset["series"] + if item["series_id"] in routing["series_ids"] + ] + self.assertEqual( + {(item["workload"]["routing"], item["workload"]["eplb"]) for item in members}, + {("uniform", False), ("zipf", False), ("zipf", True)}, + ) + self.assertIn("implementation-static-build", routing["controlled_factors"]) + self.assertIn("resource", routing["controlled_factors"]) + self.assertEqual( + routing["varying_factors"], + ["workload.routing", "workload.eplb", "implementation-config"], + ) + self.assertEqual( + len({item["build"]["routing_control_sha256"] for item in members}), + 1, + ) + self.assertGreater( + len({item["build"]["implementation_contract_sha256"] for item in members}), + 1, + ) + self.assertEqual(len({json.dumps(item["resource"], sort_keys=True) for item in members}), 1) + self.assertEqual(routing["publication_tier"], "comparable-experimental") + self.assertTrue(any( + item["cohort_id"] == routing["cohort_id"] for item in dataset["rankings"] + )) + self.assertFalse(any( + item["cohort_id"] == routing["cohort_id"] for item in dataset["recommendations"] + )) + self.assertTrue(all( + item["publication_tier"] == "official" + for item in dataset["recommendations"] + )) + self.assertFalse(any( + dataset_cohort["publication_tier"] == "comparable-experimental" + and item["cohort_id"] == dataset_cohort["cohort_id"] + for item in dataset["recommendations"] + for dataset_cohort in dataset["cohorts"] + )) + self.assertTrue(all( + item["publication_tier"] == "comparable-experimental" + for item in dataset["sensitivities"] + if item["cohort_id"] == routing["cohort_id"] + )) + + def test_routing_implementation_mismatch_blocks_all_decisions(self) -> None: + dataset = _promoted_dataset() + published = next(item for item in dataset["cohorts"] if item["kind"] == "routing") + members = [ + item for item in dataset["series"] + if item["series_id"] in published["series_ids"] + ] + zipf = next( + item for item in members + if item["workload"]["routing"] == "zipf" and 
not item["workload"]["eplb"] + ) + zipf["build"]["implementation_contract_sha256"] = "f" * 64 + internals = {} + for member in members: + point = member["points"][0] + roundtrip = point["components"]["roundtrip"] + metrics = { + "latency_us": { + name: roundtrip["latency_us"][name] for name in ("p50", "p99") + }, + "logical_payload_rate_gbps_at_latency_percentile": { + name: roundtrip[ + "logical_payload_rate_gbps_at_latency_percentile" + ][name] + for name in ("p50", "p99") + }, + } + internals[member["series_id"]] = { + "run_metrics": { + str(run): {point["tokens_per_rank"]: metrics} + for run in range(3) + } + } + cohorts, rankings, recommendations, sensitivities = publisher.build_decisions( + members, internals + ) + routing = next(item for item in cohorts if item["kind"] == "routing") + self.assertFalse(routing["eligibility"]["decision_grade"]) + self.assertIn( + "implementation-config-mismatch", routing["eligibility"]["reasons"] + ) + self.assertEqual((rankings, recommendations, sensitivities), ([], [], [])) + + def test_promoted_series_fields_are_bound_to_case_and_series_identities(self) -> None: + dataset = _promoted_dataset() + changed = copy.deepcopy(dataset) + series = next( + item for item in changed["series"] + if item["system"]["sku"] == "h100-dgxc" + ) + series["system"].update({ + "sku": "h200-dgxc", "label": "NVIDIA H200", + "topology_class": "h200-nvlink-island", + }) + for case_id in series["case_ids"]: + next( + item for item in changed["coverage"] if item["case_id"] == case_id + )["sku"] = "h200-dgxc" + with self.assertRaisesRegex(publisher.PublisherError, "configuration|case identity"): + publisher.validate_public_dataset(changed) + + for field, value in ( + ("source_sha", "b" * 40), + ("image_digest", "sha256:" + "4" * 64), + ("squash_sha256", "5" * 64), + ("runtime_fingerprint_sha256", "6" * 64), + ("implementation_contract_sha256", "7" * 64), + ("public_config_sha256", "9" * 64), + ("routing_control_sha256", "8" * 64), + ): + changed = 
copy.deepcopy(dataset) + changed["series"][0]["build"][field] = value + with self.subTest(build_field=field), self.assertRaisesRegex( + publisher.PublisherError, "commit" + ): + publisher.validate_public_dataset(changed) + changed = copy.deepcopy(dataset) + changed["series"][0]["workload"]["workload_id"] = identity.workload_id( + {"changed": True} + ) + with self.assertRaisesRegex(publisher.PublisherError, "committed factors"): + publisher.validate_public_dataset(changed) + + for mutate, message in ( + (lambda item: item["backend"].update({ + "generation": "fabricated", "version": "fabricated-999", + }), "configuration"), + (lambda item: item["resource"].update({ + "profile": "profile-fabricated", "configured_units": 99, + }), "configuration"), + (lambda item: item["system"].update({"label": "Fabricated H100"}), "projection"), + ): + changed = copy.deepcopy(dataset) + mutate(changed["series"][0]) + with self.assertRaisesRegex(publisher.PublisherError, message): + publisher.validate_public_dataset(changed) + + diagnostic = _dataset() + diagnostic["series"][0]["build"]["source_sha"] = "b" * 40 + with self.assertRaisesRegex(publisher.PublisherError, "committed factors"): + publisher.validate_public_dataset(diagnostic) + + def test_all_decision_metrics_require_stable_repeat_ordering(self) -> None: + fast, fast_internal = _series("ordering-fast", "deepep", decision_grade=True) + slow, slow_internal = _series("ordering-slow", "uccl", decision_grade=True) + internals = { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + } + + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, slow], internals + ) + library = next(item for item in cohorts if item["kind"] == "library") + self.assertTrue(library["eligibility"]["decision_grade"]) + self.assertEqual( + len([item for item in rankings if item["cohort_id"] == library["cohort_id"]]), + 4, + ) + self.assertEqual( + len([ + item for item in recommendations + if item["cohort_id"] == 
library["cohort_id"] + ]), + 4, + ) + + for statistic in ("p50", "p99"): + slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = ( + fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] * 2 + ) + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, slow], internals + ) + library = next(item for item in cohorts if item["kind"] == "library") + self.assertFalse(library["eligibility"]["decision_grade"]) + self.assertIn("unstable-ordering", library["eligibility"]["reasons"]) + self.assertFalse(any( + item["cohort_id"] == library["cohort_id"] for item in rankings + )) + self.assertFalse(any( + item["cohort_id"] == library["cohort_id"] for item in recommendations + )) + + def test_extra_eligibility_reason_blocks_decision_grade(self) -> None: + allocations = [identity.allocation_id({"run": run}) for run in range(3)] + eligibility = publisher._eligibility_record( + allocations, complete=True, correct=True, measured=True, + stable_ordering=True, p50_ratio=1.01, p99_ratio=1.02, + extra_reasons=["incomplete-provenance"], + ) + self.assertFalse(eligibility["decision_grade"]) + self.assertEqual(eligibility["reasons"], ["incomplete-provenance"]) + self.assertIs(publisher._eligibility(eligibility, "fixture"), eligibility) + broken = {**eligibility, "decision_grade": True} + with self.assertRaisesRegex(publisher.PublisherError, "promotion gates"): + publisher._eligibility(broken, "fixture") + + def test_schema_is_strict_and_channel_target_must_be_complete(self) -> None: + dataset = _dataset() + dataset["unexpected"] = True + with self.assertRaises(publisher.PublisherError): + publisher.validate_public_dataset(dataset) + with mock.patch.object(publisher, "MAX_PUBLIC_DATASET_BYTES", 1), self.assertRaisesRegex( + publisher.PublisherError, "serving size limit" + ): + publisher.validate_public_dataset(_dataset()) + with tempfile.TemporaryDirectory() as temporary: + store = 
publisher.Store(Path(temporary).resolve()) + dataset = _dataset() + digest, size = store.install_dataset(dataset) + store.update_channel("latest-attempt", digest, size, dataset["generated_at"]) + self.assertEqual(store.verify_channel("latest-attempt")["dataset"]["sha256"], digest) + channel_path = store.channels / "latest-attempt.json" + pointer = publisher.strict_load(channel_path) + pointer["generated_at"] = "2099-01-01T00:00:00Z" + channel_path.write_bytes(contracts.canonical_json_bytes(pointer)) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.verify_channel("latest-attempt") + store.update_channel("latest-attempt", digest, size, dataset["generated_at"]) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.update_channel( + "latest-attempt", digest, size + 1, dataset["generated_at"] + ) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.update_channel( + "latest-attempt", digest, size, "2026-07-05T00:00:00Z" + ) + os.chmod(channel_path, 0o666) + with self.assertRaisesRegex(publisher.PublisherError, "regular 644"): + store.verify_channel("latest-attempt") + os.chmod(channel_path, 0o644) + dataset_dir = store.datasets / digest + os.chmod(dataset_dir, 0o755) + with self.assertRaisesRegex(publisher.PublisherError, "mode differs"): + store.verify_channel("latest-attempt") + os.chmod(dataset_dir, 0o555) + os.chmod(dataset_dir / "dataset.json", 0o644) + with self.assertRaisesRegex(publisher.PublisherError, "mode differs"): + store.verify_channel("latest-attempt") + os.chmod(dataset_dir / "dataset.json", 0o444) + os.chmod(dataset_dir, 0o755) + (dataset_dir / "COMPLETE").unlink() + os.chmod(dataset_dir, 0o555) + with self.assertRaisesRegex(publisher.PublisherError, "incomplete"): + store.verify_channel("latest-attempt") + + def test_store_modes_do_not_depend_on_process_umask(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + previous = os.umask(0o077) + 
try: + store = publisher.Store(Path(temporary).resolve()) + dataset = _dataset() + digest, size = store.install_dataset(dataset) + store.update_channel( + "latest-attempt", digest, size, dataset["generated_at"] + ) + with store.locked(): + pass + finally: + os.umask(previous) + self.assertEqual( + store.root.stat().st_mode & 0o777, + 0o750, + ) + self.assertEqual( + (store.channels / "latest-attempt.json").stat().st_mode & 0o777, + 0o644, + ) + self.assertEqual( + (store.datasets / digest / "dataset.json").stat().st_mode & 0o777, + 0o444, + ) + self.assertEqual( + (store.locks / "publisher.lock").stat().st_mode & 0o777, + 0o600, + ) + + def test_verify_requires_bootstrap_but_dev_latest_is_optional(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + args = types.SimpleNamespace( + store_root=str(root / "store"), channel=None, bundle=[] + ) + with self.assertRaises(publisher.PublisherError): + publisher.verify_command(args) + store = publisher.Store(args.store_root) + dataset = publisher._quarantine_dataset( + "awaiting-v1-runs", "2026-07-04T00:00:00Z" + ) + digest, size = store.install_dataset(dataset) + store.update_channel( + "latest-attempt", digest, size, "2026-07-04T00:00:00Z" + ) + result = publisher.verify_command(args) + self.assertEqual(set(result["channels"]), {"latest-attempt"}) + explicit = types.SimpleNamespace( + store_root=args.store_root, channel=["dev-latest"], bundle=[] + ) + with self.assertRaises(publisher.PublisherError): + publisher.verify_command(explicit) + dev_pointer = copy.deepcopy(store.verify_channel("latest-attempt")) + dev_pointer["channel"] = "dev-latest" + (store.channels / "dev-latest.json").write_bytes( + contracts.canonical_json_bytes(dev_pointer) + ) + with self.assertRaisesRegex(publisher.PublisherError, "non-promoted"): + publisher.verify_command(args) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_sampling_contract.py 
b/experimental/CollectiveX/tests/test_sampling_contract.py new file mode 100644 index 0000000000..fa4b4005ea --- /dev/null +++ b/experimental/CollectiveX/tests/test_sampling_contract.py @@ -0,0 +1,2287 @@ +#!/usr/bin/env python3 +"""CPU-only behavioral tests for the CollectiveX v1 execution contract.""" +from __future__ import annotations + +import argparse +import ast +import copy +import hashlib +import io +import json +import os +from pathlib import Path +import re +import subprocess +import sys +import tempfile +import types +import unittest +from unittest import mock + +import numpy as np + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(ROOT), str(HERE)] + +import artifact_safety # noqa: E402 +import capability # noqa: E402 +import contracts # noqa: E402 +import eplb # noqa: E402 +import ep_harness # noqa: E402 +import identity # noqa: E402 +import run_ep # noqa: E402 +import summarize # noqa: E402 +import sweep_matrix # noqa: E402 +import workload # noqa: E402 + + +class SamplingContractTest(unittest.TestCase): + def test_identity_and_fixed_sampling_profile(self) -> None: + identity.verify_test_vector() + self.assertTrue(identity.is_typed_id(identity.IDENTITY_TEST_VECTOR["series_id"], "series")) + self.assertEqual(ep_harness.SAMPLING_CONTRACT, "fixed-512-v1") + self.assertEqual( + ( + ep_harness.TIMED_ITERS_PER_TRIAL, + ep_harness.TRIALS_PER_POINT, + ep_harness.TIMED_SAMPLES_PER_POINT, + ep_harness.WARMUP_ITERS_PER_TRIAL, + ), + (8, 64, 512, 32), + ) + self.assertEqual(identity.V1_CASE_PROFILE["activation_profile"], "canonical-counter-source-v3") + self.assertEqual( + identity.V1_CASE_PROFILE["activation_generator"], + "collectivex-activation-counter-v3", + ) + self.assertEqual(identity.V1_CASE_PROFILE["sampling_contract"], "fixed-512-v1") + self.assertEqual(identity.V1_CASE_PROFILE["percentile_method"], "nearest-rank") + self.assertEqual( + identity.V1_CASE_PROFILE["rank_reduction"], + "cross-rank-max-per-iteration", + ) + 
self.assertEqual( + identity.V1_CASE_PROFILE["oracle_contract"], + "expert-specific-transform-v1", + ) + parser = argparse.ArgumentParser() + ep_harness.add_common_args(parser) + args = parser.parse_args( + ["--runner", "test", "--topology-class", "test", "--out", "result.json"] + ) + self.assertEqual((args.iters, args.trials, args.warmup), (8, 64, 32)) + for profile in ((8, 64, 32), (128, 4, 32), (8, 1, 4), (0, 64, 32)): + with self.subTest(profile=profile): + self.assertEqual( + ep_harness.sampling_contract_error(*profile) is None, + profile == (8, 64, 32), + ) + + def test_nearest_rank_percentiles_use_all_512_samples(self) -> None: + samples = list(range(1, 513)) + self.assertEqual(ep_harness.percentile(samples, 50), 256) + self.assertEqual(ep_harness.percentile(samples, 99), 507) + + def test_terminal_summary_uses_bound_sku_and_route(self) -> None: + terminal = { + "format": contracts.TERMINAL_FORMAT, + "case": { + "backend": "deepep", "phase": "prefill", "ep": 8, + "suite": "ep-routing-v1", "routing": "zipf", "eplb": True, + "required_publication": "comparable-experimental", + }, + "identity": {"case_factors": {"sku": "h100-dgxc"}}, + } + self.assertEqual( + summarize._identity(terminal), + ( + "h100-dgxc", "ep-routing-v1", "zipf", "prefill", True, + "comparable-experimental", 8, + ), + ) + + def test_matrix_cases_and_shards_are_identity_bound(self) -> None: + matrix = sweep_matrix.validate_matrix_document( + sweep_matrix.resolve_matrix(backends="all") + ) + requested = {item["case"]["case_id"]: item for item in matrix["requested_cases"]} + assigned = [case_id for shard in matrix["include"] for case_id in shard["case_ids"]] + runnable = { + case_id for case_id, item in requested.items() + if item["disposition"] == "runnable" + } + self.assertEqual( + ( + len(matrix["include"]), + len(matrix["requested_cases"]), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + ), + ), + (38, 360, 840), + ) + routing_points = { + phase: { + 
int(point) + for item in matrix["requested_cases"] + if item["case"]["suite"] == "ep-routing-v1" + and item["case"]["phase"] == phase + for point in item["case"]["ladder"].split() + } + for phase in ("decode", "prefill") + } + self.assertEqual(routing_points, {"decode": {128}, "prefill": {512}}) + skus = sorted({shard["sku"] for shard in matrix["include"]}) + self.assertEqual( + [shard["sku"] for shard in matrix["include"][:len(skus)]], + skus, + ) + self.assertEqual(set(assigned), runnable) + self.assertEqual(len(assigned), len(set(assigned))) + excluded = { + "uccl": {"b200-dgxc", "b300"}, + } + for backend, skus in excluded.items(): + for sku in skus: + with self.subTest(backend=backend, sku=sku): + self.assertFalse(capability.resolve(sku, backend)[0]) + for case_id, item in requested.items(): + case = {key: value for key, value in item["case"].items() if key != "case_id"} + self.assertEqual( + case_id, + identity.case_id(sku=item["sku"], profile=identity.V1_CASE_PROFILE, case=case), + ) + self.assertEqual(case["timing"], "8:64:32") + self.assertEqual(case["samples_per_point"], 512) + + bad_matrix = copy.deepcopy(matrix) + bad_matrix["schema_version"] = True + with self.assertRaises(sweep_matrix.MatrixError): + sweep_matrix.validate_matrix_document(bad_matrix) + + bad_catalog = copy.deepcopy(matrix) + wrapper = next( + item for item in bad_catalog["requested_cases"] + if item["disposition"] == "runnable" + ) + old_id = wrapper["case"]["case_id"] + wrapper["case"]["hidden"] = 1 + factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"} + new_id = identity.case_id( + sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors + ) + wrapper["case"]["case_id"] = new_id + for shard in bad_catalog["include"]: + shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]] + with self.assertRaisesRegex(sweep_matrix.MatrixError, "frozen v1"): + sweep_matrix.validate_matrix_document(bad_catalog) + + 
shard_meta = matrix["include"][0] + requested_cases = {item["case"]["case_id"]: item["case"] for item in matrix["requested_cases"]} + shard = { + "schema_version": True, + "id": shard_meta["id"], + "sku": shard_meta["sku"], + "backend": shard_meta["backend"], + "nodes": shard_meta["nodes"], + "n": shard_meta["n"], + "cases": [requested_cases[value] for value in shard_meta["case_ids"]], + } + with self.assertRaises(sweep_matrix.MatrixError): + sweep_matrix.validate_shard_control( + shard, sku=shard_meta["sku"], backend=shard_meta["backend"], + nodes=shard_meta["nodes"], + ) + + def test_matrix_yaml_and_config_validation_are_strict(self) -> None: + suites = sweep_matrix._load("suites.yaml") + workloads = sweep_matrix._load("workloads.yaml") + invalid = ( + ("unknown top", lambda s, _w: s.update({"typo": True})), + ( + "unknown suite field", + lambda s, _w: s["suites"]["ep-core-v1"].update({"modes": ["normal"]}), + ), + ( + "unknown workload field", + lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"unused": 1}), + ), + ( + "string phases", + lambda s, _w: s["suites"]["ep-core-v1"].update({"phases": "decode"}), + ), + ( + "unknown routing", + lambda s, _w: s["suites"]["ep-core-v1"].update({"routings": ["random"]}), + ), + ( + "integer EPLB", + lambda s, _w: s["suites"]["ep-routing-v1"].update({"eplb": [0, 1]}), + ), + ( + "duplicate platform", + lambda s, _w: s["suites"]["ep-core-v1"]["platforms"].append("h100-dgxc"), + ), + ("missing top field", lambda s, _w: s.pop("schema_version")), + ( + "string dimension", + lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"hidden": "7168"}), + ), + ( + "unreachable phase ladder", + lambda s, _w: s["suites"]["ep-routing-v1"].update({"phases": ["prefill"]}), + ), + ) + for label, mutate in invalid: + with self.subTest(label=label), self.assertRaises(SystemExit): + bad_suites, bad_workloads = copy.deepcopy(suites), copy.deepcopy(workloads) + mutate(bad_suites, bad_workloads) + 
sweep_matrix.validate_config_documents(bad_suites, bad_workloads) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "configs").mkdir() + (root / "configs" / "duplicate.yaml").write_text( + "schema_version: 1\nsuites:\n same: 1\n same: 2\n" + ) + with mock.patch.object(sweep_matrix, "HERE", root), self.assertRaisesRegex( + SystemExit, "duplicate YAML key" + ): + sweep_matrix._load("duplicate.yaml") + + def test_semantically_duplicate_suite_points_are_rejected(self) -> None: + suites = sweep_matrix._load("suites.yaml") + workloads = sweep_matrix._load("workloads.yaml") + suites["suites"]["ep-core-copy-v1"] = copy.deepcopy( + suites["suites"]["ep-core-v1"] + ) + + def load(name: str) -> dict[str, object]: + return workloads if name == "workloads.yaml" else suites + + with mock.patch.object(sweep_matrix, "_load", side_effect=load), self.assertRaisesRegex( + SystemExit, "duplicate semantic point" + ): + sweep_matrix.resolve_matrix() + + def test_only_three_shared_launchers_are_registered(self) -> None: + expected = { + "launch_single-slurm.sh", + "launch_gb-nv.sh", + "launch_mi-amds.sh", + } + self.assertEqual({path.name for path in (ROOT / "launchers").glob("launch_*.sh")}, expected) + self.assertEqual( + {platform["launcher"] for platform in capability.PLATFORMS.values()}, + {"single-slurm", "gb-nv", "mi-amds"}, + ) + for platform in capability.PLATFORMS.values(): + launcher = ROOT / "launchers" / f"launch_{platform['launcher']}.sh" + self.assertTrue(launcher.is_file()) + source = launcher.read_text() + self.assertNotIn("RUNNER_NAME", source) + self.assertIn("cx_preflight_allocation", source) + lock_environment = 'cx_lock_canonical_gha_env "$RUNNER"' + self.assertIn(lock_environment, source) + self.assertLess( + source.index("cx_load_operator_config"), + source.index(lock_environment), + ) + validate = 'cx_validate_shard_control "$CX_DIR"' + stage = 'MOUNT_SRC="$(cx_stage_repo ' + self.assertIn(validate, source) + 
self.assertLess(source.index(validate), source.index(stage)) + self.assertLess(source.index(validate), source.index("cx_require_vars")) + + common = (ROOT / "runtime" / "common.sh").read_text() + workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text() + self.assertNotIn("RUNNER_NAME", common) + self.assertNotIn("RUNNER_NAME:", workflow) + self.assertNotIn("flashinfer", capability.BACKENDS) + self.assertFalse((HERE / "ep_flashinfer.py").exists()) + + def test_image_pinned_deepep_and_input_integrity_order_are_explicit(self) -> None: + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + probe = runtime[runtime.index("cx_probe_deepep()"): + runtime.index("cx_activate_deepep_v2()")] + self.assertIn('expected_version="1.2.1"', probe) + self.assertIn('expected_version="1.1.0+814e508"', probe) + self.assertNotIn("pip install", probe) + self.assertNotIn("cx_fetch_revision", probe) + self.assertIn("Path(deep_ep.__file__).resolve() in recorded_files", probe) + self.assertIn("Path(buffer_module.__file__).resolve() in recorded_files", probe) + + harness = (HERE / "ep_harness.py").read_text() + pass_one = harness[harness.index("# ---- Pass 1"): + harness.index("# ---- Pass 2")] + self.assertLess( + pass_one.index("input_snapshots[T] ="), + pass_one.index("oracle = _run_expert_oracle"), + ) + self.assertIn("pre_input_unchanged", pass_one) + self.assertIn("hh = prep()\n torch.cuda.synchronize()", harness) + + def test_squash_imports_are_reproducible_and_use_a_fresh_cache_key(self) -> None: + common = (ROOT / "runtime" / "common.sh").read_text() + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + self.assertIn('CX_SQUASH_FORMAT_VERSION="repro-v1"', common) + self.assertIn("SOURCE_DATE_EPOCH=\"$CX_SQUASH_SOURCE_DATE_EPOCH\"", common) + self.assertIn("${COLLECTIVEX_IMAGE_DIGEST#sha256:}", common) + self.assertIn("cx_ensure_squash_on_job", amd) + self.assertIn('"${CX_LOCK_DIR:-}"', amd) + 
self.assertNotIn('"${CX_LOCK_DIR:-/tmp}"', amd) + self.assertIn('[ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"', common) + self.assertGreaterEqual(common.count("--chdir=/tmp"), 2) + self.assertGreaterEqual(amd.count("--chdir=/tmp"), 2) + self.assertIn('ENROOT_CACHE_PATH="$compute_home/enroot-cache"', common) + self.assertIn('ENROOT_RUNTIME_PATH="$compute_home/enroot-run"', common) + self.assertEqual(common.count('cx_reverify_registry_image "$image"'), 2) + result = subprocess.run( + [ + "bash", + "-c", + f'source "{ROOT / "runtime" / "common.sh"}"; ' + 'COLLECTIVEX_IMAGE_DIGEST="sha256:$(printf b%.0s {1..64})"; ' + 'CX_IMAGE_PLATFORM=linux/amd64; cx_squash_path /cache repo/image:tag; ' + 'printf "\\n"; CX_IMAGE_PLATFORM=linux/arm64; ' + 'cx_squash_path /cache repo/image:tag', + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 0, result.stderr) + digest = "b" * 64 + self.assertEqual( + result.stdout.splitlines(), + [ + f"/cache/repro-v1_{digest}_repo_image_tag.sqsh", + f"/cache/repro-v1_linux_arm64_{digest}_repo_image_tag.sqsh", + ], + ) + + def test_launchers_preserve_platform_specific_runtime_requirements(self) -> None: + single = (ROOT / "launchers" / "launch_single-slurm.sh").read_text() + gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text() + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + common = (ROOT / "runtime" / "common.sh").read_text() + self.assertIn("ALLOC_EXTRA=(--mem=0)", single) + self.assertIn("ALLOC_EXTRA=(-N 1 --mem=0)", single) + self.assertIn("SRUN_EXTRA=(--mpi=none --container-remap-root)", single) + self.assertIn("CX_ENROOT_LOCAL_IMPORT=1", single) + self.assertIn('PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-', gb) + self.assertIn("cx_ensure_squash_on_job", gb) + self.assertIn("--mem=0 --cpus-per-task=72", gb) + self.assertIn("--mem=0 --cpus-per-task=35", gb) + self.assertIn("--container-writable", gb) + self.assertIn("--container-remap-root", gb) + workload_stage = gb[ + 
gb.index("workload_args=("):gb.index("workload_log=", gb.index("workload_args=(")) + ] + self.assertNotIn("--workload", workload_stage) + self.assertIn("mi325x) CPUS_PER_TASK=256", amd) + self.assertIn("/dev/kfd:/dev/kfd,/dev/dri:/dev/dri", amd) + collect = common[common.index("cx_collect_results()"): + common.index("cx_cleanup_stage()")] + cleanup = common[common.index("cx_launcher_cleanup()"): + common.index("cx_install_launcher_fail_safe()")] + self.assertNotIn("cx_cleanup_stage", collect) + self.assertLess(cleanup.index("cx_cancel_job"), cleanup.index("cx_cleanup_stage")) + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + self.assertIn('distribution.read_text("direct_url.json")', runtime) + self.assertIn("6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac", runtime) + self.assertIn("2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882", runtime) + + def test_deferred_backend_provenance_resolves_before_measurement(self) -> None: + harness = (ROOT / "tests" / "ep_harness.py").read_text() + conditioning = harness.index("for wt in conditioning_ladder") + provenance = harness.index("# Setup may materialize deferred provenance") + measurement = harness.index("# ---- Pass 1: build each deterministic problem") + self.assertLess(conditioning, provenance) + self.assertLess(provenance, measurement) + + def test_backend_specific_routing_contracts_are_explicit(self) -> None: + hybrid = (ROOT / "tests" / "ep_deepep_hybrid.py").read_text() + self.assertIn("self.domain_rank = int(self.buffer.local_rank)", hybrid) + self.assertIn( + "probability_columns = self.domain_rank * self.local_experts + local_expert_ids", + hybrid, + ) + self.assertIn("h.recv_probs[:count][rows, probability_columns]", hybrid) + + mori = (ROOT / "tests" / "ep_mori.py").read_text() + self.assertIn("topk_idx=indices", mori) + self.assertIn("indices=indices", mori) + self.assertIn( + "combine_indices = p.indices if self._async_ll else h.dispatch_indices", + mori, 
+ ) + self.assertIn("h.combine_input,\n None,\n combine_indices", mori) + self.assertIn('"use_external_inp_buf": self._async_ll', mori) + self.assertIn("self.block_num = self._block_target = 64", mori) + self.assertIn('config_kwargs["block_num"] = self.block_num', mori) + self.assertIn( + 'config_kwargs["warp_num_per_block"] = self.dispatch_warps', mori + ) + self.assertIn("count > tensor.size(0)", mori) + self.assertIn("return combined[:p.T]", mori) + self.assertNotIn("return combined\n", mori) + self.assertIn( + "raw_expert_ids < local_start + experts_per_rank", + mori, + ) + self.assertNotIn("MoRI returned a non-local expert", mori) + harness = (ROOT / "tests" / "ep_harness.py").read_text() + self.assertIn("problem.recv_tokens = backend.recv_tokens(handle)", harness) + + def test_mori_masks_global_topk_metadata_to_the_local_rank(self) -> None: + path = HERE / "ep_mori.py" + tree = ast.parse(path.read_text(), str(path)) + helper = next( + node + for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_project_local_metadata" + ) + namespace: dict[str, object] = {} + exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace) + raw_ids = np.array([[0, 32, 63, -1], [64, 95, 7, 96]], dtype=np.int64) + raw_weights = np.arange(8, dtype=np.float32).reshape(2, 4) + torch_module = types.SimpleNamespace( + where=np.where, + full_like=np.full_like, + zeros_like=np.zeros_like, + ) + ids, weights, local_ids = namespace["_project_local_metadata"]( + torch_module, raw_ids, raw_weights, 1, 32 + ) + np.testing.assert_array_equal( + ids, + np.array([[-1, 32, 63, -1], [-1, -1, -1, -1]], dtype=np.int64), + ) + np.testing.assert_array_equal( + weights, + np.array([[0, 1, 2, 0], [0, 0, 0, 0]], dtype=np.float32), + ) + counts = np.bincount(local_ids, minlength=32) + self.assertEqual((counts[0], counts[31], int(counts.sum())), (1, 1, 2)) + commit_helper = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and 
node.name == "_mori_source_commit" + ) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + module = root / "python" / "mori" / "__init__.py" + module.parent.mkdir(parents=True) + module.touch() + git = root / ".git" + git.mkdir() + (git / "HEAD").write_text("a" * 40 + "\n") + commit_namespace = { + "Path": Path, + "re": re, + "mori": types.SimpleNamespace(__file__=str(module)), + } + exec( + compile(ast.Module(body=[commit_helper], type_ignores=[]), str(path), "exec"), + commit_namespace, + ) + self.assertEqual(commit_namespace["_mori_source_commit"](), "a" * 40) + (git / "HEAD").write_text("ref: refs/heads/main\n") + with self.assertRaisesRegex(RuntimeError, "detached commit"): + commit_namespace["_mori_source_commit"]() + + profile = contracts.project_resource_profile( + { + "block_num": 64, + "device_cus": 304, + "kernel_type": "AsyncLL", + "tuned_source": "upstream-asyncll-64x8-external-input", + } + ) + self.assertIsNone(profile["comm_units_kind"]) + self.assertIsNone(profile["configured_units"]) + + def test_squash_identity_rehashes_instead_of_trusting_a_sidecar(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + image = Path(temporary) / "image.sqsh" + image.write_bytes(b"current squash bytes") + sidecar = Path(f"{image}.sha256") + sidecar.write_text("a" * 64) + os.utime(sidecar, (image.stat().st_mtime + 10, image.stat().st_mtime + 10)) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; COLLECTIVEX_EXECUTION_ID="squash-hash-$$"; ' + 'cx_export_squash_identity "$2"; cx_cleanup_private_logs 0; ' + 'printf "%s" "$COLLECTIVEX_SQUASH_SHA256"', + "_", str(ROOT / "runtime" / "common.sh"), str(image), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout, hashlib.sha256(image.read_bytes()).hexdigest()) + + def test_salloc_job_id_parser_uses_the_portable_grant_message(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + 
directory = Path(temporary) + arguments = directory / "arguments" + salloc = directory / "salloc" + salloc.write_text( + "#!/usr/bin/env bash\n" + "printf '%s\\n' \"$@\" > \"$CX_TEST_SALLOC_ARGUMENTS\"\n" + "printf 'salloc: Granted job allocation 4242\\n' >&2\n" + ) + salloc.chmod(0o700) + result = subprocess.run( + [ + "bash", + "-c", + f'source "{ROOT / "runtime" / "common.sh"}"; ' + 'COLLECTIVEX_EXECUTION_ID="scheduler-parser-$$"; ' + 'JOB_ID=""; cx_salloc_jobid --partition=compute; ' + 'cx_cleanup_private_logs 0; printf "%s:%s" "$JOB_ID" "$CX_ALLOCATION_REQUESTED"', + ], + text=True, + capture_output=True, + env={ + **os.environ, + "PATH": f"{directory}:{os.environ['PATH']}", + "CX_TEST_SALLOC_ARGUMENTS": str(arguments), + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout, "4242:1") + self.assertEqual( + arguments.read_text().splitlines(), + ["--partition=compute", "--no-shell"], + ) + + def test_allocation_cleanup_fails_closed_when_scheduler_queries_fail(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + directory = Path(temporary) + for name, body in { + "scancel": "exit 0", + "squeue": "exit 2", + "sleep": "exit 0", + }.items(): + command = directory / name + command.write_text(f"#!/usr/bin/env bash\n{body}\n") + command.chmod(0o700) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_cancel_job 4242', + "_", str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={**os.environ, "PATH": f"{directory}:{os.environ['PATH']}"}, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("did not terminate", result.stderr) + + workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text() + self.assertIn("cleanup-unsafe", workflow) + self.assertIn("cleanup-safe", workflow) + self.assertIn("Confirm allocation cleanup", workflow) + self.assertIn("Prepare pinned backend source archive", workflow) + self.assertIn("Install 
pinned backend source seed", workflow) + self.assertIn("CX_BACKEND_SOURCE_SEED_ROOT", workflow) + self.assertIn("steps.gen.outputs.source_backends", workflow) + self.assertIn("with tarfile.open", workflow) + artifact_validation = workflow[workflow.index("- name: Validate shard artifact safety"):] + self.assertIn("steps.allocation_cleanup.outcome == 'success'", artifact_validation) + sweep_workflow = workflow[workflow.index(" sweep:"):] + self.assertNotIn("GITHUB_WORKSPACE", sweep_workflow) + self.assertNotIn("RUNNER_WORKSPACE", sweep_workflow) + self.assertIn('CX_SOURCE_ROOT: /tmp/inferencex-collectivex-', sweep_workflow) + source_step = sweep_workflow[:sweep_workflow.index("- uses: actions/download-artifact")] + self.assertNotIn("unsafe_guards=", source_step) + self.assertIn("cutoff = time.time() - 86400", source_step) + self.assertIn("stat.S_IMODE(metadata.st_mode) != 0o700", source_step) + self.assertIn('for marker_name in ("cleanup-safe", "cleanup-unsafe")', source_step) + self.assertIn("stat.S_IMODE(marker.st_mode) == 0o600", source_step) + self.assertIn("shutil.rmtree(entry.path)", source_step) + self.assertLess( + source_step.index('rev-parse HEAD'), + source_step.index("echo 'prepared=true'"), + ) + upload = workflow[workflow.index("- name: Stage shard artifact"):] + self.assertIn("id: stage_artifact", upload) + self.assertIn("id: upload_artifact", upload) + self.assertIn("steps.stage_artifact.outcome == 'success'", upload) + cleanup = workflow[workflow.index("- name: Cleanup isolated workspace"):] + for step in ( + "sweep_shard", "allocation_cleanup", "artifact_safety", + "delivery_contracts", "stage_artifact", "upload_artifact", + ): + self.assertIn(f"steps.{step}.outcome", cleanup) + self.assertLess( + cleanup.index('cleanup-safe" ]'), + cleanup.index('rm -rf -- "$CX_JOB_ROOT"'), + ) + + def test_runtime_identity_and_realized_placement_are_behavioral(self) -> None: + self.assertFalse(capability.runtime_identity_issues( + "mi325x", vendor="amd", 
arch="gfx942", machine="amd64", + device_name="AMD Instinct MI325X", device_count=8, world_size=8, + )) + self.assertTrue(capability.runtime_identity_issues( + "mi355x", vendor="amd", arch="gfx942", machine="amd64", + device_name="AMD Instinct MI325X", device_count=8, world_size=8, + )) + records = [("private-a", rank) for rank in range(4)] + [ + ("private-b", rank) for rank in range(4) + ] + self.assertEqual( + run_ep._summarize_realized_placement( + records, expected_nodes=2, expected_gpus_per_node=4, expected_world_size=8 + ), + { + "gpus_per_node": 4, + "nodes": 2, + "ranks_per_node": 4, + "unique_local_ranks": True, + "valid": True, + }, + ) + with self.assertRaises(ValueError): + run_ep._summarize_realized_placement( + records[:-1] + [("private-b", 2)], + expected_nodes=2, + expected_gpus_per_node=4, + expected_world_size=8, + ) + + def test_collective_version_and_rccl_fingerprint_are_normalized(self) -> None: + self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4") + self.assertEqual(ep_harness.format_collective_version(21805), "2.18.5") + self.assertEqual(ep_harness.format_collective_version((2, 21, 5)), "2.21.5") + + properties = types.SimpleNamespace( + multi_processor_count=304, total_memory=1024, warp_size=64 + ) + fake = types.SimpleNamespace( + __version__="2.9.0", + version=types.SimpleNamespace(cuda=None, hip="7.2"), + cuda=types.SimpleNamespace( + get_device_properties=lambda _device: properties, + get_device_name=lambda _device: "AMD Instinct MI325X", + nccl=types.SimpleNamespace(version=lambda: 21805), + ), + ) + with mock.patch.object( + run_ep, "_loaded_collective_version", return_value="2.18.5" + ): + fingerprint = run_ep._runtime_fingerprint( + fake, "device", machine="amd64", vendor="amd", arch="gfx942" + ) + self.assertEqual(fingerprint["collective_library"], {"kind": "rccl", "version": "2.18.5"}) + self.assertEqual(fingerprint["accelerator_runtime"], {"kind": "hip", "version": "7.2"}) + + class FakeCollective: + 
@staticmethod + def ncclGetVersion(pointer) -> int: + pointer._obj.value = 23004 + return 0 + + maps = "0-1 r-xp 0 00:00 0 /runtime/libnccl.so.2\n" + with ( + mock.patch("builtins.open", return_value=io.StringIO(maps)), + mock.patch.object(run_ep.os.path, "isfile", return_value=True), + mock.patch.object( + run_ep.os.path, "realpath", return_value="/runtime/libnccl.so.2" + ), + mock.patch.object(run_ep.ctypes, "CDLL", return_value=FakeCollective()), + ): + self.assertEqual(run_ep._loaded_collective_version(), "2.30.4") + + path = HERE / "ep_nccl.py" + tree = ast.parse(path.read_text(), str(path)) + helper = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_runtime_collective" + ) + namespace = {"re": re} + exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace) + args = types.SimpleNamespace( + runtime_fingerprint={ + "collective_library": {"kind": "nccl", "version": "2.30.4"} + } + ) + cuda = types.SimpleNamespace(version=types.SimpleNamespace(hip=None)) + self.assertEqual(namespace["_runtime_collective"](args, cuda), ("nccl", "2.30.4")) + args.runtime_fingerprint["collective_library"]["version"] = None + with self.assertRaisesRegex(RuntimeError, "runtime identity is unavailable"): + namespace["_runtime_collective"](args, cuda) + self.assertNotIn("torch.cuda.nccl.version", path.read_text()) + + def test_workloads_bind_generator_activation_and_trace(self) -> None: + args = ("uniform", 7168, 8, 256, 8, 64, 67) + first = workload.compute_workload_id(*args) + self.assertTrue(identity.is_typed_id(first, "workload")) + self.assertEqual(first, workload.compute_workload_id(*args)) + self.assertNotEqual(first, workload.compute_workload_id(*args[:-1], 68)) + self.assertNotEqual( + first, + workload.compute_workload_id(*args, trace_checksum="a" * 64), + ) + _, _, manifest = workload.build_workload(8, 2, 4, "uniform", 4, 67, 2) + member, checksums, _, _ = workload.canonical_member( + "uniform", 8, 2, 
4, 2, 2, 67 + ) + self.assertEqual(member, manifest["workload_id"]) + self.assertEqual(checksums, manifest["checksums"]) + + def test_canonical_members_are_bound_to_each_scheduled_row(self) -> None: + case = { + "routing": "uniform", "hidden": 8, "topk": 2, "experts": 4, "ep": 2, + } + eplb_record = { + "enabled": False, "mapping_hash": None, "num_physical_experts": 4, + } + + def expected( + *, tokens: int = 1, hidden: int = 8 + ) -> tuple[str, dict[str, str], str]: + member, checksums, row_hash, _, _ = contracts._expected_canonical_trace( + "uniform", hidden, 2, 4, 4, 2, tokens, 67, False, 2048 + ) + return member, checksums, row_hash + + member, checksums, row_hash = expected() + rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}] + proof = { + "manifest_checksums": {member: checksums}, + "members": [member], + "workload_id": identity.workload_id({ + "members": [{"checksums": checksums, "workload_id": member}] + }), + } + contracts._validate_canonical_workload(proof, case, rows, eplb_record) + + def replace_member(document: dict, replacement: tuple[str, dict[str, str], str]) -> None: + replacement_id, replacement_checksums, _ = replacement + document["members"] = [replacement_id] + document["manifest_checksums"] = {replacement_id: replacement_checksums} + document["workload_id"] = identity.workload_id({ + "members": [{ + "checksums": replacement_checksums, + "workload_id": replacement_id, + }] + }) + + mutations = { + "wrong member token": lambda document, mutated_rows: replace_member( + document, expected(tokens=2) + ), + "wrong member dimensions": lambda document, mutated_rows: replace_member( + document, expected(hidden=16) + ), + "wrong member checksum": lambda document, mutated_rows: replace_member( + document, + ( + member, + {**checksums, "topk_idx": "0" * 64}, + row_hash, + ), + ), + "row hash unrelated to member": lambda document, mutated_rows: mutated_rows[0][ + "routing" + ].update({"hash": "f" * 64}), + } + for label, mutate in 
mutations.items(): + with self.subTest(label=label), self.assertRaises(contracts.ContractError): + bad_proof, bad_rows = copy.deepcopy(proof), copy.deepcopy(rows) + mutate(bad_proof, bad_rows) + contracts._validate_canonical_workload( + bad_proof, case, bad_rows, eplb_record + ) + + def test_eplb_row_hash_is_bound_to_the_frozen_remap(self) -> None: + case = {"routing": "zipf", "hidden": 8, "topk": 2, "experts": 4, "ep": 2} + physical = eplb.physical_count(4, 32, 2) + plan = contracts._expected_eplb_plan("zipf", 2, 4, physical, 2, 67, 2048) + eplb_record = { + "enabled": True, + "mapping_hash": eplb.mapping_hash(plan), + "num_physical_experts": physical, + } + member, checksums, row_hash, _, _ = contracts._expected_canonical_trace( + "zipf", 8, 2, 4, physical, 2, 1, 67, True, 2048 + ) + self.assertNotEqual(row_hash, checksums["trace"]) + workload_proof = { + "manifest_checksums": {member: checksums}, + "members": [member], + "workload_id": identity.workload_id({ + "members": [{"checksums": checksums, "workload_id": member}] + }), + } + rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}] + contracts._validate_canonical_workload(workload_proof, case, rows, eplb_record) + with self.assertRaisesRegex(contracts.ContractError, "EPLB mapping"): + contracts._validate_canonical_workload( + workload_proof, case, rows, {**eplb_record, "mapping_hash": "0" * 64} + ) + + def test_oracle_pass_cannot_ignore_combined_value_failure(self) -> None: + oracle = { + "atol": ep_harness.ORACLE_ATOL, + "checks": { + "combine_values": True, + "counts": True, + "metadata": True, + "multiplicity": True, + "payload": True, + "source_set": True, + "weights": True, + }, + "combine_weight_semantics": "unweighted-rank-sum", + "contract": ep_harness.ORACLE_CONTRACT, + "dispatch_sha256": "a" * 64, + "max_absolute_error": 0.0, + "max_elementwise_relative_error": 0.0, + "max_relative_error": 0.0, + "max_weight_error": 0.0, + "order_sha256": "b" * 64, + "ordering_contract": "stable-v1", + 
"passed": True, + "receive_count": 1, + "rtol": ep_harness.ORACLE_RTOL, + } + contracts._validate_oracle(oracle, "oracle") + weighted = copy.deepcopy(oracle) + weighted["combine_weight_semantics"] = "native-gate-weighted" + with self.assertRaisesRegex(contracts.ContractError, "differs from v1"): + contracts._validate_oracle(weighted, "oracle") + tampered = copy.deepcopy(oracle) + tampered["checks"]["combine_values"] = False + with self.assertRaises(contracts.ContractError): + contracts._validate_oracle(tampered, "oracle") + + def test_oracle_stability_canonicalizes_native_receive_order(self) -> None: + source = (HERE / "ep_harness.py").read_text() + canonical = source[source.index("canonical_order = torch.argsort"): + source.index("problem.recv_tokens = receive_count")] + self.assertIn("canonical_sources", canonical) + self.assertIn("canonical_ids", canonical) + self.assertIn("canonical_weights", canonical) + self.assertNotIn("_tensor_sha256(source_ids", canonical) + mori = (HERE / "ep_mori.py").read_text() + self.assertIn( + 'self.kernel_generation = "async-ll" if self._async_ll else "intranode"', + mori, + ) + backend = types.SimpleNamespace(name="mori", kernel_generation="async-ll") + self.assertEqual(ep_harness.kernel_generation(backend), "async-ll") + + def test_terminal_fail_safe_fills_only_missing_shard_cases(self) -> None: + matrix = sweep_matrix.resolve_matrix(backends="all", max_cases=128) + shard = next(item for item in matrix["include"] if item["n"] >= 2) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + matrix_path = root / "matrix.json" + control_path = root / "control.json" + out_dir = root / "results" + matrix_path.write_text(json.dumps(matrix)) + control = sweep_matrix.extract_shard( + matrix_path, shard["id"], control_path, + sku=shard["sku"], backend=shard["backend"], nodes=shard["nodes"], + ) + control["cases"] = control["cases"][:2] + control["n"] = 2 + control_path.write_text(json.dumps(control)) + first = {key: 
value for key, value in control["cases"][0].items() if key != "case_id"} + git_run = { + "artifact": "artifact", "job": "job", "ref": "collectivex", + "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", + "run_id": "123", "source_sha": "a" * 40, + } + allocation = { + "artifact": "artifact", "execution_id": "execution", "job": "job", + "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", "run_id": "123", + "runner": shard["sku"], "source_sha": "a" * 40, + } + out_dir.mkdir() + existing = contracts.make_terminal_document( + allocation_factors=allocation, attempt_ordinal=1, case=first, + case_factors={"case": first, "profile": identity.V1_CASE_PROFILE, "sku": shard["sku"]}, + control_sha256=hashlib.sha256(control_path.read_bytes()).hexdigest(), + failure_mode="setup", generated_at="2026-07-04T00:00:00Z", git_run=git_run, + reason="launcher-setup-failed", return_code=7, source="runtime-emitter", + status="failed", + expected_case_id=control["cases"][0]["case_id"], + ) + (out_dir / "existing.json").write_text(json.dumps(existing)) + (out_dir / "partial.json").write_text(json.dumps({ + "format": contracts.RAW_FORMAT, + "identity": {"case_id": control["cases"][1]["case_id"]}, + "sample_artifact": {"path": "partial.samples.json"}, + })) + (out_dir / "partial.samples.json").write_text("{broken") + environment = { + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SHARD_FILE": str(control_path), + "CX_SHARD_SKU": shard["sku"], + "CX_RUNNER": shard["sku"], + "CX_BENCH": shard["backend"], + "CX_NODES": str(shard["nodes"]), + "COLLECTIVEX_EXECUTION_ID": "execution", + "COLLECTIVEX_ARTIFACT_NAME": "artifact", + "GITHUB_JOB": "job", "GITHUB_REF_NAME": "collectivex", + "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX", + "GITHUB_RUN_ATTEMPT": "1", "GITHUB_RUN_ID": "123", + "GITHUB_SHA": "a" * 40, + } + subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_emit_setup_failures "$2" "$3" "$4" 7', + "_", str(ROOT / "runtime" / "common.sh"), str(ROOT), + 
str(out_dir), shard["backend"], + ], + check=True, + env=environment, + ) + attempts = [contracts.strict_load(path) for path in out_dir.glob("*.json")] + self.assertEqual(len(attempts), 2) + self.assertEqual( + contracts.validate_attempt_paths([str(path) for path in out_dir.glob("*.json")]), + 2, + ) + delivery = [str(path) for path in out_dir.glob("*.json")] + self.assertEqual(contracts.validate_delivery(delivery, str(control_path)), 2) + with self.assertRaises(contracts.ContractError): + contracts.validate_delivery(delivery[:1], str(control_path)) + self.assertEqual( + {attempt["identity"]["case_id"] for attempt in attempts}, + {case["case_id"] for case in control["cases"]}, + ) + self.assertTrue((out_dir / "partial.json.quarantine").is_file()) + self.assertTrue((out_dir / "partial.samples.json.quarantine").is_file()) + + preallocation = root / "preallocation" + preallocation_results = preallocation / "experimental" / "CollectiveX" / "results" + preallocation_results.mkdir(parents=True) + failed = subprocess.run( + [ + "bash", "-c", + 'source "$1"; REPO_ROOT="$2"; export REPO_ROOT; ' + 'cx_install_launcher_fail_safe; cx_load_operator_config', + "_", str(ROOT / "runtime" / "common.sh"), str(preallocation), + ], + env={**environment, "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1"}, + ) + self.assertNotEqual(failed.returncode, 0) + preallocation_attempts = [ + contracts.validate_terminal_document(contracts.strict_load(path)) + for path in preallocation_results.glob("*.json") + ] + self.assertEqual( + {attempt["identity"]["case_id"] for attempt in preallocation_attempts}, + {case["case_id"] for case in control["cases"]}, + ) + + def test_runtime_identity_mismatch_is_failed_not_unsupported(self) -> None: + wrapper = next( + item for item in sweep_matrix.resolve_matrix()["requested_cases"] + if item["disposition"] == "runnable" + ) + case = wrapper["case"] + environment = { + "CX_RUNNER": wrapper["sku"], "CX_CASE_ID": case["case_id"], + "CX_SUITE": case["suite"], 
"CX_WORKLOAD_NAME": case["workload"], + "CX_REQUIRED_PUBLICATION": case["required_publication"], + "CX_ROUTING": case["routing"], "CX_EPLB": "1" if case["eplb"] else "", + "CX_EP": str(case["ep"]), "CX_NGPUS": str(case["ep"]), + "CX_HIDDEN": str(case["hidden"]), "CX_TOPK": str(case["topk"]), + "CX_EXPERTS": str(case["experts"]), "CX_NODES": str(case["nodes"]), + "CX_GPUS_PER_NODE": str(case["gpus_per_node"]), + "CX_SCALE_UP_DOMAIN": str(case["scale_up_domain"]), + "CX_TOKENS_LADDER": case["ladder"], "CX_CANONICAL": "1", + "CX_ITERS": "8", "CX_TRIALS": "64", "CX_WARMUP": "32", + "CX_SAMPLES_PER_POINT": "512", "GITHUB_RUN_ID": "123", + "GITHUB_RUN_ATTEMPT": "1", "GITHUB_REF_NAME": "collectivex", + "GITHUB_SHA": "a" * 40, "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX", + "GITHUB_JOB": "sweep", "COLLECTIVEX_ARTIFACT_NAME": "artifact", + "COLLECTIVEX_EXECUTION_ID": "execution", + } + with mock.patch.dict(os.environ, environment, clear=False): + terminal = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=5 + ) + self.assertEqual(terminal["identity"]["case_id"], case["case_id"]) + self.assertEqual( + terminal["outcome"], + { + "failure_mode": "runtime-identity", + "reason": "runtime-identity-mismatch", + "return_code": 5, + "status": "failed", + }, + ) + for mode, reason in contracts.RUNTIME_FAILURE_REASONS.items(): + with self.subTest(mode=mode), mock.patch.dict(os.environ, environment, clear=False): + staged = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=1, + failure_mode=mode, + ) + self.assertEqual(staged["outcome"]["reason"], reason) + mismatched = copy.deepcopy(staged) + mismatched["outcome"]["reason"] = "distributed-command-failed" + if reason == "distributed-command-failed": + mismatched["outcome"]["reason"] = "backend-setup-failed" + with self.assertRaisesRegex( + contracts.ContractError, "source and outcome are not registered" + ): + 
contracts.validate_terminal_document(mismatched) + with mock.patch.dict(os.environ, environment, clear=False): + with self.assertRaisesRegex( + contracts.ContractError, "runtime failure mode is not registered" + ) as raised: + contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=1, + failure_mode="raw-private-error", + ) + self.assertNotIn("raw-private-error", str(raised.exception)) + with mock.patch.dict(os.environ, environment, clear=False): + generic = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=6, + ) + self.assertEqual( + generic["outcome"], + { + "failure_mode": "execution", + "reason": "distributed-command-failed", + "return_code": 6, + "status": "failed", + }, + ) + + def test_launchers_use_private_logs_and_allowlisted_failure_stages(self) -> None: + expected = { + "launch_single-slurm.sh": { + "setup", "registry-verification", "container-import", "container-hash", + "repository-stage", "scheduler-allocation", "container-launch", + "artifact-collection", + }, + "launch_gb-nv.sh": { + "setup", "registry-verification", "container-import", "container-hash", + "repository-stage", "scheduler-allocation", "container-launch", "backend-setup", + "execution", "artifact-collection", + }, + "launch_mi-amds.sh": { + "setup", "repository-stage", "registry-verification", "scheduler-allocation", + "container-import", "container-hash", "container-launch", "artifact-collection", + }, + } + for name, stages in expected.items(): + launcher = (ROOT / "launchers" / name).read_text() + self.assertNotIn("--export=ALL", launcher) + self.assertIn("cx_container_exports", launcher) + self.assertIn("collect_rc=0", launcher) + for stage in stages: + with self.subTest(launcher=name, stage=stage): + self.assertIn(f"cx_set_failure_stage {stage}", launcher) + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + self.assertIn("cx_ensure_squash_on_job", amd) + 
self.assertIn("cx_fail_stage container-hash", amd) + self.assertNotIn('cat "$import_log"', amd) + common = (ROOT / "runtime" / "common.sh").read_text() + self.assertIn('bash -s -- "$sq" "$lock" "$image"', common) + self.assertIn("> \"$log\" 2>&1 <<'BASH'", common) + self.assertIn("cx_fail_stage container-import", common) + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + export_start = common.index("\ncx_container_exports() {") + exports = common[export_start:common.index("\n}", export_start)] + export_names = { + name + for payload in re.findall(r"printf '%s' '([^']*)'", exports) + for name in payload.split(",") if name + } + for private_name in ( + "COLLECTIVEX_OPERATOR_CONFIG", "GITHUB_TOKEN", "GITHUB_WORKSPACE", "HOME", + "CX_PARTITION", "CX_ACCOUNT", "CX_SQUASH_DIR", "CX_STAGE_DIR", + ): + self.assertNotIn(private_name, export_names) + self.assertIn("CX_BACKEND_CACHE_ROOT", export_names) + self.assertIn("CX_BACKEND_CACHE_SENTINEL_SHA256", export_names) + self.assertNotIn("CX_PREPARED_BACKEND_CACHE", export_names) + self.assertIn("MORI_COMMIT", export_names) + self.assertIn("cx_write_runtime_stage backend-setup", runtime) + self.assertIn("cx_write_runtime_stage execution", runtime) + gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text() + self.assertIn("cx_private_log_path shard-summary", gb) + self.assertIn("cx_fail_stage execution", gb) + + def test_case_failure_diagnostic_precedes_normal_srun_footer(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + log.write_text( + "WARN: deepep decode run failed rc=1 (CX_RUN_TIMEOUT=900s)\n" + "SHARD done: 6/6 case(s) failed\n" + "srun: error: task exited 1\n" + ) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 1) + 
self.assertIn("diagnostic=benchmark-case-failure", result.stderr) + + def test_non_timeout_failure_warning_is_classified_as_case_failure(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + log.write_text("WARN: deepep decode run failed rc=1\nsrun: task exited 1\n") + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 1) + self.assertNotIn("diagnostic=network-or-timeout", result.stderr) + self.assertIn("diagnostic=benchmark-case-failure", result.stderr) + + def test_private_runtime_failure_signatures_override_case_footer(self) -> None: + signatures = { + "DeepEP V2 no-GIN run is outside one realized LSA domain": + "accelerator-topology", + "CUDA error: call requires newer driver": "accelerator-driver", + "NCCL failure in ncclCommWindowRegister": "nccl-device-api", + "NVCC compilation failed": "jit-toolchain", + "CUDA out of memory": "accelerator-memory", + "torch rendezvous timed out": "network-or-timeout", + } + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + for signature, diagnostic in signatures.items(): + log.write_text(f"{signature}\nSHARD done: 6/6 case(s) failed\n") + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 1) + self.assertIn(f"diagnostic={diagnostic}", result.stderr) + + def test_runtime_stage_marker_distinguishes_launch_from_execution(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + mount = Path(temporary) + root = mount / "experimental" / "CollectiveX" + root.mkdir(parents=True) + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_EXECUTION_ID=test_1_shard 
CX_TS=test + cx_set_failure_stage container-launch + cx_prepare_runtime_marker "$2" + (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage backend-setup) + cx_adopt_runtime_stage "$2" + test "$CX_FAILSAFE_MODE" = backend-setup + (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage execution) + cx_adopt_runtime_stage "$2" + test "$CX_FAILSAFE_MODE" = execution + ''' + subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(mount)], + check=True, + ) + + def test_canonical_gha_environment_is_locked_but_manual_overrides_survive(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 + export CX_IMAGE=untrusted CX_IMAGE_DIGEST=untrusted CX_NGPUS=99 + export CX_NCCL_HOME=/untrusted CX_LOCK_DIR=/tmp CX_SQUASH_DIR=/shared/containers + export CX_STAGE_DIR=/private/stale-stage + export CX_MORI_KERNEL_TYPE=intranode MORI_ENABLE_SDMA=0 + export NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 CX_DRYRUN=1 + export CX_BACKEND_CACHE_ROOT=/untrusted CX_BACKEND_CACHE_SENTINEL_SHA256=bad + export CX_PREPARED_BACKEND_CACHE=/untrusted CX_BACKEND_SOURCE_ROOT=/untrusted + cx_lock_canonical_gha_env mi325x + test "$CX_IMAGE" = "$CX_IMAGE_AMD_MORI_MI325" + test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_AMD_MORI_MI325_DIGEST" + test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:1800 + test "$CX_MORI_KERNEL_TYPE:$MORI_DISABLE_AUTO_XGMI:$MORI_ENABLE_SDMA" = asyncll:0:1 + test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI325" + test "$MORI_APP_LOG_LEVEL:$MORI_SHMEM_LOG_LEVEL:$MORI_IO_LOG_LEVEL" = info:info:info + test "$CX_STAGE_DIR" = "$GITHUB_WORKSPACE" + test -z "${CX_NCCL_HOME+x}${CX_LOCK_DIR+x}${NCCL_MNNVL_ENABLE+x}${MC_FORCE_MNNVL+x}" + test -z 
"${CX_BACKEND_CACHE_ROOT+x}${CX_BACKEND_CACHE_SENTINEL_SHA256+x}" + test -z "${CX_PREPARED_BACKEND_CACHE+x}${CX_BACKEND_SOURCE_ROOT+x}" + test -z "${CX_DRYRUN+x}" + + unset CX_STAGE_DIR + export CX_SHARD_SKU=gb300 CX_NODES=2 CX_GPUS_PER_NODE=4 + export CX_IMAGE=untrusted CX_NGPUS=1 CX_MORI_KERNEL_TYPE=untrusted + export MORI_ENABLE_SDMA=0 CX_NCCL_HOME=/untrusted CX_MASTER_PORT=1 + cx_lock_canonical_gha_env gb300 + test "$CX_IMAGE" = "$CX_IMAGE_MULTIARCH" + test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST" + test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:900 + test "$CX_NCCL_HOME:$CX_MASTER_PORT" = /usr:29551 + test "$CX_STAGE_DIR" = /shared/containers/.stage + test -z "${CX_MORI_KERNEL_TYPE+x}${MORI_ENABLE_SDMA+x}" + + export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ + export CX_SHARD_SKU=mi355x CX_NODES=1 CX_GPUS_PER_NODE=8 + export CX_LOCK_DIR=/validated/amd-locks + cx_lock_canonical_gha_env mi355x + test "$CX_LOCK_DIR" = /validated/amd-locks + test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI355" + + unset COLLECTIVEX_CANONICAL_GHA + unset COLLECTIVEX_OPERATOR_CONFIG_LOADED + CX_IMAGE=manual CX_IMAGE_DIGEST=manual CX_NGPUS=3 + CX_MORI_KERNEL_TYPE=manual + cx_lock_canonical_gha_env mi355x + test "$CX_IMAGE:$CX_IMAGE_DIGEST:$CX_NGPUS:$CX_MORI_KERNEL_TYPE" = manual:manual:3:manual + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as workspace: + Path(workspace).chmod(0o720) + subprocess.run( + ["bash", "-c", command, "_", str(common)], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": workspace, + }, + ) + self.assertEqual(list(Path(workspace).iterdir()), []) + + def test_canonical_amd_stage_rejects_a_world_writable_workspace(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export 
CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers + cx_lock_canonical_gha_env mi325x + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as workspace: + Path(workspace).chmod(0o702) + result = subprocess.run( + ["bash", "-c", command, "_", str(common)], + text=True, + capture_output=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": workspace, + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("canonical AMD staging workspace is unsafe", result.stderr) + self.assertNotIn(workspace, result.stderr) + + def test_canonical_amd_stage_rejects_a_symlinked_workspace(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers + cx_lock_canonical_gha_env mi325x + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as temporary: + root = Path(temporary) + real = root / "real" + real.mkdir() + link = root / "workspace" + link.symlink_to(real, target_is_directory=True) + result = subprocess.run( + ["bash", "-c", command, "_", str(common)], + text=True, + capture_output=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": str(link), + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("canonical AMD staging workspace is unsafe", result.stderr) + self.assertNotIn(str(root), result.stderr) + + def test_image_selection_and_registry_verification_are_fail_closed(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + test "$(cx_default_image mi325x)" = "$CX_IMAGE_AMD_MORI_MI325" + test "$(cx_default_image 
mi355x)" = "$CX_IMAGE_AMD_MORI" + pinned="sha256:$(printf 'a%.0s' {1..64})" + curl() { + case "$*" in + *auth.docker.io*) printf '{"token":"test"}' ;; + *) printf 'Docker-Content-Digest: %s\r\n' "$pinned" ;; + esac + } + test "$(cx_resolve_registry_digest ubuntu:latest)" = "$pinned" + test "$(cx_resolve_registry_digest docker.io/library/ubuntu:latest)" = "$pinned" + ! (cx_resolve_registry_digest "ubuntu@$pinned") + ! (cx_resolve_registry_digest ghcr.io/example/image:tag) + ! (cx_resolve_registry_digest 'ubuntu@sha256:bad') + curl() { + case "$*" in *auth.docker.io*) printf '{"token":"test"}';; esac + } + ! (cx_resolve_registry_digest ubuntu:latest) + cx_resolve_registry_digest() { printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST"; } + cx_verify_registry_image "$CX_IMAGE_MULTIARCH" + test "$COLLECTIVEX_IMAGE_DIGEST_VERIFIED" = 1 + test "$COLLECTIVEX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST" + cx_reverify_registry_image "$CX_IMAGE_MULTIARCH" + cx_resolve_registry_digest() { printf 'sha256:%064d' 0; } + ! (cx_reverify_registry_image "$CX_IMAGE_MULTIARCH") + ! (cx_verify_registry_image "$CX_IMAGE_MULTIARCH") + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common)], + check=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + + def test_canonical_gha_requires_compute_visible_staging(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + repo = Path(temporary) / "repo" + squash = Path(temporary) / "squash" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + squash.mkdir() + (source / "public.py").write_text("public\n") + (source / "private-infra.md").write_text("private\n") + command = r''' + set -euo pipefail + source "$1" + unset CX_SHARD_FILE CX_STAGE_DIR + ! (COLLECTIVEX_CANONICAL_GHA=1; cx_stage_repo "$2" "") + staged="$(COLLECTIVEX_CANONICAL_GHA=0; cx_stage_repo "$2" "")" + test "$staged" != "$2" + test -f "$staged/experimental/CollectiveX/public.py" + test ! 
-e "$staged/experimental/CollectiveX/private-infra.md" + cx_cleanup_stage "$staged" "$2" + test ! -e "$staged" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(repo)], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SQUASH_DIR": str(squash), + }, + ) + self.assertEqual(list(squash.iterdir()), []) + + def test_manual_stage_does_not_write_to_checkout_parent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary) / "readonly-parent" + repo = parent / "repo" + squash = parent / "squash" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + squash.mkdir(mode=0o700) + (source / "public.py").write_text("public\n") + original_mode = parent.stat().st_mode & 0o777 + parent.chmod(0o555) + try: + command = r''' + set -euo pipefail + source "$1" + unset CX_STAGE_DIR + staged="$(cx_stage_repo "$2" "")" + case "$staged" in "$3"/.collectivex-stage-*) ;; *) exit 1 ;; esac + test -f "$staged/experimental/CollectiveX/public.py" + test ! -e "$4/.collectivex-stage" + cx_cleanup_stage "$staged" "$2" + test ! 
-e "$staged" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), + str(squash), str(parent), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SQUASH_DIR": str(squash), + }, + ) + finally: + parent.chmod(original_mode) + self.assertEqual( + sorted(path.name for path in parent.iterdir()), + ["repo", "squash"], + ) + self.assertEqual(list(squash.iterdir()), []) + + def test_stage_refuses_to_reuse_an_execution_child(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + base = root / "stage" + child = base / "job_collision" + child.mkdir(parents=True, mode=0o700) + sentinel = child / "keep" + sentinel.write_text("keep") + command = r''' + source "$1" + ! (cx_stage_repo "$2" "$3") + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_CANONICAL_GHA": "1", + "COLLECTIVEX_EXECUTION_ID": "collision", + "CX_STAGE_DIR": str(base), + }, + ) + self.assertEqual(sentinel.read_text(), "keep") + + def test_stage_removes_its_execution_child_when_rsync_fails(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + base = root / "stage" + sentinel = root / "rsync-called" + command = r''' + source "$1" + rsync() { : > "$RSYNC_CALLED"; return 1; } + ! 
cx_stage_repo "$2" "$3" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_CANONICAL_GHA": "1", + "CX_STAGE_DIR": str(base), + "RSYNC_CALLED": str(sentinel), + }, + ) + self.assertTrue(sentinel.is_file()) + self.assertEqual(list(base.iterdir()), []) + + def test_backend_cache_reuses_v3_and_falls_back_once_without_repair(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary) / "stage" + parent.mkdir(mode=0o700) + concurrent = Path(temporary) / "concurrent" + concurrent.mkdir(mode=0o700) + command = r''' + set -euo pipefail + source "$1" + for worker in 1 2 3; do + ( + cx_prepare_backend_cache "$2" + printf '%s %s\n' "$CX_BACKEND_CACHE_SENTINEL_SHA256" \ + "$CX_PREPARED_BACKEND_CACHE" > "$3/$worker" + ) & + done + wait + cmp "$3/1" "$3/2" + cmp "$3/1" "$3/3" + cx_prepare_backend_cache "$2" + first="$CX_PREPARED_BACKEND_CACHE" + first_digest="$CX_BACKEND_CACHE_SENTINEL_SHA256" + chmod 2700 "$first" + cx_prepare_backend_cache "$2" + second="$CX_PREPARED_BACKEND_CACHE" + test "$first" = "$second" + test "$first_digest" = "$CX_BACKEND_CACHE_SENTINEL_SHA256" + test "$first" = "$(cd "$2" && pwd -P)/.collectivex-backend-cache-v3-$(id -u)" + export CX_BACKEND_CACHE_ROOT="$first" + cx_verify_backend_cache_mount + export CX_BACKEND_CACHE_SENTINEL_SHA256="$(printf '0%.0s' {1..64})" + ! 
cx_verify_backend_cache_mount + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(parent), + str(concurrent), + ], + check=True, + ) + cache = parent / f".collectivex-backend-cache-v3-{os.getuid()}" + self.assertTrue(cache.is_dir()) + self.assertEqual(cache.stat().st_mode & 0o777, 0o700) + self.assertEqual( + list(cache.glob(".collectivex-mount-sentinel-v1.tmp.*")), [] + ) + alias = Path(temporary) / "stage-alias" + alias.symlink_to(parent, target_is_directory=True) + canonical = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_prepare_backend_cache "$2"; ' + 'printf "%s\\n%s\\n" "$CX_PREPARED_BACKEND_CACHE" ' + '"$CX_BACKEND_CACHE_SENTINEL_SHA256"', + "_", str(common), str(alias), + ], + text=True, + capture_output=True, + check=True, + ) + cache_path, digest = canonical.stdout.splitlines() + self.assertEqual(cache_path, str(cache.resolve())) + self.assertRegex(digest, r"^[0-9a-f]{64}$") + saved = parent / "saved-cache" + cache.rename(saved) + cache.mkdir(mode=0o700) + replacement = cache / ".collectivex-mount-sentinel-v1" + replacement.write_bytes(b"replacement".ljust(32, b"!")) + replacement.chmod(0o600) + replaced = subprocess.run( + [ + "bash", "-c", + 'source "$1"; export CX_BACKEND_CACHE_ROOT="$2" ' + 'CX_BACKEND_CACHE_SENTINEL_SHA256="$3"; ' + 'cx_verify_backend_cache_mount', + "_", str(common), str(cache), digest, + ] + ) + self.assertNotEqual(replaced.returncode, 0) + replacement.unlink() + cache.rmdir() + saved.rename(cache) + (cache / ".collectivex-mount-sentinel-v1").unlink() + cache.rmdir() + target = Path(temporary) / "target" + target.mkdir(mode=0o700) + cache.symlink_to(target, target_is_directory=True) + fallback = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_prepare_backend_cache "$2"; ' + 'printf "%s\\n" "$CX_PREPARED_BACKEND_CACHE"', + "_", str(common), str(parent), + ], + text=True, + capture_output=True, + check=True, + ) + v4 = parent / f".collectivex-backend-cache-v4-{os.getuid()}" + 
self.assertEqual(fallback.stdout.strip(), str(v4.resolve())) + self.assertTrue(cache.is_symlink()) + self.assertTrue(v4.is_dir()) + (v4 / ".collectivex-mount-sentinel-v1").unlink() + v4.rmdir() + v4.symlink_to(target, target_is_directory=True) + result = subprocess.run( + [ + "bash", "-c", 'source "$1"; cx_prepare_backend_cache "$2"', + "_", str(common), str(parent), + ], + text=True, + capture_output=True, + ) + self.assertNotEqual(result.returncode, 0) + self.assertNotIn(str(parent), result.stderr) + self.assertTrue(cache.is_symlink()) + self.assertTrue(v4.is_symlink()) + + source = common.read_text().split("cx_prepare_backend_cache() {", 1)[1] + program = source.split("<<'PY'\n", 1)[1].split("\nPY\n", 1)[0] + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary) / "stage" + parent.mkdir(mode=0o700) + fake_os = types.ModuleType("os") + fake_os.__dict__.update(os.__dict__) + fake_os.fsync = mock.Mock(side_effect=OSError("forced fsync failure")) + with ( + mock.patch.dict(sys.modules, {"os": fake_os}), + mock.patch.object(sys, "argv", ["-", str(parent)]), + mock.patch.object(sys, "stdout", io.StringIO()), + self.assertRaises(SystemExit) as failure, + ): + exec(compile(program, "", "exec"), {}) + self.assertEqual(failure.exception.code, 1) + self.assertEqual( + list(parent.rglob(".collectivex-mount-sentinel-v1.tmp.*")), [] + ) + + def test_nvidia_namespace_package_roots_come_from_distribution_files(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + site = Path(temporary) / "site" + package = site / "nvidia" / "nccl" + (package / "include").mkdir(parents=True) + (package / "lib").mkdir() + (package / "include" / "nccl.h").write_text("header\n") + (package / "lib" / "libnccl.so.2").write_text("library\n") + info = site / "nvidia_nccl_cu13-2.30.4.dist-info" + info.mkdir() + (info / "METADATA").write_text( + "Metadata-Version: 2.1\nName: nvidia-nccl-cu13\nVersion: 2.30.4\n" + ) + 
(info / "RECORD").write_text( + "nvidia/nccl/include/nccl.h,,\n" + "nvidia/nccl/lib/libnccl.so.2,,\n" + "nvidia_nccl_cu13-2.30.4.dist-info/METADATA,,\n" + "nvidia_nccl_cu13-2.30.4.dist-info/RECORD,,\n" + ) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_nvidia_package_root()/,/^}/p' "$1")" + root="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" + test "$root" = "$2/nvidia/nccl" + ! cx_nvidia_package_root nvidia-nccl-cu13 nvshmem + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(site.resolve())], + check=True, + env={**os.environ, "PYTHONPATH": str(site)}, + ) + + def test_cuda_cccl_exports_the_resolved_jit_toolchain_root(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + toolkit = root / "cuda-13.0" + (toolkit / "bin").mkdir(parents=True) + (toolkit / "include").mkdir() + (toolkit / "lib64").mkdir() + cccl = toolkit / "targets" / "x86_64-linux" / "include" / "cccl" + cccl.mkdir(parents=True) + nvcc = toolkit / "bin" / "nvcc" + nvcc.write_text("#!/bin/sh\nexit 0\n") + nvcc.chmod(0o755) + alias = root / "cuda" + alias.symlink_to(toolkit, target_is_directory=True) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_prepare_cuda_cccl()/,/^}/p' "$1")" + cx_prepare_cuda_cccl + test "$CUDA_HOME" = "$2" + test "$CX_CUDA_CCCL" = "$2/targets/x86_64-linux/include/cccl" + test "$CPATH" = "$2/targets/x86_64-linux/include/cccl:" + test "$NVCC_PREPEND_FLAGS" = "-I$2/targets/x86_64-linux/include/cccl " + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(toolkit.resolve())], + check=True, + env={ + **os.environ, + "PATH": f"{alias / 'bin'}:{os.environ['PATH']}", + "CPATH": "", + "NVCC_PREPEND_FLAGS": "", + }, + ) + + def test_deepep_v2_toolchain_rejects_overlay_lock_failure(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo 
pipefail + eval "$(sed -n '/^cx_prepare_deepep_toolchain()/,/^}/p' "$1")" + cache_root="$2" + cx_nvidia_package_root() { printf '%s' /unused; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_log() { :; } + flock() { return 1; } + ! cx_prepare_deepep_toolchain + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), temporary], + check=True, + ) + + def test_pinned_source_fetch_retries_transient_failures(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_git()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_fetch_revision()/,/^}/p' "$1")" + attempts=0 + expected_directory="$(cd -P -- "$3" && pwd -P)" + sleep() { :; } + git() { + local argument has_directory=0 has_trust=0 + if [ "$1" = '-c' ] && [ "$3" = init ]; then + mkdir -p "${@: -1}" + return 0 + fi + for argument in "$@"; do + [ "$argument" != '-C' ] || has_directory=1 + [ "$argument" != "safe.directory=$expected_directory" ] || has_trust=1 + [ "$argument" != 'safe.directory=*' ] || return 1 + done + [ "$has_directory" = 0 ] || [ "$has_trust" = 1 ] || return 1 + case " $* " in + *' fetch '*) + attempts=$((attempts + 1)) + [ "$attempts" = 3 ] + ;; + *' rev-parse HEAD '*) printf '%s\n' "$revision" ;; + *) return 0 ;; + esac + } + cx_fetch_revision https://example.invalid/repo "$2" "$3" + test "$attempts" = 3 + ''' + revision = "a" * 40 + subprocess.run( + ["bash", "-c", command, "_", str(common), revision, temporary], + check=True, + ) + + def test_git_tree_trust_is_exact_and_command_scoped(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + repository = root / "repo" + repository.mkdir() + alias = root / "alias" + alias.symlink_to(repository, target_is_directory=True) + wildcard = root / "*" + wildcard.mkdir() + arguments = root / "arguments" + command 
= r''' + set -euo pipefail + eval "$(sed -n '/^cx_git()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")" + arguments="$4" + git() { printf '%s\n' "$@" > "$arguments"; } + cx_git_in_tree "$2" status --porcelain + ! cx_git_in_tree relative status + ! cx_git_in_tree "$3" status + ! cx_git_in_tree "$5" status + ''' + subprocess.run( + [ + "bash", + "-c", + command, + "_", + str(common), + str(repository), + str(alias), + str(arguments), + str(wildcard), + ], + check=True, + ) + self.assertEqual( + arguments.read_text().splitlines(), + [ + "-c", + "credential.helper=", + "-c", + f"safe.directory={repository.resolve()}", + "-C", + str(repository.resolve()), + "status", + "--porcelain", + ], + ) + self.assertNotIn("safe.directory=*", arguments.read_text()) + + def test_runtime_materializes_the_verified_host_source_without_network(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + seed = root / "seed" + seed.mkdir() + (seed / "pinned").write_text("source\n") + destination = root / "build" + fetched = root / "network-fetch" + command = r''' + set -euo pipefail + source "$1" + export CX_BACKEND_SOURCE_ROOT="$2/source" + SEED="$3" FETCHED="$5" + copy_mode= + cx_backend_source_path() { printf '%s' "$SEED"; } + cx_backend_source_is_valid() { test -f "$2/pinned"; } + cx_fetch_revision() { : > "$FETCHED"; return 1; } + cp() { + test "$1" = -R + copy_mode=recursive + command cp "$@" + } + cx_materialize_backend_source deepep-hybrid "$4" + test -f "$4/pinned" + test "$copy_mode" = recursive + python3 - "$4" <<'PY' +import os +import stat +import sys +assert stat.S_IMODE(os.stat(sys.argv[1]).st_mode) == 0o700 +PY + test ! 
-e "$FETCHED" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(root), + str(seed), str(destination), str(fetched), + ], + check=True, + ) + + def test_backend_source_validation_rejects_status_errors_and_ignored_files(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo pipefail + source "$1" + cx_backend_source_pin() { printf '%s|%s|' revision tree; } + git() { + case " $* " in + *' rev-parse HEAD '*) printf '%s\n' revision ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' tree ;; + *' status --porcelain '*) [ "$mode" != status-error ] ;; + *' ls-files --others --ignored '*) + [ "$mode" != ignored ] || printf '%s\n' ignored.bin + ;; + *) return 1 ;; + esac + } + mode=status-error + ! cx_backend_source_is_valid backend "$2" + mode=ignored + ! cx_backend_source_is_valid backend "$2" + mode=clean + cx_backend_source_is_valid backend "$2" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), temporary], + check=True, + ) + + def test_backend_source_root_normalizes_inherited_special_mode(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + source_root = root / "experimental" / "CollectiveX" / ".cx_sources" + source = source_root / "backend-revision" + source.mkdir(parents=True) + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_EXECUTION_ID="source-mode-$$" + trap 'cx_cleanup_private_logs 0' EXIT + expected_mount="$2" + expected_source="$3" + expected_root="${expected_source%/*}" + observed_mode=2700 + mock_stage_owner=4200 + mock_root_owner=4200 + chmod_calls=0 + chmod() { + test "$1" = 700 && test "$2" = "$expected_root" + chmod_calls=$((chmod_calls + 1)) + [ "$chmod_calls" = 2 ] || return 1 + observed_mode=700 + } + stat() { + case "$2" in + %u) + case "$3" in + "$expected_mount") printf '%s\n' "$mock_stage_owner" ;; + "$expected_root") printf 
'%s\n' "$mock_root_owner" ;; + *) return 1 ;; + esac + ;; + %a) + case "$3" in + "$expected_mount") printf '2700\n' ;; + "$expected_root") printf '%s\n' "$observed_mode" ;; + *) return 1 ;; + esac + ;; + *) return 1 ;; + esac + } + cx_backend_source_path() { printf '%s' "$expected_source"; } + cx_backend_source_is_valid() { + test "$1" = backend && test "$2" = "$expected_source" + } + cx_prepare_backend_source "$2" backend + test "$observed_mode" = 2700 + test "$chmod_calls" = 0 + observed_mode=2750 + ! _cx_prepare_backend_source "$2" backend + test "$chmod_calls" = 1 + _cx_prepare_backend_source "$2" backend + test "$observed_mode" = 700 + mock_root_owner=4300 + ! _cx_prepare_backend_source "$2" backend + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), str(root), str(source)], + check=True, + ) + + def test_canonical_backend_sources_use_verified_seed_without_network(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + mount = root / "mount" + source_root = mount / "experimental" / "CollectiveX" / ".cx_sources" + seed_root = root / "seed" + seeds = [ + seed_root / f"{backend}-revision" + for backend in ("backend-one", "backend-two") + ] + mount.mkdir(mode=0o700) + source_root.parent.mkdir(parents=True, mode=0o700) + for seed in seeds: + seed.mkdir(parents=True, mode=0o700) + (seed / "pinned").write_text("source\n") + network = root / "network" + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 + export CX_BACKEND_SOURCE_SEED_ROOT="$4" + export COLLECTIVEX_EXECUTION_ID="source-seed-$$" + trap 'cx_cleanup_private_logs 0' EXIT + NETWORK="$5" + stat() { + case "$2" in + %u) printf '4200\n' ;; + %a) printf '700\n' ;; + *) return 1 ;; + esac + } + cx_backend_source_path() { printf '%s/%s-revision' "$1" "$2"; } + cx_backend_source_is_valid() { test -f "$2/pinned"; } + cx_fetch_revision() { : > "$NETWORK"; return 1; } + 
cx_prepare_backend_source "$2" backend-one + cx_prepare_backend_source "$2" backend-two + test -f "$3/backend-one-revision/pinned" + test -f "$3/backend-two-revision/pinned" + test ! -e "$NETWORK" + rm -rf -- "$3/backend-one-revision" "$3/backend-two-revision" + unset CX_BACKEND_SOURCE_SEED_ROOT + ! _cx_prepare_backend_source "$2" backend-one + test ! -e "$NETWORK" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(mount), + str(source_root), str(seed_root), str(network), + ], + check=True, + ) + + def test_deepep_hybrid_cache_reuse_revalidates_extensions(self) -> None: + common = ROOT / "runtime" / "common.sh" + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "deep_ep_cpp.so").write_bytes(b"deep") + (root / "hybrid_ep_cpp.so").write_bytes(b"hybrid") + command = r''' + set -euo pipefail + chmod 700 "$3" + source "$1" + eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$2")" + eval "$(sed -n '/^cx_deepep_hybrid_cache_is_valid()/,/^}/p' "$2")" + revision=revision tree=tree + cx_git() { + case " $* " in + *' rev-parse HEAD '*) printf '%s\n' "$revision" ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' "$tree" ;; + *' status --porcelain '*|*' ls-files --others '*) return 0 ;; + *) return 1 ;; + esac + } + cx_git_in_tree() { shift; cx_git "$@"; } + marker="$3/.collectivex-complete" + digest="$(cx_extension_pair_sha256 "$3" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" + (umask 077; printf '%s\n%s\n%s\n' "$revision" "$tree" "$digest" > "$marker") + cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + printf changed > "$3/hybrid_ep_cpp.so" + ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + printf hybrid > "$3/hybrid_ep_cpp.so" + cp "$3/deep_ep_cpp.so" "$3/deep_ep_cpp-extra.so" + ! 
cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), str(runtime), temporary], + check=True, + ) + + def test_rack_backend_environment_is_shared_per_node_and_required(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + launcher = (ROOT / "launchers" / "launch_gb-nv.sh").read_text() + assignment = next( + line for line in launcher.splitlines() + if line.startswith("SOURCE_BACKEND_ENV=") + ) + self.assertNotIn("/tmp/.cx_backend_env", launcher) + self.assertIn('[ -f "$env_file" ] && [ -r "$env_file" ]', launcher) + self.assertIn('[ ! -L "$env_file" ]', launcher) + self.assertIn('$(stat -c "%u" "$env_root"):600', launcher) + self.assertIn('case "$(stat -c "%a" "$env_root")" in 700|[1-7]700)', launcher) + self.assertIn("node-${SLURM_NODEID}.sh", launcher) + self.assertIn("HybridEPBuffer", launcher) + self.assertIn('. "$env_file" || exit 66', launcher) + with tempfile.TemporaryDirectory() as temporary: + consumer = r''' + eval "$1" + env_root="$2/env" + SOURCE_BACKEND_ENV="${SOURCE_BACKEND_ENV//\/ix\/experimental\/CollectiveX\/.cx_backend\/env/$env_root}" + mkdir -p "$env_root" + env_file="$env_root/node-1.sh" + printf 'printf sourced > "$CX_SENTINEL"\n' > "$env_file" + chmod 600 "$env_file" + export CX_SENTINEL="$2/sentinel" + stat() { + [ "${STAT_FAIL:-0}" = 0 ] || return 1 + case "$2" in + %a) printf '%s\n' "$ROOT_MODE" ;; + %u) printf '1000\n' ;; + %u:%a) printf '%s\n' "$FILE_OWNER_MODE" ;; + *) return 2 ;; + esac + } + run_case() { + rm -f "$CX_SENTINEL" + ROOT_MODE="$1" FILE_OWNER_MODE="$2" STAT_FAIL="$3" SLURM_NODEID="$4" + ( eval "$SOURCE_BACKEND_ENV" ) + rc=$? + [ "$rc" = "$5" ] || return 1 + if [ "$5" = 0 ]; then + [ -f "$CX_SENTINEL" ] + else + [ ! 
-e "$CX_SENTINEL" ] + fi + } + run_case 700 1000:600 0 1 0 + run_case 2700 1000:600 0 1 0 + run_case 755 1000:600 0 1 66 + run_case 700 1000:600 1 1 66 + run_case 700 2000:600 0 1 66 + mv "$env_file" "$env_file.real" + ln -s "$env_file.real" "$env_file" + run_case 700 1000:600 0 1 66 + rm "$env_file" + mv "$env_file.real" "$env_file" + run_case 700 1000:600 0 invalid 66 + ''' + subprocess.run( + ["bash", "-c", consumer, "_", assignment, temporary], + check=True, + ) + command = r''' + set -euo pipefail + cd "$2" + eval "$(sed -n '/^cx_persist_backend_env()/,/^}/p' "$1")" + export SLURM_NODEID=1 PYTHONPATH=/ix/pinned DEEPEP_COMMIT=abc + cx_persist_backend_env + env_file="$PWD/.cx_backend/env/node-1.sh" + test -f "$env_file" + test "$(stat -f %Lp "$env_file" 2>/dev/null || stat -c %a "$env_file")" = 600 + unset PYTHONPATH DEEPEP_COMMIT + . "$env_file" + test "$PYTHONPATH" = /ix/pinned + test "$DEEPEP_COMMIT" = abc + SLURM_NODEID=invalid && ! cx_persist_backend_env + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), temporary], + check=True, + ) + + def test_stage_cleanup_failure_fails_job_but_marks_allocation_safe(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "repo").mkdir() + (root / "stage").mkdir() + command = r''' + source "$1" + cx_write_cleanup_guard() { + rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe" + : > "$CX_JOB_ROOT/cleanup-$1" + } + cx_cleanup_stage() { return 1; } + cx_cleanup_private_logs() { : > "$CX_JOB_ROOT/logs-deleted"; } + export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/stage" + export COLLECTIVEX_CANONICAL_GHA=1 CX_ALLOCATION_REQUESTED=0 + unset CX_BENCH JOB_ID + cx_launcher_cleanup 0 + ''' + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(root)], + text=True, + capture_output=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + self.assertEqual(result.returncode, 1, 
result.stderr) + self.assertTrue((root / "cleanup-safe").is_file()) + self.assertFalse((root / "cleanup-unsafe").exists()) + self.assertFalse((root / "logs-deleted").exists()) + + def test_generated_stage_cleanup_never_removes_configured_base(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + base = root / "stage" + repo = root / "repo" + generated = base / "job_execution" + generated.mkdir(parents=True) + repo.mkdir() + (generated / "payload").write_text("temporary") + subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_cleanup_stage "$2" "$3"; ' + '! cx_cleanup_stage "$4" "$3"', + "_", str(ROOT / "runtime" / "common.sh"), str(generated), + str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_EXECUTION_ID": "execution", + "CX_STAGE_DIR": str(base), + }, + ) + self.assertFalse(generated.exists()) + self.assertTrue(base.is_dir()) + self.assertTrue(repo.is_dir()) + + def test_adapters_do_not_retain_dead_expected_methods(self) -> None: + for path in HERE.glob("ep_*.py"): + tree = ast.parse(path.read_text(), str(path)) + methods = { + node.name for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + } + self.assertNotIn("expected", methods, path.name) + + def test_artifact_safety_rejects_sensitive_material(self) -> None: + private_address = ".".join(str(octet) for octet in (10, 0, 0, 1)) + secret = "github_pat_" + "A" * 24 + sensitive = { + "ipv4": ({"note": private_address}, private_address), + "ipv6": ({"note": "[2001:db8::1]:29500"}, "2001:db8::1"), + "user-at-host": ({"note": "ssh admin@private-host"}, "admin@private-host"), + "hostname": ({"note": "host=compute-17"}, "compute-17"), + "private-dns": ({"note": "worker-7.cluster.local"}, "worker-7.cluster.local"), + "suffixed-host": ({"worker_hostname": "relative"}, "worker_hostname"), + "suffixed-address": ({"control_address": "relative"}, "control_address"), + 
"suffixed-path": ({"scheduler_path": "relative"}, "scheduler_path"), + "exact-address": ({"address": "relative"}, "address"), + "exact-ip": ({"ip": "relative"}, "ip"), + "camel-host": ({"workerHost": "relative"}, "workerHost"), + "camel-path": ({"schedulerPath": "relative"}, "schedulerPath"), + "acronym-gpu-uuid": ({"gpuUUID": "relative"}, "gpuUUID"), + "acronym-device-uuid": ({"deviceUUID": "relative"}, "deviceUUID"), + "acronym-pci-bus": ({"pciBusID": "relative"}, "pciBusID"), + "mac-address": ({"note": "00:11:22:33:44:55"}, "00:11:22:33:44:55"), + "ib-guid": ({"note": "00:11:22:33:44:55:66:77"}, "00:11:22:33:44:55:66:77"), + "dgx-host": ({"note": "dgx-b300-001"}, "dgx-b300-001"), + "cloud-host": ({"note": "ip-10-20-30-40"}, "ip-10-20-30-40"), + "credential-field": ({"service_token": "short"}, "service_token"), + "prefixed-token": ({"note": secret}, secret), + "hf-token": ({"note": "hf_" + "A" * 24}, "hf_" + "A" * 24), + "payment-token": ({"note": "sk_live_" + "A" * 24}, "sk_live_" + "A" * 24), + "generic-secret": ({"note": "password=not-a-real-secret"}, "not-a-real-secret"), + } + for root in ("data", "it-share", "lustre", "raid", "nvme_home", "scratch", "gpfs", "fsx"): + value = f"/{root}/collectivex/run" + sensitive[f"private-root-{root}"] = ({"note": value}, value) + for name, (document, offending) in sensitive.items(): + with self.subTest(name=name), self.assertRaises( + artifact_safety.ArtifactSafetyError + ) as caught: + artifact_safety.assert_publication_safe([document]) + self.assertNotIn(offending, str(caught.exception)) + + artifact_safety.assert_publication_safe([{ + "runner": "b300", + "redaction": "sanitized-v1", + "path": "datasets/" + "a" * 64 + "/dataset.json", + "timing": "8:64:32", + "image_digest": "sha256:" + "b" * 64, + "source": "github.com", + }]) + for ref in ("release@candidate", "worker1-feature", "sk-refactor-long-component-name"): + artifact_safety.assert_publication_safe([{"ref": ref}]) + + def 
test_artifact_safety_cli_does_not_echo_sensitive_values(self) -> None: + private_value = ".".join(str(octet) for octet in (10, 24, 68, 12)) + with tempfile.TemporaryDirectory() as temporary: + path = Path(temporary) / "artifact.json" + path.write_text(json.dumps({"note": private_value})) + result = subprocess.run( + [sys.executable, str(ROOT / "artifact_safety.py"), str(path)], + text=True, + capture_output=True, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("forbidden ipv4-address value", result.stderr) + self.assertNotIn(private_value, result.stderr) + + def test_artifact_safety_rejects_linked_and_special_inputs(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + source = root / "source.json" + source.write_text("{}") + linked = root / "linked.json" + linked.symlink_to(source) + fifo = root / "fifo.json" + os.mkfifo(fifo) + for path in (linked, fifo): + with self.subTest(path=path.name), self.assertRaises( + artifact_safety.ArtifactSafetyError + ): + artifact_safety.load_documents([str(path)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py new file mode 100644 index 0000000000..89a6b46052 --- /dev/null +++ b/experimental/CollectiveX/tests/workload.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +"""Canonical, byte-stable CollectiveX routing workloads. + +A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent +file, and referenced by an immutable `workload_id`. Every promoted benchmark point consumes the +SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a +checksum match, not by trusting that two machines re-ran the same seeded generator. 
+ +Layout on disk (one workload = two files, basename = workload_id): + /.npz topk_idx [gt,topk] int32, topk_weights [gt,topk] float32 + /.manifest.json dims, routing profile, generator version, seed, SHA-256s + +Routing and gate weights come from a stdlib integer counter, not a framework RNG. The same +parameters therefore produce the same int32/float32 bytes across PyTorch and accelerator images. +""" +from __future__ import annotations + +from array import array +import bisect +import hashlib +import json +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import identity # noqa: E402 + +WORKLOAD_SCHEMA_VERSION = 1 +# Bump when the counter or byte encoding changes. The workload ID binds parameters and trace bytes. +GENERATOR_VERSION = "collectivex-routing-counter-v3" +GATE_WEIGHT_FORMAT = "counter-u16-normalized-f32" +ACTIVATION_GENERATOR = "collectivex-activation-counter-v3" +_MASK64 = (1 << 64) - 1 + + +def _sha256(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def _mix64(value: int) -> int: + value = (value + 0x9E3779B97F4A7C15) & _MASK64 + value = ((value ^ (value >> 30)) * 0xBF58476D1CE4E5B9) & _MASK64 + value = ((value ^ (value >> 27)) * 0x94D049BB133111EB) & _MASK64 + return value ^ (value >> 31) + + +def _counter(seed: int, token: int, slot: int, attempt: int, stream: int) -> int: + value = ( + (seed & _MASK64) + ^ (((token + 1) * 0xD2B74407B1CE6E93) & _MASK64) + ^ (((slot + 1) * 0xCA5A826395121157) & _MASK64) + ^ (((attempt + 1) * 0x9E3779B185EBCA87) & _MASK64) + ^ (((stream + 1) * 0xA24BAED4963EE407) & _MASK64) + ) + return _mix64(value) + + +def canonical_routing_rows( + global_tokens: int, experts: int, topk: int, routing: str, seed: int +) -> tuple[list[list[int]], list[list[float]]]: + """Generate distinct experts and normalized weights using exact integer counters.""" + if routing not in {"uniform", "zipf"}: + raise ValueError(f"unknown routing {routing!r} (uniform|zipf)") + if 
global_tokens <= 0 or experts <= 0 or topk <= 0 or topk > experts: + raise ValueError("global_tokens/experts/topk must be positive and topk <= experts") + + cumulative: list[int] | None = None + if routing == "zipf": + total = 0 + cumulative = [] + for expert in range(experts): + total += (1 << 32) // (expert + 1) + cumulative.append(total) + + indices: list[list[int]] = [] + weights: list[list[float]] = [] + for token in range(global_tokens): + selected: list[int] = [] + used: set[int] = set() + for slot in range(topk): + attempt = 0 + while True: + value = _counter(seed, token, slot, attempt, 0) + expert = ( + value % experts + if cumulative is None + else bisect.bisect_right(cumulative, value % cumulative[-1]) + ) + if expert not in used: + used.add(expert) + selected.append(expert) + break + attempt += 1 + if attempt > experts * 16: + raise RuntimeError("counter routing could not select distinct experts") + raw = [1 + _counter(seed, token, slot, 0, 1) % 65535 for slot in range(topk)] + denominator = float(sum(raw)) + indices.append(selected) + weights.append([value / denominator for value in raw]) + return indices, weights + + +def _canonical_bytes( + indices: list[list[int]], weights: list[list[float]] +) -> tuple[bytes, bytes]: + idx = array("i", (value for row in indices for value in row)) + gate = array("f", (value for row in weights for value in row)) + if idx.itemsize != 4 or gate.itemsize != 4: + raise RuntimeError("canonical workload requires 32-bit int and float arrays") + if sys.byteorder != "little": + idx.byteswap() + gate.byteswap() + return idx.tobytes(), gate.tobytes() + + +def trace_checksums( + indices: list[list[int]], weights: list[list[float]] +) -> dict[str, str]: + """Return the manifest hashes for exact logical or remapped routing rows.""" + idx_bytes, weight_bytes = _canonical_bytes(indices, weights) + return { + "topk_idx": _sha256(idx_bytes), + "topk_weights": _sha256(weight_bytes), + "trace": _sha256(idx_bytes + weight_bytes), + } + + 
+def canonical_member( + routing: str, + hidden: int, + topk: int, + experts: int, + ep_size: int, + tokens_per_rank: int, + seed: int, +) -> tuple[str, dict[str, str], list[list[int]], list[list[float]]]: + """Derive one canonical manifest member and retain its rows for proof checks.""" + global_tokens = ep_size * tokens_per_rank + indices, weights = canonical_routing_rows(global_tokens, experts, topk, routing, seed) + checksums = trace_checksums(indices, weights) + member = compute_workload_id( + routing, + hidden, + topk, + experts, + ep_size, + global_tokens, + seed, + trace_checksum=checksums["trace"], + ) + return member, checksums, indices, weights + + +def compute_workload_id(routing: str, hidden: int, topk: int, experts: int, + ep_size: int, global_tokens: int, seed: int, + generator: str = GENERATOR_VERSION, + trace_checksum: str | None = None) -> str: + """Deterministic ID over parameters and canonical trace bytes.""" + if generator != GENERATOR_VERSION: + raise ValueError(f"unsupported workload generator {generator!r}") + if trace_checksum is None: + indices, weights = canonical_routing_rows(global_tokens, experts, topk, routing, seed) + idx_bytes, weight_bytes = _canonical_bytes(indices, weights) + trace_checksum = _sha256(idx_bytes + weight_bytes) + key = { + "generator": generator, "routing": routing, "hidden": hidden, "topk": topk, + "experts": experts, "ep_size": ep_size, "global_tokens": global_tokens, + "seed": seed, "trace_sha256": trace_checksum, + "activation_generator": ACTIVATION_GENERATOR, + "activation_identity": compute_activation_identity(seed, hidden), + } + return identity.workload_id(key) + + +def compute_activation_identity(seed, hidden, generator=ACTIVATION_GENERATOR) -> str: + """Identity of the exact counter-derived activation generator.""" + key = f"counter|seed={seed}|hidden={hidden}|gen={generator}" + return _sha256(key.encode()) + + +def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank, + 
idx_np, weights_np): + """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib.""" + if experts % experts_per_rank: + raise ValueError("experts must be divisible by experts_per_rank") + idx_bytes = idx_np.astype(" str: + import numpy as np + os.makedirs(out_dir, exist_ok=True) + wid = manifest["workload_id"] + np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"), + topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32)) + with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh: + json.dump(manifest, fh, indent=2, sort_keys=True) + return wid + + +def load_workload(npz_path, verify=True): + """Load a canonical trace (numpy + stdlib only). Returns (idx_np, weights_np, manifest). + Raises ValueError if verify=True and the on-disk bytes don't match the manifest checksums.""" + import numpy as np + base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path + with open(base + ".manifest.json") as fh: + manifest = json.load(fh) + if manifest.get("workload_id") != os.path.basename(base): + raise ValueError(f"workload manifest ID does not match filename for {base}") + with np.load(base + ".npz", allow_pickle=False) as archive: + if set(archive.files) != {"topk_idx", "topk_weights"}: + raise ValueError(f"workload archive fields differ for {base}") + idx_np = np.ascontiguousarray(archive["topk_idx"]) + w_np = np.ascontiguousarray(archive["topk_weights"]) + if verify: + ok, reason = verify_workload(manifest, idx_np, w_np) + if not ok: + raise ValueError(f"workload checksum mismatch for {base}: {reason}") + return idx_np, w_np, manifest + + +def verify_workload(manifest, idx_np, weights_np): + """Recompute checksums and compare to the manifest. 
Returns (ok, reason).""" + import numpy as np + expected_fields = { + "schema_version", "workload_id", "generator_version", "gate_weight_format", "dims", + "routing_profile", "seed", "checksums", "activation_profile", "activation_generator", + "activation_identity", + } + if not isinstance(manifest, dict) or set(manifest) != expected_fields: + return False, "manifest fields differ from the v1 contract" + if (manifest["schema_version"] != WORKLOAD_SCHEMA_VERSION + or manifest["generator_version"] != GENERATOR_VERSION + or manifest["gate_weight_format"] != GATE_WEIGHT_FORMAT + or manifest["routing_profile"] not in {"uniform", "zipf"}): + return False, "manifest version or generator is unsupported" + if (isinstance(manifest["seed"], bool) or not isinstance(manifest["seed"], int) + or not identity.is_typed_id(manifest["workload_id"], "workload")): + return False, "manifest seed or workload ID is invalid" + dims = manifest["dims"] + dim_fields = {"hidden", "topk", "experts", "ep_size", "tokens_per_rank", + "global_tokens", "experts_per_rank"} + if not isinstance(dims, dict) or set(dims) != dim_fields: + return False, "manifest dimensions are invalid" + if any(isinstance(dims[key], bool) or not isinstance(dims[key], int) or dims[key] <= 0 + for key in dim_fields): + return False, "manifest dimensions must be positive integers" + if (dims["experts"] != dims["ep_size"] * dims["experts_per_rank"] + or dims["global_tokens"] != dims["ep_size"] * dims["tokens_per_rank"]): + return False, "manifest EP dimensions are inconsistent" + shape = (dims["global_tokens"], dims["topk"]) + if (idx_np.dtype != np.int32 or weights_np.dtype != np.float32 + or idx_np.shape != shape or weights_np.shape != shape + or not idx_np.flags.c_contiguous or not weights_np.flags.c_contiguous): + return False, "workload array dtype, shape, or layout is invalid" + if (np.any(idx_np < 0) or np.any(idx_np >= dims["experts"]) + or np.any(np.diff(np.sort(idx_np, axis=1), axis=1) == 0)): + return False, 
"expert indices are out of range or repeated" + if (not np.isfinite(weights_np).all() or np.any(weights_np < 0) + or not np.allclose(weights_np.sum(axis=1), 1.0, rtol=1e-5, atol=1e-6)): + return False, "gate weights are invalid" + if (manifest["activation_profile"] != "canonical-counter-source-v3" + or manifest["activation_generator"] != ACTIVATION_GENERATOR + or manifest["activation_identity"] + != compute_activation_identity( + manifest["seed"], dims["hidden"], manifest["activation_generator"] + )): + return False, "activation identity is invalid" + ib = idx_np.astype(" must fail + idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256 + bad, _ = verify_workload(man2, idx2, w2) + assert not bad, "verify must catch tampering" + print(f"save/load/verify roundtrip OK (workload_id={wid})") + except ImportError: + print("(numpy unavailable — skipped serialization roundtrip; id logic passed)") + print("workload self-test: PASS") + sys.exit(0)