diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml
index 7ddaca285c..489bca7888 100644
--- a/.github/workflows/collectivex-sweep.yml
+++ b/.github/workflows/collectivex-sweep.yml
@@ -1,71 +1,300 @@
# CollectiveX Sweep — one structured run instead of thousands of dispatches.
#
-# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs
-# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that
-# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked
-# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job
-# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't
-# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer /
-# deepep-hybrid / nccl-ep, + deepep_v2) for full parity.
+# Shape: setup -> sweep. The matrix unit is a shard: one allocation that sweeps
+# cases sharing (sku, backend, nodes). Each cell uploads its privacy-checked raw
+# result JSONs. The isolated v1 publisher consumes downloaded shards separately.
name: CollectiveX Sweep
+permissions:
+ contents: read
on:
workflow_dispatch:
inputs:
backend:
- description: EP library to sweep (deepep matrix is remapped onto the others, capability-filtered)
+ description: "EP library to sweep — 'all' runs every EP backend in one matrix"
type: choice
- default: deepep
- options: [deepep, uccl, flashinfer, deepep-hybrid, nccl-ep]
- deepep_v2:
- description: DeepEP V2 from-source kernels (kernel_gen=v2; deepep backend only)
- type: boolean
- default: false
+ default: all
+ options: [all, deepep, deepep-v2, uccl, deepep-hybrid, mori, nccl-ep]
suites:
description: "'all' or comma-list of suite names"
type: string
default: all
only_sku:
- description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all
+ description: Restrict to one GHA runner pool (h100-dgxc|h200-dgxc|b300|b200-dgxc|gb200|gb300|mi325x|mi355x); blank = all
+ type: string
+ default: ''
+ min_nodes:
+ description: Keep only shards with >= this tray count (2 = rack-scale EP8 only; blank = all)
+ type: string
+ default: ''
+ max_nodes:
+ description: Keep only shards with <= this tray count (1 = single-tray EP4 only; blank = all)
type: string
default: ''
max_cases:
- description: Max cases per shard cell (chunk larger shards)
+ description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites)
type: string
- default: '14'
-
+ default: '128'
+ diagnostic_execution:
+ description: Temporary retained-log execution identity; blank runs the benchmark
+ type: string
+ default: ''
concurrency:
- group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }}
+ group: ${{ inputs.diagnostic_execution != '' && format('cx-diagnostic-{0}', inputs.diagnostic_execution) || format('cx-sweep-{0}-{1}-{2}', github.ref, inputs.backend, inputs.only_sku) }}
cancel-in-progress: false
jobs:
+ diagnostic:
+ if: ${{ inputs.diagnostic_execution != '' }}
+ runs-on: ${{ 'h100-dgxc' }}
+ timeout-minutes: 5
+ env:
+ EXECUTION_ID: ${{ inputs.diagnostic_execution }}
+ steps:
+ - name: Classify retained private log without disclosing it
+ run: |
+ python3 - <<'PY'
+ import hashlib
+ import json
+ import os
+ import re
+ import stat
+
+ execution = os.environ.get("EXECUTION_ID", "")
+ expected = "28706865182_1_h100-dgxc-deepep-v2-n1"
+ if execution != expected:
+ raise SystemExit("invalid diagnostic request")
+ root = f"/tmp/inferencex-collectivex-{os.getuid()}/{expected}"
+ try:
+ root_fd = os.open(
+ root, os.O_RDONLY | os.O_DIRECTORY | os.O_CLOEXEC | os.O_NOFOLLOW
+ )
+ except OSError:
+ raise SystemExit("retained diagnostic unavailable") from None
+ metadata = os.fstat(root_fd)
+ if (
+ not stat.S_ISDIR(metadata.st_mode)
+ or metadata.st_uid != os.getuid()
+ or stat.S_IMODE(metadata.st_mode) != 0o700
+ ):
+ raise SystemExit("private diagnostic directory is unsafe")
+
+ native_status = {}
+ native_sites = {}
+ exceptions = {}
+ trace_sites = {}
+ terms = {}
+ digests = []
+ total = 0
+ logs = 0
+ for name in sorted(os.listdir(root_fd)):
+ if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]{0,127}[.]log", name):
+ continue
+ fd = -1
+ try:
+ fd = os.open(
+ name,
+ os.O_RDONLY | os.O_CLOEXEC | os.O_NOFOLLOW,
+ dir_fd=root_fd,
+ )
+ item = os.fstat(fd)
+ if (
+ not stat.S_ISREG(item.st_mode)
+ or item.st_uid != os.getuid()
+ or stat.S_IMODE(item.st_mode) & 0o077
+ or item.st_nlink != 1
+ or item.st_size > 64 * 1024 * 1024
+ ):
+ raise RuntimeError
+ chunks = []
+ remaining = item.st_size
+ while remaining:
+ chunk = os.read(fd, min(1024 * 1024, remaining))
+ if not chunk:
+ raise RuntimeError
+ chunks.append(chunk)
+ remaining -= len(chunk)
+ if os.read(fd, 1):
+ raise RuntimeError
+ payload = b"".join(chunks)
+ except (OSError, RuntimeError):
+ raise SystemExit("retained diagnostic validation failed") from None
+ finally:
+ if fd >= 0:
+ os.close(fd)
+ logs += 1
+ total += len(payload)
+ digests.append(hashlib.sha256(payload).digest())
+ for line in payload.splitlines():
+ lower = line.lower()
+ match = re.search(rb" exception \([^()\n]*:[0-9]{1,6}\):\s*([0-9]{1,6})", line)
+ if match:
+ key = match.group(1).decode("ascii")
+ native_status[key] = native_status.get(key, 0) + 1
+ native_site = re.search(
+ rb" exception \([^()\n]*/(nccl[.]cu):([0-9]{1,6})\):\s*([0-9]{1,6})",
+ line,
+ )
+ if native_site:
+ key = ":".join(
+ part.decode("ascii") for part in native_site.groups()
+ )
+ native_sites[key] = native_sites.get(key, 0) + 1
+ for found in re.finditer(
+ rb"(? artifact for the cells; slim (no cases) -> the strategy output.
- python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $ov $v2 $os --out matrix_full.json >/dev/null
- SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))")
- echo "matrix=$SLIM" >> "$GITHUB_OUTPUT"
- echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT"
- python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))"
+ args=(--suites "$INPUT_SUITES" --max-cases "$INPUT_MAX_CASES")
+ case "$INPUT_BACKEND" in
+ all) args+=(--backends all) ;;
+ *) args+=(--backend "$INPUT_BACKEND") ;;
+ esac
+ [ -n "$INPUT_ONLY_SKU" ] && args+=(--only-sku "$INPUT_ONLY_SKU")
+ [ -n "$INPUT_MIN_NODES" ] && args+=(--min-nodes "$INPUT_MIN_NODES")
+ [ -n "$INPUT_MAX_NODES" ] && args+=(--max-nodes "$INPUT_MAX_NODES")
+ python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null
+ python3 artifact_safety.py matrix_full.json
+ SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='case_ids'} for x in m['include']]}))")
+ {
+ echo "matrix=$SLIM"
+ echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")"
+ echo "source_backends=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(' '.join(sorted({x['backend'] for x in m['include']} & {'deepep-v2','deepep-hybrid'})))")"
+ } >> "$GITHUB_OUTPUT"
+ unsupported_n=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(sum(x['disposition']=='unsupported' for x in m['requested_cases']))")
+ echo "unsupported_n=$unsupported_n" >> "$GITHUB_OUTPUT"
+ if [ "$unsupported_n" -gt 0 ]; then
+ python3 sweep_matrix.py --emit-unsupported-from matrix_full.json \
+ --out-dir unsupported
+ fi
+ python3 -c "import json;m=json.load(open('matrix_full.json'));r=m['requested_cases'];print('shard-cells:',len(m['include']),'runnable:',sum(x['disposition']=='runnable' for x in r),'unsupported:',sum(x['disposition']=='unsupported' for x in r))"
+ - name: Prepare pinned backend source archive
+ # Runs only when the generated matrix contains a backend listed in source_backends.
+ if: ${{ steps.gen.outputs.source_backends != '' }}
+ working-directory: experimental/CollectiveX
+ env:
+ SOURCE_BACKENDS: ${{ steps.gen.outputs.source_backends }}
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_sources
+ run: |
+ set -euo pipefail
+ # Provides the cx_* helpers used below (presumably defined in runtime/common.sh; not visible here).
+ source runtime/common.sh
+ work="$RUNNER_TEMP/collectivex-backend-sources"
+ archive="$RUNNER_TEMP/collectivex-backend-sources.tar"
+ # Start from a clean slate so a previous attempt cannot leak into this archive.
+ rm -rf -- "$work" "$archive"
+ umask 077
+ mkdir -m 700 "$work"
+ mkdir -p "$work/experimental/CollectiveX"
+ # SOURCE_BACKENDS is a space-separated list produced by the matrix generator.
+ read -r -a backends <<< "$SOURCE_BACKENDS"
+ [ "${#backends[@]}" -gt 0 ] \
+ || { echo "CollectiveX source backend list is empty" >&2; exit 1; }
+ for backend in "${backends[@]}"; do
+ cx_prepare_backend_source "$work" "$backend"
+ done
+ cx_cleanup_private_logs 0
+ # Normalize member order, timestamps, and ownership so the archive bytes do not
+ # depend on when or by which account it was built; the digest is printed for the log.
+ tar --sort=name --mtime='@1' --owner=0 --group=0 --numeric-owner \
+ -C "$work/experimental/CollectiveX" -cf "$archive" .cx_sources
+ sha256sum "$archive"
+ rm -rf -- "$work"
+ - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ if: ${{ steps.gen.outputs.source_backends != '' }}
+ with:
+ name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ runner.temp }}/collectivex-backend-sources.tar
+ if-no-files-found: error
+ retention-days: 3
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
- name: cxsweep-matrix-${{ github.run_id }}
+ name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }}
path: experimental/CollectiveX/matrix_full.json
if-no-files-found: error
+ - name: Validate unsupported artifact safety
+ id: unsupported_safety
+ # always() keeps this step eligible after an earlier failure. An empty unsupported_n means
+ # the gen step never published it; test for that first because fromJSON('') is an
+ # expression error, not a false condition.
+ if: ${{ always() && steps.gen.outputs.unsupported_n != '' && fromJSON(steps.gen.outputs.unsupported_n) > 0 }}
+ run: |
+ python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/unsupported/*.json
+ - name: Validate unsupported outcomes
+ id: unsupported_contracts
+ # The outcome check comes first so fromJSON is only reached once the safety step has run,
+ # which already implies unsupported_n parsed as a positive number.
+ if: ${{ always() && steps.unsupported_safety.outcome == 'success' && fromJSON(steps.gen.outputs.unsupported_n) > 0 }}
+ env:
+ COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }}
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported
+ run: |
+ python3 experimental/CollectiveX/contracts.py validate-delivery \
+ --source experimental/CollectiveX/matrix_full.json \
+ --disposition unsupported \
+ experimental/CollectiveX/unsupported/*.json
+ - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ # Upload only after both validators passed; outcome checks precede fromJSON for the same reason as above.
+ if: ${{ always() && steps.unsupported_safety.outcome == 'success' && steps.unsupported_contracts.outcome == 'success' && fromJSON(steps.gen.outputs.unsupported_n) > 0 }}
+ with:
+ name: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }}
+ path: experimental/CollectiveX/unsupported/*.json
+ if-no-files-found: error
# ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ----
sweep:
@@ -73,82 +302,248 @@ jobs:
if: ${{ fromJSON(needs.setup.outputs.n) > 0 }}
strategy:
fail-fast: false
- max-parallel: 10 # don't saturate the ~20-runner fleet; cells queue as slots free
+ max-parallel: 10
matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
- # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental).
- runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }}
+ runs-on: ${{ matrix.sku }}
timeout-minutes: 350
env:
CX_BENCH: ${{ matrix.backend }}
- CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }}
CX_NODES: ${{ matrix.nodes }}
- CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json
+ CX_GPUS_PER_NODE: ${{ matrix.gpus_per_node }}
+ CX_SCALE_UP_DOMAIN: ${{ matrix.scale_up_domain }}
+ CX_SHARD_FILE: .shards/${{ matrix.id }}.json
+ CX_SHARD_SKU: ${{ matrix.sku }}
+ COLLECTIVEX_CANONICAL_GHA: '1'
COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
- CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
- CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+ COLLECTIVEX_ARTIFACT_NAME: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }}
+ # Consolidated shards run one bounded build-group in one Slurm allocation, so
+ # the launcher's default 45-min --time is too short. 300 min covers a cold
+ # compute-node image import plus the shard. The allocation releases early
+ # when the shard finishes, so short shards don't waste it.
+ CX_TIME: '300'
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.id }}
+ CX_JOB_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}
+ CX_SOURCE_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/source
+ HOME: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/home
steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
- with: { clean: true }
+ - name: Prepare isolated source
+ id: source
+ env:
+ COLLECTIVEX_REPOSITORY: ${{ github.repository }}
+ run: |
+ set -euo pipefail
+ # Phase 1: sweep stale per-job roots left under /tmp by earlier runs on this runner.
+ python3 - <<'PY'
+ import os
+ import re
+ import shutil
+ import stat
+ import time
+
+ # Same naming scheme as CX_JOB_ROOT: inferencex-collectivex-<run>-<attempt>-<shard>.
+ pattern = re.compile(r"inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+")
+ # Only directories whose mtime is older than 24 hours are eligible.
+ cutoff = time.time() - 86400
+ for entry in os.scandir("/tmp"):
+ if not pattern.fullmatch(entry.name):
+ continue
+ try:
+ metadata = entry.stat(follow_symlinks=False)
+ except FileNotFoundError:
+ # The entry vanished between scandir and stat; nothing to clean.
+ continue
+ # Skip anything that is not a real directory owned by this uid with mode 0700,
+ # or that was modified within the cutoff window.
+ if (
+ not stat.S_ISDIR(metadata.st_mode)
+ or metadata.st_uid != os.getuid()
+ or stat.S_IMODE(metadata.st_mode) != 0o700
+ or metadata.st_mtime >= cutoff
+ ):
+ continue
+ # Require a marker file (regular, same owner, mode 0600) before deleting, so only
+ # directories created by this workflow's own steps are ever removed.
+ marked = False
+ for marker_name in ("cleanup-safe", "cleanup-unsafe"):
+ try:
+ marker = os.stat(
+ os.path.join(entry.path, marker_name), follow_symlinks=False
+ )
+ except FileNotFoundError:
+ continue
+ marked = (
+ stat.S_ISREG(marker.st_mode)
+ and marker.st_uid == os.getuid()
+ and stat.S_IMODE(marker.st_mode) == 0o600
+ )
+ if marked:
+ break
+ if marked:
+ shutil.rmtree(entry.path)
+ PY
+ # Phase 2: validate the paths derived from the matrix before creating anything.
+ [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ || { echo "CollectiveX isolated root is invalid" >&2; exit 1; }
+ [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \
+ || { echo "CollectiveX source root is invalid" >&2; exit 1; }
+ # Refuse to reuse a pre-existing path, including a dangling symlink.
+ if [ -e "$CX_JOB_ROOT" ] || [ -L "$CX_JOB_ROOT" ]; then
+ echo "CollectiveX isolated root already exists" >&2
+ exit 1
+ fi
+ umask 077
+ mkdir -m 700 -- "$CX_JOB_ROOT"
+ # From here on, a failure removes the freshly created root so no partial tree is left behind.
+ trap 'rc=$?; [ "$rc" = 0 ] || rm -rf -- "$CX_JOB_ROOT"; exit "$rc"' EXIT
+ mkdir -m 700 -- "$HOME" "$CX_JOB_ROOT/control" "$CX_JOB_ROOT/artifact" "$CX_SOURCE_ROOT"
+ # Marker read by the later cleanup steps and by the stale-root sweep above.
+ : > "$CX_JOB_ROOT/cleanup-safe"
+ # Phase 3: fetch exactly the pinned commit with system/global git config ignored.
+ # errexit is inactive inside an `if` condition, so the final SHA comparison is what
+ # decides success. stdin is detached and all output is discarded so git can neither
+ # prompt nor write to the job log.
+ if ! {
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null git init -q "$CX_SOURCE_ROOT"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" remote add origin \
+ "https://github.com/${COLLECTIVEX_REPOSITORY}.git"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" -c credential.helper= -c protocol.version=2 \
+ fetch -q --no-tags --depth=1 origin "$COLLECTIVEX_SOURCE_SHA"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" -c advice.detachedHead=false \
+ checkout -q --detach FETCH_HEAD
+ [ "$(git -C "$CX_SOURCE_ROOT" rev-parse HEAD)" = "$COLLECTIVEX_SOURCE_SHA" ]
+ } </dev/null >/dev/null 2>&1; then
+ echo "CollectiveX source preparation failed" >&2
+ exit 1
+ fi
+ # Re-check the root's mode after population before declaring the workspace prepared.
+ [ "$(stat -c '%a' "$CX_JOB_ROOT")" = 700 ] \
+ || { echo "CollectiveX isolated root has unsafe permissions" >&2; exit 1; }
+ echo 'prepared=true' >> "$GITHUB_OUTPUT"
+ # Success: hand ownership of the root to the later cleanup step.
+ trap - EXIT
- uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
- name: cxsweep-matrix-${{ github.run_id }}
- path: experimental/CollectiveX
- - name: Extract this shard's cases (stdlib only — no runner deps)
- working-directory: experimental/CollectiveX
+ name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ env.CX_JOB_ROOT }}/control
+ - name: Download pinned backend source archive
+ if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }}
+ uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+ with:
+ name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ env.CX_JOB_ROOT }}/control
+ - name: Install pinned backend source seed
+ if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }}
+ env:
+ EXPECTED_BACKEND: ${{ matrix.backend }}
run: |
set -euo pipefail
- python3 -c "
- import json
- m=json.load(open('matrix_full.json'))
- s=[x for x in m['include'] if x['id']=='${{ matrix.id }}']
- assert s, 'shard ${{ matrix.id }} not in matrix'
- s=s[0]
- json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w'))
- print('shard ${{ matrix.id }}:', len(s['cases']), 'cases')
- "
+ archive="$CX_JOB_ROOT/control/collectivex-backend-sources.tar"
+ destination="$CX_SOURCE_ROOT/experimental/CollectiveX"
+ seed_root="$destination/.cx_sources"
+ # The archive must have been downloaded, and nothing (not even a symlink) may already
+ # occupy the seed location in the fresh checkout.
+ [ -f "$archive" ] && [ ! -e "$seed_root" ] && [ ! -L "$seed_root" ] \
+ || { echo "CollectiveX backend source archive is missing or a source seed already exists" >&2; exit 1; }
+ # Inspect every member before extraction: reject the whole archive if any entry falls
+ # outside .cx_sources, contains "..", or is a link or special file.
+ python3 - "$archive" <<'PY'
+ from pathlib import PurePosixPath
+ import sys
+ import tarfile
+
+ with tarfile.open(sys.argv[1]) as archive:
+ for member in archive.getmembers():
+ path = PurePosixPath(member.name)
+ # An absolute name has "/" as its first part, so it fails the .cx_sources check too.
+ if (
+ not path.parts
+ or path.parts[0] != ".cx_sources"
+ or ".." in path.parts
+ or member.issym()
+ or member.islnk()
+ or member.isdev()
+ ):
+ raise SystemExit("invalid backend source archive")
+ PY
+ umask 077
+ # Extract as the current user with umask-derived modes rather than those stored in the archive.
+ tar --extract --no-same-owner --no-same-permissions \
+ --file "$archive" --directory "$destination"
+ source "$destination/runtime/common.sh"
+ # Helper semantics are not visible here; presumably these resolve and verify the pinned
+ # source tree for the expected backend.
+ source_path="$(cx_backend_source_path "$seed_root" "$EXPECTED_BACKEND")"
+ cx_backend_source_is_valid "$EXPECTED_BACKEND" "$source_path"
+ # Export the seed location to the remaining steps of this job.
+ printf 'CX_BACKEND_SOURCE_SEED_ROOT=%s\n' "$seed_root" >> "$GITHUB_ENV"
+ - name: Extract and validate this shard's cases
+ # Matrix values are passed through the environment so they are never spliced into the
+ # shell source text; the script only ever sees them as quoted variable expansions.
+ env:
+ SHARD_ID: ${{ matrix.id }}
+ EXPECTED_SKU: ${{ matrix.sku }}
+ EXPECTED_BACKEND: ${{ matrix.backend }}
+ EXPECTED_NODES: ${{ matrix.nodes }}
+ run: |
+ set -euo pipefail
+ cd "$CX_SOURCE_ROOT/experimental/CollectiveX" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ # Extract this shard's control file from the downloaded matrix, passing the values
+ # this matrix cell expects. CX_SHARD_FILE comes from the job-level env.
+ python3 sweep_matrix.py \
+ --extract-from "$CX_JOB_ROOT/control/matrix_full.json" \
+ --shard-id "$SHARD_ID" \
+ --expect-sku "$EXPECTED_SKU" \
+ --expect-backend "$EXPECTED_BACKEND" \
+ --expect-nodes "$EXPECTED_NODES" \
+ --out "$CX_SHARD_FILE" >/dev/null
- name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation)
+ id: sweep_shard
env:
- RUNNER_NAME: ${{ runner.name }}
- run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+ COLLECTIVEX_OPERATOR_CONFIG_CONTENT: ${{ secrets.COLLECTIVEX_OPERATOR_CONFIG_V1 }}
+ COLLECTIVEX_OPERATOR_CONFIG_REQUIRED: '1'
+ run: |
+ set -euo pipefail
+ umask 077
+ : > "$CX_JOB_ROOT/cleanup-unsafe"
+ rm -f -- "$CX_JOB_ROOT/cleanup-safe"
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ bash "experimental/CollectiveX/launchers/launch_${{ matrix.launcher }}.sh"
+ - name: Confirm allocation cleanup
+ id: allocation_cleanup
+ if: ${{ always() && steps.source.outputs.prepared == 'true' }}
+ run: |
+ set -euo pipefail
+ [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! -e "$CX_JOB_ROOT/cleanup-unsafe" ] \
+ || { echo "CollectiveX allocation cleanup was not confirmed" >&2; exit 1; }
+ - name: Validate shard artifact safety
+ id: artifact_safety
+ if: ${{ always() && steps.allocation_cleanup.outcome == 'success' }}
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json
+ - name: Validate shard delivery completeness
+ id: delivery_contracts
+ if: ${{ always() && steps.artifact_safety.outcome == 'success' }}
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/contracts.py validate-delivery \
+ --source "experimental/CollectiveX/${CX_SHARD_FILE}" \
+ experimental/CollectiveX/results/*.json
- name: Shard summary
- if: always()
- run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
+ if: ${{ always() && steps.artifact_safety.outcome == 'success' && steps.delivery_contracts.outcome == 'success' }}
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/summarize.py \
+ --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
+ - name: Stage shard artifact
+ id: stage_artifact
+ if: ${{ always() && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success' }}
+ run: |
+ set -euo pipefail
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ cp -- experimental/CollectiveX/results/*.json "$CX_JOB_ROOT/artifact/"
- name: Upload shard results
- if: always()
+ id: upload_artifact
+ if: always() && steps.stage_artifact.outcome == 'success' && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
- name: cxshard-${{ matrix.id }}-${{ github.run_id }}
- path: experimental/CollectiveX/results/*.json # glob skips the hidden .shard_*.json
- if-no-files-found: warn
-
- # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ----
- aggregate:
- needs: sweep
- if: always()
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
- with: { clean: true }
- - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
- with:
- pattern: cxshard-*-${{ github.run_id }}
- path: _shards
- merge-multiple: true
- - name: Aggregate shards -> one ndjson
- working-directory: experimental/CollectiveX
+ name: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }}
+ path: |
+ ${{ env.CX_JOB_ROOT }}/artifact/*.json
+ if-no-files-found: error
+ - name: Cleanup isolated workspace
+ if: ${{ always() && steps.source.outputs.prepared == 'true' }}
run: |
set -euo pipefail
- tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}"
- python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson"
- {
- echo "## CollectiveX sweep aggregate (${tag})"
- echo '```'
- wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
- - name: Upload aggregate
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }}
- path: experimental/CollectiveX/results/aggregate/*.ndjson
- if-no-files-found: warn
+ [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ || { echo "CollectiveX cleanup root is invalid" >&2; exit 1; }
+ [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \
+ || { echo "CollectiveX cleanup source is invalid" >&2; exit 1; }
+ [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! -e "$CX_JOB_ROOT/cleanup-unsafe" ] \
+ || { echo "CollectiveX allocation cleanup was not confirmed; retaining isolated files" >&2; exit 1; }
+ if [ '${{ steps.sweep_shard.outcome }}' = success ] \
+ && [ '${{ steps.allocation_cleanup.outcome }}' = success ] \
+ && [ '${{ steps.artifact_safety.outcome }}' = success ] \
+ && [ '${{ steps.delivery_contracts.outcome }}' = success ] \
+ && [ '${{ steps.stage_artifact.outcome }}' = success ] \
+ && [ '${{ steps.upload_artifact.outcome }}' = success ] \
+ && [ -f "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" ]; then
+ # shellcheck source=/dev/null
+ if source "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" \
+ >/dev/null 2>&1; then
+ cx_cleanup_private_logs 0
+ fi
+ fi
+ rm -rf -- "$CX_JOB_ROOT"
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 0000000000..56b307215b
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,15 @@
+__pycache__/
+*.pyc
+results/
+unsupported/
+.shards/
+.cx_workloads/
+.cx_backend/
+/matrix_full.json
+gpucore.*
+
+# Local plans and infrastructure inventory.
+goal.md
+notes.md
+configs/platforms.yaml
+private-infra.md
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 0000000000..bd01428974
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,115 @@
+# CollectiveX
+
+
+
+**English** | [中文](./README_zh.md)
+
+
+
+CollectiveX is an experimental MoE expert-parallel communication benchmark. It measures dispatch,
+combine, and paired roundtrip latency across EP libraries and accelerator systems.
+
+> Publication hold: historical schema 3-5 data is diagnostic. No current dataset is approved for
+> rankings, recommendations, or regression baselines.
+
+## v1 Execution Profile
+
+Every scheduled case is BF16, normal mode, `layout-and-dispatch-v1`, backend-tuned resources, packed
+placement, and `fixed-512-v1` sampling: 64 trials x 8 timed iterations with 32 synchronized full
+roundtrip warmups before each measured component at every trial/point. Roundtrip is measured first,
+and every backend uses the same phase-specific conditioning ramp and ascending point order. Routing is limited
+to uniform and one Zipf sensitivity scenario; EPLB is measured only
+as the Zipf remedy. Combine returns activation payload only on every backend; gate weights are verified
+at dispatch. A stdlib integer counter produces byte-identical routing and gate weights.
+
+The current matrix has 38 runnable allocation cells across H100, H200, B200, B300, GB200, GB300,
+MI325X, and MI355X. It requests 360 cases / 840 token points: 228 runnable cases / 532 points and
+132 explicit unsupported cases / 308 points. `sweep_matrix.py` materializes every token ladder and
+rejects missing, stale, malformed, or altered shard controls. Workflow shards are emitted round-robin
+by SKU so the bounded GHA matrix can use every available runner pool from its first scheduling cycle.
+
+| Backend | Current scope |
+|---|---|
+| DeepEP V1 | Image-pinned `deep_ep.Buffer`: upstream v1.2.1 on x86 and the image's GB fork on arm64 |
+| DeepEP V2 | PR #605 `ElasticBuffer` plus the upstream #630 scale-up fix; NCCL Device API LSA and source/SASS-bound reproducible JIT |
+| DeepEP Hybrid | Pinned `HybridEPBuffer`; realized auto-tuned config and JIT keys; NVLink/MNNVL domain |
+| UCCL | Pinned 0.1.1 wheel and wrapper on Hopper; Blackwell is explicitly unsupported |
+| NCCL/RCCL A2A | Portable rank-deduplicated payload plus expert/routing-metadata reference |
+| MoRI | MI325X AsyncLL transport and MI355X intranode transport |
+
+FlashInfer is outside v1 because its exercised EP path failed intermittently at runtime. It is not
+misreported as a platform capability limitation and can return after a stable pinned path is proven.
+
+DeepEP V2 means the `ElasticBuffer` implementation introduced by
+[DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605), not a newer legacy `Buffer` build.
+The pinned source is the minimal upstream [PR #630](https://github.com/deepseek-ai/DeepEP/pull/630)
+follow-up: its parent is the #605 merge tree and its only source change fixes pure scale-up
+initialization when GIN is unavailable. Every v1 V2 case fits inside its declared NVLink/MNNVL
+scale-up domain, so the adapter requests NCCL Device API LSA and disables network GIN. It then
+requires NCCL's realized LSA team to cover the full EP world; a smaller realized domain fails rather
+than being mislabeled. A true scale-out case must use and identify GIN separately. The isolated
+build records the API, source, loaded libraries, generated JIT source, executable SASS, and raw
+CUBIN diagnostics. NVIDIA SKUs remain unvalidated until their GPU outcomes pass the native
+correctness and publication gates.
+
+Removed v1 axes include cached-layout `[cl]`, runtime-visible `[rv]`, LL, FP8, quantized combine,
+extra routing distributions, activation profiles, uneven allocation, placement permutations, model
+envelopes, and scaling studies.
+
+## Workflow And Artifacts
+
+`.github/workflows/collectivex-sweep.yml` generates a public-SKU matrix, extracts a strict ignored
+`.shards/<shard-id>.json` control, executes one allocation per shard, privacy-checks result JSON, and uploads
+raw GitHub artifacts. Raw producers are diagnostic-only; they cannot self-promote evidence.
+
+Development publication uses one self-hosted persistent filesystem. GitHub artifacts are
+transient input; Vercel storage, GCP, Neon, managed databases, and managed object stores are out of
+scope. `publisher.py` ingests complete downloaded workflow artifacts, verifies or promotes explicit
+bundle IDs, and writes the atomic content-addressed layout consumed by the frontend. It never runs on
+GPU workers. The store contract and promotion gates are in [docs/methodology.md](docs/methodology.md).
+
+## Runner Configuration
+
+Runner-local Slurm and storage values use a strict per-SKU JSON document at
+`$XDG_CONFIG_HOME/inferencex/collectivex.json` or `COLLECTIVEX_OPERATOR_CONFIG`. The mode-0600,
+same-owner, non-symlink file is outside the checkout and never uploaded. Unknown runners, fields,
+duplicate keys, endpoint literals, unsafe paths, and non-JSON input fail closed; configuration is
+never evaluated as shell. GHA passes encrypted `COLLECTIVEX_OPERATOR_CONFIG_V1` content only to the
+launcher, which validates it, exports the selected SKU's allowlisted values, and deletes the
+temporary copy before allocation. Required JSON fields are:
+
+| SKU | Variables |
+|---|---|
+| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir` |
+| `h200-dgxc` | `partition`, `squash_dir` |
+| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `gb200` | `partition`, `account`, ordered `storage_roots` |
+| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` |
+| `mi325x`, `mi355x` | `partition`, `squash_dir` |
+
+Before import, each Docker Hub tag is resolved with bounded registry requests and must match its
+pinned digest; digest-qualified overrides are rejected. Enroot imports use a fixed filesystem epoch
+and a versioned, registry-digest-bound cache key. Every mounted squash is freshly hashed. The
+verified registry digest and local squash hash are both recorded. Image-provided DeepEP is checked
+against exact wheel and installed-file fingerprints; source-built backends use pinned commits and
+runtime-verified GPU targets. DeepEP V2's mode-0700 cluster-local build cache is keyed by a versioned
+build recipe, verified image, architecture, upstream trees, and dependency pins; only its fixed
+`/cx-cache` mount reaches the container, and it never enters result artifacts.
+Compute containers receive an explicit environment allowlist. Private host, address, device, NIC,
+credential, workspace, and path data stays in encrypted config, ignored operator notes, or bounded
+mode-0600 runner logs; it is never uploaded.
+
+## Local Checks
+
+```bash
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py'
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify
+bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh
+```
+
+Core paths are `capability.py`, `configs/`, `contracts.py`, `schemas/`, `sweep_matrix.py`,
+`publisher.py`, `runtime/`, `launchers/`, and `tests/`.
diff --git a/experimental/CollectiveX/README_zh.md b/experimental/CollectiveX/README_zh.md
new file mode 100644
index 0000000000..bed2172d40
--- /dev/null
+++ b/experimental/CollectiveX/README_zh.md
@@ -0,0 +1,111 @@
+# CollectiveX
+
+
+
+[English](./README.md) | **中文**
+
+
+
+CollectiveX 是实验性的 MoE 专家并行通信基准,用于测量不同 EP 库和加速器系统的
+dispatch、combine 及配对 roundtrip 延迟。
+
+> 发布暂停:历史 schema 3-5 数据仅供诊断。目前没有数据集获准用于排名、推荐或回归基线。
+
+## v1 执行配置
+
+每个调度用例均采用 BF16、normal mode、`layout-and-dispatch-v1`、后端调优资源、packed
+placement 以及 `fixed-512-v1` 采样:64 trials x 8 timed iterations;每个 trial/point 的每个
+被测组件前执行 32 次同步完整 roundtrip warmup。先测 roundtrip;所有后端使用相同的分阶段
+conditioning ramp 和升序点位。Routing 仅保留 uniform 和一个 Zipf 敏感性场景,EPLB 只作为
+Zipf 的修正方案测量。所有后端的 combine 仅返回 activation payload,gate weights 在 dispatch
+阶段接受校验。stdlib 整数计数器生成逐字节一致的 routing 和 gate weights。
+
+当前矩阵覆盖 H100、H200、B200、B300、GB200、GB300、MI325X 和 MI355X,共 38 个可运行
+allocation cells。矩阵请求 360 个 cases / 840 个 token points:228 个可运行 cases / 532 个
+points,以及 132 个显式 unsupported cases / 308 个 points。`sweep_matrix.py` 物化每个 token
+ladder,并拒绝缺失、过期、格式错误或被修改的 shard controls。Workflow shards 按 SKU
+round-robin 发出,使受限的 GHA matrix 从第一个调度周期起即可使用所有可用 runner pools。
+
+| 后端 | 当前范围 |
+|---|---|
+| DeepEP V1 | 镜像固定的 `deep_ep.Buffer`:x86 使用 upstream v1.2.1,arm64 使用镜像内 GB fork |
+| DeepEP V2 | PR #605 `ElasticBuffer` 加 upstream #630 scale-up 修复;NCCL Device API LSA 与 source/SASS 绑定的可复现 JIT |
+| DeepEP Hybrid | 固定的 `HybridEPBuffer`;记录实际自动调优配置与 JIT keys;NVLink/MNNVL domain |
+| UCCL | Hopper 上固定的 0.1.1 wheel 和 wrapper;Blackwell 显式标为 unsupported |
+| NCCL/RCCL A2A | 可移植的 rank-deduplicated payload 加 expert/routing-metadata reference |
+| MoRI | MI325X AsyncLL transport 和 MI355X intranode transport |
+
+FlashInfer 不在 v1 范围内,因为已测试的 EP path 在运行时存在间歇性失败。该问题不会被误报为
+平台能力限制;在证明有稳定的固定实现后可重新加入。
+
+DeepEP V2 指 [DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605) 引入的
+`ElasticBuffer` 实现,而不是更新的 legacy `Buffer` build。固定 source 使用最小化的 upstream
+[PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) 后续修复:其 parent 是 #605 merge
+tree,唯一 source 变更是修复 GIN 不可用时的纯 scale-up 初始化。v1 的所有 V2 cases 都位于各自
+声明的 NVLink/MNNVL scale-up domain 内,因此 adapter 请求 NCCL Device API LSA 并禁用网络
+GIN。随后必须确认 NCCL 实际建立的 LSA team 覆盖整个 EP world;若实际 domain 更小,case
+会直接失败而不会被错误标注。真正的 scale-out case 必须单独启用并标识 GIN。隔离构建会记录
+API、source、loaded libraries、generated JIT source、executable SASS 与 raw CUBIN
+diagnostics。在 GPU outcome 通过 native correctness 和 publication gates 前,各 NVIDIA SKU
+仍为 unvalidated。
+
+v1 已移除的轴包括 cached-layout `[cl]`、runtime-visible `[rv]`、LL、FP8、quantized combine、
+额外 routing distributions、activation profiles、uneven allocation、placement permutations、
+model envelopes 和 scaling studies。
+
+## Workflow 与产物
+
+`.github/workflows/collectivex-sweep.yml` 生成 public-SKU matrix,提取严格且被忽略的
+`.shards/<shard-id>.json` control,每个 shard 执行一次 allocation,对结果 JSON 做隐私检查并上传
+raw GitHub artifacts。Raw producers 仅供诊断,不能自行提升 evidence。
+
+开发阶段发布使用一个 self-hosted persistent filesystem。GitHub artifacts 仅作为临时输入;
+Vercel storage、GCP、Neon、managed databases 和 managed object stores 均不在范围内。
+`publisher.py` 摄取完整下载的 workflow artifacts,验证或提升显式 bundle IDs,并写入供前端
+使用的原子 content-addressed layout。它不会在 GPU workers 上运行。Store contract 和 promotion
+gates 见 [docs/methodology_zh.md](docs/methodology_zh.md)。
+
+## Runner 配置
+
+Runner 本地 Slurm 和 storage 值使用严格的 per-SKU JSON 文档,路径为
+`$XDG_CONFIG_HOME/inferencex/collectivex.json` 或 `COLLECTIVEX_OPERATOR_CONFIG`。该 mode-0600、
+同 owner、非 symlink 文件位于 checkout 外且永不上传。未知 runners、fields、duplicate keys、
+endpoint literals、unsafe paths 和非 JSON 输入均 fail closed;配置绝不作为 shell 执行。GHA
+仅将加密的 `COLLECTIVEX_OPERATOR_CONFIG_V1` 内容传给 launcher;launcher 验证后只导出所选
+SKU 的 allowlisted values,并在 allocation 前删除临时副本。必需 JSON fields 如下:
+
+| SKU | 变量 |
+|---|---|
+| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir` |
+| `h200-dgxc` | `partition`, `squash_dir` |
+| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `gb200` | `partition`, `account`, 有序 `storage_roots` |
+| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` |
+| `mi325x`, `mi355x` | `partition`, `squash_dir` |
+
+导入前,每个 Docker Hub tag 都通过有界 registry requests 解析,并且必须匹配固定 digest;拒绝
+digest-qualified overrides。Enroot imports 使用固定 filesystem epoch 和带版本、绑定 registry
+digest 的 cache key。每个已挂载 squash 都重新计算 hash,同时记录 verified registry digest 和
+local squash hash。镜像提供的 DeepEP 会按精确 wheel 和 installed-file fingerprints 检查;
+source-built backends 使用固定 commits 和 runtime-verified GPU targets。DeepEP V2 的 mode-0700
+cluster-local build cache 由版本化 build recipe、verified image、architecture、upstream
+trees 和 dependency pins 共同寻址;container 只看到固定的 `/cx-cache` mount,且该 cache 永不
+进入 result artifacts。
+Compute containers 仅接收显式 environment allowlist。Private host、address、device、NIC、
+credential、workspace 和 path 数据只保留在加密配置、忽略的 operator notes 或有界 mode-0600
+runner logs 中,永不上传。
+
+## 本地检查
+
+```bash
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py'
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify
+bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh
+```
+
+核心路径为 `capability.py`、`configs/`、`contracts.py`、`schemas/`、`sweep_matrix.py`、
+`publisher.py`、`runtime/`、`launchers/` 和 `tests/`。
diff --git a/experimental/CollectiveX/artifact_safety.py b/experimental/CollectiveX/artifact_safety.py
new file mode 100644
index 0000000000..83d522fba8
--- /dev/null
+++ b/experimental/CollectiveX/artifact_safety.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""Fail-closed privacy check for CollectiveX public result documents."""
+from __future__ import annotations
+
+import argparse
+import ipaddress
+import json
+import os
+import re
+import stat
+
+
+SENSITIVE_FIELDS = frozenset({
+ "environment", "env", "host", "hostname", "uuid", "gpu_uuid", "device_uuid",
+ "pci_bus_id", "ip_address", "ip_addresses", "master_addr", "ssh", "ssh_target",
+ "nodelist", "node_list", "nic_guid", "ib_guid", "topology_matrix", "rdma_devices",
+ "user", "username", "password", "passwd", "secret", "token", "access_token",
+ "api_token", "auth_token", "api_key", "private_key", "credential", "credentials",
+ "address", "addresses", "ip", "ips",
+})
+SENSITIVE_FIELDS_COMPACT = frozenset(item.replace("_", "") for item in SENSITIVE_FIELDS)
+SENSITIVE_FIELD_SUFFIXES = (
+ "_host", "_hostname", "_address", "_addresses", "_path", "_paths", "_ip", "_ips",
+ "_password", "_passwd", "_secret", "_token", "_credential", "_credentials",
+ "_uuid", "_guid", "_bus_id",
+)
+SENSITIVE_VALUE_PATTERNS = (
+ ("private-path", re.compile(
+ r"(? str:
+ normalized = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", str(value).strip())
+ normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized)
+ return normalized.lower().replace("-", "_")
+
+
+def _sensitive_value_rule(value: str, *, contextual: bool = True) -> str | None:
+    """Name the first sensitive-value rule that ``value`` trips, if any.
+
+    The ``SENSITIVE_VALUE_PATTERNS`` entries are tried in declaration order. When
+    ``contextual`` is false, rules named in ``CONTEXTUAL_VALUE_RULES`` are skipped.
+    If no pattern matches, each ``IPV6_CANDIDATE`` hit is parsed with ``ipaddress``
+    after dropping everything from the first ``%`` onward; a genuine IPv6 address
+    yields ``"ipv6-address"``.
+
+    Returns the rule name, or ``None`` when nothing sensitive is recognised.
+    """
+    first_hit = None
+    for rule_name, rule_pattern in SENSITIVE_VALUE_PATTERNS:
+        if not contextual and rule_name in CONTEXTUAL_VALUE_RULES:
+            continue
+        if rule_pattern.search(value):
+            first_hit = rule_name
+            break
+    if first_hit:
+        return first_hit
+
+    for candidate in IPV6_CANDIDATE.findall(value):
+        bare_address = candidate.partition("%")[0]
+        try:
+            parsed = ipaddress.ip_address(bare_address)
+        except ValueError:
+            # Looked like an address textually but does not parse as one.
+            continue
+        if parsed.version == 6:
+            return "ipv6-address"
+    return None
+
+
+def assert_publication_safe(docs: list[dict]) -> None:
+    """Raise ``ArtifactSafetyError`` if any document carries private data.
+
+    Every entry of ``docs`` must be a dict. Each one is traversed depth-first:
+
+    * a mapping key is rejected when its normalized name is a forbidden field
+      (exact match, underscore-free match, or forbidden suffix), or when the key
+      text itself trips a sensitive-value rule;
+    * a string value is rejected when it trips a sensitive-value rule; contextual
+      rules are skipped for strings whose enclosing field normalizes to ``ref``.
+
+    List elements inherit the field name of the list that contains them.
+    """
+    def field_is_forbidden(field: str) -> bool:
+        if field in SENSITIVE_FIELDS:
+            return True
+        if field.replace("_", "") in SENSITIVE_FIELDS_COMPACT:
+            return True
+        return field.endswith(SENSITIVE_FIELD_SUFFIXES)
+
+    def inspect(node, doc_index: int, parent_field: str | None = None) -> None:
+        if isinstance(node, dict):
+            for raw_key, child in node.items():
+                field = _normalized_field(raw_key)
+                if field_is_forbidden(field):
+                    raise ArtifactSafetyError(
+                        f"artifact safety: doc[{doc_index}] contains forbidden private field"
+                    )
+                offending_rule = _sensitive_value_rule(str(raw_key))
+                if offending_rule:
+                    raise ArtifactSafetyError(
+                        f"artifact safety: doc[{doc_index}] contains forbidden {offending_rule} key"
+                    )
+                inspect(child, doc_index, field)
+            return
+        if isinstance(node, list):
+            for element in node:
+                inspect(element, doc_index, parent_field)
+            return
+        if isinstance(node, str):
+            offending_rule = _sensitive_value_rule(node, contextual=parent_field != "ref")
+            if offending_rule:
+                raise ArtifactSafetyError(
+                    f"artifact safety: doc[{doc_index}] contains forbidden {offending_rule} value"
+                )
+
+    for position, document in enumerate(docs):
+        if not isinstance(document, dict):
+            raise ArtifactSafetyError(f"artifact safety: doc[{position}] is not a JSON object")
+        inspect(document, position)
+
+
+def load_documents(paths: list[str]) -> list[dict]:
+    """Read every JSON / NDJSON document from ``paths`` with fail-closed file checks.
+
+    Each path must ``lstat`` as a regular file owned by the current uid whose size is
+    between 1 and ``MAX_INPUT_BYTES`` bytes. The file is then opened read-only (with
+    ``O_NOFOLLOW`` where the platform defines it) and the open descriptor must still be
+    a regular file with the same device, inode and size as the ``lstat`` result.
+    Content is decoded as UTF-8. A path ending in ``.ndjson`` contributes one document
+    per non-blank line; any other path contributes exactly one document.
+
+    Raises:
+        ArtifactSafetyError: a file is unavailable, changed during open, unreadable,
+            or malformed, or no document was loaded at all.
+    """
+    loaded: list[dict] = []
+    for path in paths:
+        try:
+            before = os.lstat(path)
+        except OSError as exc:
+            raise ArtifactSafetyError("artifact safety: result file is unavailable") from exc
+        acceptable = (
+            stat.S_ISREG(before.st_mode)
+            and before.st_uid == os.getuid()
+            and 0 < before.st_size <= MAX_INPUT_BYTES
+        )
+        if not acceptable:
+            raise ArtifactSafetyError("artifact safety: result file is unavailable")
+
+        fd = -1
+        try:
+            fd = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+            after = os.fstat(fd)
+            unchanged = (
+                stat.S_ISREG(after.st_mode)
+                and (after.st_dev, after.st_ino, after.st_size)
+                == (before.st_dev, before.st_ino, before.st_size)
+            )
+            if not unchanged:
+                raise ArtifactSafetyError("artifact safety: result file changed during open")
+            with os.fdopen(fd, encoding="utf-8") as stream:
+                # The file object now owns the descriptor; the ``finally`` clause
+                # below must not close it a second time.
+                fd = -1
+                if not path.endswith(".ndjson"):
+                    loaded.append(json.load(stream))
+                else:
+                    for line_number, line in enumerate(stream, 1):
+                        if line.strip():
+                            try:
+                                loaded.append(json.loads(line))
+                            except json.JSONDecodeError as exc:
+                                raise ArtifactSafetyError(
+                                    f"artifact safety: malformed NDJSON at input line {line_number}"
+                                ) from exc
+        except json.JSONDecodeError as exc:
+            raise ArtifactSafetyError("artifact safety: malformed JSON input") from exc
+        except (OSError, UnicodeError) as exc:
+            raise ArtifactSafetyError("artifact safety: result file is unreadable") from exc
+        finally:
+            if fd >= 0:
+                os.close(fd)
+    if not loaded:
+        raise ArtifactSafetyError("artifact safety: no public result documents found")
+    return loaded
+
+
+def main() -> int:
+    """Command-line entry point: load the given result files and privacy-check them.
+
+    Returns 0 after printing how many documents passed. Any ``ArtifactSafetyError`` is
+    reported through ``ArgumentParser.error``, which prints the message and exits the
+    process instead of returning.
+    """
+    cli = argparse.ArgumentParser(description="Check CollectiveX result artifacts for private data")
+    cli.add_argument("paths", nargs="+")
+    options = cli.parse_args()
+    try:
+        checked = load_documents(options.paths)
+        assert_publication_safe(checked)
+    except ArtifactSafetyError as exc:
+        cli.error(str(exc))
+    print(f"artifact safety: {len(checked)} public document(s) passed")
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/capability.py b/experimental/CollectiveX/capability.py
new file mode 100644
index 0000000000..6a069b09b9
--- /dev/null
+++ b/experimental/CollectiveX/capability.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Public runner and backend capability registry for CollectiveX v1."""
+
+from __future__ import annotations
+
+import re
+
+
+DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+DEEPEP_V2_SKU_CAPABILITIES = {
+ "h100-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"},
+ "h200-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"},
+ "b200-dgxc": {"schedulable": True, "basis": "upstream-sm100-result"},
+ "gb200": {"schedulable": True, "basis": "upstream-sm100-result"},
+ "b300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"},
+ "gb300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"},
+ "mi325x": {"schedulable": False, "basis": "nvidia-only"},
+ "mi355x": {"schedulable": False, "basis": "nvidia-only"},
+}
+PLATFORMS = {
+ "h100-dgxc": dict(vendor="nvidia", arch="sm90", machine="amd64", product="h100", transport="nvlink", topology_class="h100-nvlink-island",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"),
+ "h200-dgxc": dict(vendor="nvidia", arch="sm90", machine="amd64", product="h200", transport="nvlink", topology_class="h200-nvlink-island",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"),
+ "b200-dgxc": dict(vendor="nvidia", arch="sm100", machine="amd64", product="b200", transport="nvlink", topology_class="b200-nvlink-island",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"),
+ "b300": dict(vendor="nvidia", arch="sm103", machine="amd64", product="b300", transport="nvlink", topology_class="b300-nvlink-island",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="single-slurm"),
+ "gb200": dict(vendor="nvidia", arch="sm100", machine="arm64", product="gb200", transport="mnnvl", topology_class="gb200-nvl72-mnnvl",
+ gpus_per_node=4, scale_up_domain=72, ep_degrees=(4, 8), launcher="gb-nv"),
+ "gb300": dict(vendor="nvidia", arch="sm103", machine="arm64", product="gb300", transport="mnnvl", topology_class="gb300-nvl72-mnnvl",
+ gpus_per_node=4, scale_up_domain=72, ep_degrees=(4, 8), launcher="gb-nv"),
+ "mi325x": dict(vendor="amd", arch="gfx942", machine="amd64", product="mi325x", transport="xgmi", topology_class="mi325x-xgmi",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="mi-amds"),
+ "mi355x": dict(vendor="amd", arch="gfx950", machine="amd64", product="mi355x", transport="xgmi", topology_class="mi355x-xgmi",
+ gpus_per_node=8, scale_up_domain=8, ep_degrees=(8,), launcher="mi-amds"),
+}
+
+BACKENDS = {
+ "deepep": {"vendors": {"nvidia"}},
+ "deepep-v2": {
+ "vendors": {"nvidia"},
+ "implementation": "deep_ep.ElasticBuffer",
+ "source": "deepseek-ai/DeepEP#605+#630",
+ "commit": DEEPEP_V2_COMMIT,
+ "communication_backend": "nccl-device-lsa",
+ "torch": "2.10.0+cu130",
+ "nccl": "2.30.4",
+ "sku_capabilities": DEEPEP_V2_SKU_CAPABILITIES,
+ },
+ "uccl": {
+ "vendors": {"nvidia"},
+ "machines": {"amd64"},
+ "excluded_skus": {"b200-dgxc", "b300"},
+ },
+ "deepep-hybrid": {"vendors": {"nvidia"}},
+ "mori": {"vendors": {"amd"}},
+ "nccl-ep": {"vendors": {"nvidia", "amd"}},
+}
+SWEEP_BACKENDS = tuple(BACKENDS)
+
+
+def runtime_identity_issues(
+    sku: str, *, vendor: str, arch: str, machine: str, device_name: str,
+    device_count: int, world_size: int,
+) -> list[str]:
+    """List every mismatch between observed runtime facts and the registered platform.
+
+    ``sku`` selects the entry in ``PLATFORMS``. Vendor, arch and machine are compared
+    for equality; the registered product must appear among the letter+digit tokens of
+    the lower-cased ``device_name``; ``device_count`` must equal the registered GPUs per
+    node; ``world_size`` must be one of the registered EP degrees.
+
+    Returns an empty list when everything agrees, or a single-item list when ``sku`` is
+    not registered at all.
+    """
+    expected = PLATFORMS.get(sku)
+    if expected is None:
+        return [f"unknown runner identity {sku!r}"]
+
+    observed_facts = {"vendor": vendor, "arch": arch, "machine": machine}
+    problems = [
+        f"{field}={observed!r}, expected {expected[field]!r}"
+        for field, observed in observed_facts.items()
+        if observed != expected[field]
+    ]
+
+    product_tokens = re.findall(r"[a-z]+\d+[a-z]*", device_name.lower())
+    if expected["product"] not in product_tokens:
+        problems.append(
+            f"device product {device_name!r} does not identify {expected['product']}"
+        )
+    if device_count != expected["gpus_per_node"]:
+        problems.append(
+            f"visible GPUs={device_count}, expected {expected['gpus_per_node']} per node"
+        )
+    if world_size not in expected["ep_degrees"]:
+        problems.append(f"EP{world_size} is not registered for {sku}")
+    return problems
+
+
+def resolve(sku: str, backend: str, *, nodes: int = 1, routing: str = "uniform",
+            eplb: bool = False) -> tuple[bool, str]:
+    """Decide whether one fixed-v1 case can run on a public GHA runner label.
+
+    Checks run in a fixed order and the first failure wins: known SKU, known backend,
+    registered EP degree for ``nodes``, allowed routing/EPLB combination, vendor
+    support, per-SKU schedulability, machine support, and explicit SKU exclusions.
+
+    Returns ``(True, "ok")`` when every check passes, otherwise ``(False, reason)``.
+    """
+    platform = PLATFORMS.get(sku)
+    implementation = BACKENDS.get(backend)
+    if platform is None:
+        return False, f"unknown GHA runner label {sku!r}"
+    if implementation is None:
+        return False, f"unknown backend {backend!r}"
+
+    if nodes < 1 or nodes * platform["gpus_per_node"] not in platform["ep_degrees"]:
+        return False, f"{sku} does not register a {nodes}-node EP degree"
+
+    routing_is_known = routing in {"uniform", "zipf"}
+    if not routing_is_known or (eplb and routing != "zipf"):
+        return False, "v1 routing is uniform or zipf, with EPLB only on zipf"
+
+    vendor = platform["vendor"]
+    if vendor not in implementation["vendors"]:
+        return False, f"{backend} does not support {vendor}"
+
+    per_sku = implementation.get("sku_capabilities", {}).get(sku)
+    if per_sku is not None and not per_sku["schedulable"]:
+        return False, f"{backend} is unsupported on {sku}: {per_sku['basis']}"
+
+    # A backend without a "machines" entry accepts every machine type.
+    machine = platform["machine"]
+    if "machines" in implementation and machine not in implementation["machines"]:
+        return False, f"{backend} does not support {machine}"
+
+    if sku in implementation.get("excluded_skus", ()):
+        return False, f"{backend} is unavailable on {sku}"
+    return True, "ok"
diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml
new file mode 100644
index 0000000000..0d72ceaae4
--- /dev/null
+++ b/experimental/CollectiveX/configs/suites.yaml
@@ -0,0 +1,21 @@
+# CollectiveX v1 comparison suites.
+schema_version: 1
+
+suites:
+ ep-core-v1:
+ workloads: [deepseek-v3-v1]
+ platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+ routings: [uniform]
+ phases: [decode, prefill]
+ token_points_prefill: [256, 512]
+ required_publication: official
+
+ ep-routing-v1:
+ workloads: [deepseek-v3-v1]
+ platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+ routings: [zipf]
+ eplb: [false, true]
+ phases: [decode, prefill]
+ token_points_decode: [128]
+ token_points_prefill: [512]
+ required_publication: comparable-experimental
diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml
new file mode 100644
index 0000000000..b5b68334c4
--- /dev/null
+++ b/experimental/CollectiveX/configs/workloads.yaml
@@ -0,0 +1,9 @@
+# CollectiveX v1 canonical workload and phase metadata.
+schema_version: 1
+
+model_derived:
+ deepseek-v3-v1:
+ hidden: 7168
+ topk: 8
+ routed_experts: 256
+ verified_against: "deepseek-ai/DeepSeek-V3@e815299b0bcbac849fa540c768ef21845365c9eb/config.json"
diff --git a/experimental/CollectiveX/contracts.py b/experimental/CollectiveX/contracts.py
new file mode 100644
index 0000000000..6089b8d119
--- /dev/null
+++ b/experimental/CollectiveX/contracts.py
@@ -0,0 +1,2641 @@
+#!/usr/bin/env python3
+"""Strict native attempt contracts and metric validation for CollectiveX v1."""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+from functools import lru_cache
+import hashlib
+import json
+import math
+import os
+from pathlib import Path, PurePosixPath
+import re
+import sys
+from typing import Any, Iterable
+
+import artifact_safety
+import capability
+import identity
+
+TESTS = Path(__file__).resolve().parent / "tests"
+sys.path.insert(0, str(TESTS))
+import eplb as eplb_contract # noqa: E402
+import workload as workload_contract # noqa: E402
+
+RAW_FORMAT = "collectivex.ep.v1"
+SAMPLES_FORMAT = "collectivex.samples.v1"
+TERMINAL_FORMAT = "collectivex.terminal.v1"
+TERMINAL_CASE_FIELDS = {
+ "backend", "canonical", "eplb", "ep", "experts", "gpus_per_node", "hidden",
+ "ladder", "nodes", "phase", "required_publication", "routing", "samples_per_point",
+ "scale_up_domain", "suite", "timing", "topk", "warmup_semantics", "workload",
+}
+ALLOCATION_FACTOR_FIELDS = {
+ "artifact", "execution_id", "job", "repo", "run_attempt", "run_id", "runner",
+ "source_sha",
+}
+GIT_RUN_FIELDS = {"artifact", "job", "ref", "repo", "run_attempt", "run_id", "source_sha"}
+PRE_EXECUTION_FAILURE_REASONS = {
+ "setup": "launcher-setup-failed",
+ "repository-stage": "repository-staging-failed",
+ "registry-verification": "container-registry-verification-failed",
+ "scheduler-allocation": "scheduler-allocation-failed",
+ "container-import": "container-image-preparation-failed",
+ "container-hash": "container-image-identity-failed",
+ "container-launch": "container-runtime-launch-failed",
+ "backend-setup": "backend-setup-failed",
+ "artifact-collection": "artifact-collection-failed",
+}
+RUNTIME_FAILURE_REASONS = {
+ **PRE_EXECUTION_FAILURE_REASONS,
+ "runtime-identity": "runtime-identity-mismatch",
+ "timeout": "execution-timeout",
+ "deadlock": "execution-deadlock",
+ "execution": "distributed-command-failed",
+}
+POST_EMIT_FAILURE_REASONS = {
+ mode: "post-emit-distributed-command-failed"
+ for mode in ("runtime-identity", "timeout", "deadlock", "execution")
+}
+CAPABILITY_FAILURE_REASONS = frozenset({
+ "backend-platform-unsupported",
+ "backend-token-capacity",
+})
+RETURN_CODE_FAILURE_MODES = {
+ 5: "runtime-identity",
+ 124: "timeout",
+}
+PERCENTILES = ("p50", "p90", "p95", "p99")
+V1_CONDITIONING_LADDERS = {
+ "decode": (1, 2, 4, 8, 16, 32, 64, 128),
+ "prefill": (1, 2, 4, 8, 16, 32, 64, 128, 256, 512),
+}
+V1_CONDITIONING_ROUNDS_PER_SHAPE = 8
+DEEPEP_V2_JIT_KERNELS = frozenset({
+ "barrier", "combine", "combine_reduce_epilogue", "dispatch",
+ "dispatch_copy_epilogue",
+})
+DEEPEP_V2_V1_PROVENANCE = {
+ "deepep_version": "2.0.0",
+ "deepep_distribution_version": "2.0.0+fa8a9b1",
+ "deepep_commit": "fa8a9b16898204afd347c663b89e65ef87dc6ce6",
+ "deepep_tree": "29809e75c5874e6609dac4804e7b651d5226959f",
+ "deepep_pr": 605,
+ "deepep_fix_pr": 630,
+ "fmt_commit": "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa",
+ "torch_version": "2.10.0+cu130",
+ "nccl_package_version": "2.30.4",
+ "nccl_version": "2.30.4",
+ "nvshmem_package_version": "3.3.9",
+ "allow_hybrid_mode": False,
+ "gin_enabled": False,
+ "communication_backend": "nccl-device-lsa",
+}
+UCCL_DEPENDENCY_VERSIONS = {
+ "intervaltree": "3.1.0",
+ "nvidia-cuda-runtime-cu12": "12.9.79",
+ "sortedcontainers": "2.4.0",
+}
+SCHEMA_DIR = Path(__file__).resolve().parent / "schemas"
+_SCHEMA_CACHE: dict[str, dict[str, Any]] = {}
+REQUIRED_BACKEND_PROVENANCE = {
+ "deepep": (
+ "deepep_version", "deepep_commit", "backend_lineage", "allow_mnnvl",
+ "mnnvl_comm",
+ ),
+ "deepep-v2": (
+ *DEEPEP_V2_V1_PROVENANCE, "api_signature_sha256", "loaded_libraries",
+ "jit_cubins", "jit_random_seed", "deterministic", "num_experts",
+ "tuning_num_experts",
+ ),
+ "deepep-hybrid": (
+ "deepep_commit", "deepep_tree", "branch", "backend_lineage",
+ "loaded_libraries", "realized_config", "jit_kernel_keys", "jit_shared_objects",
+ ),
+ "uccl": (
+ "uccl_version", "uccl_commit", "uccl_wrapper_commit", "backend_lineage",
+ "loaded_libraries", "uccl_dependency_versions",
+ ),
+ "mori": ("mori_commit",),
+ "nccl-ep": ("nccl_version", "collective_library", "backend_lineage"),
+}
+PROVENANCE_KEYS = {
+ "allocated_qps", "allow_hybrid_mode", "allow_mnnvl", "allow_multiple_reduction",
+ "api", "api_signature_sha256", "backend", "backend_lineage", "block_num",
+ "block_num_floored", "block_num_target", "branch", "collective_library",
+ "combine_dtype", "combine_warps", "communication_backend", "cuda_version",
+ "deepep_commit", "deepep_distribution_version", "deepep_fix_pr", "deepep_pr", "deepep_tree",
+ "deepep_version", "deterministic", "device_cus",
+ "device_sms", "dispatch_dtype", "dispatch_warps", "enable_sdma", "fmt_commit",
+ "gin_enabled",
+ "gpus_per_node", "heap_size",
+ "impl", "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_random_seed",
+ "jit_shared_objects", "kernel_type",
+ "loaded_libraries", "local_experts",
+ "logical_scaleout_ranks",
+ "logical_scaleup_ranks", "mapping_variant", "max_num_inp_token_per_rank",
+ "max_num_tokens", "max_total_recv_tokens", "mnnvl_comm", "mode", "mori_commit",
+ "nccl_communicator", "nccl_package_version", "nccl_version", "num_experts",
+ "nvshmem_package_version",
+ "num_max_tokens_per_rank", "num_nvl_bytes", "num_qps", "num_qps_per_rank",
+ "num_rdma_bytes", "num_sms", "path",
+ "physical_nvlink_ranks", "physical_rdma_ranks", "prefer_overlap_with_compute",
+ "realized_config", "reference_semantics", "requested_num_sms", "resource_mode", "routing_factor",
+ "routing_metadata", "sm_fraction", "top_k",
+ "torch_git_version", "torch_version", "transport", "trtllm", "tuned_source",
+ "tuning_num_experts",
+ "uccl_commit", "uccl_dependency_versions", "uccl_version", "uccl_wrapper_commit",
+ "workspace",
+}
+
+
+class ContractError(ValueError):
+    """Raised when a document does not conform to the native v1 contract."""
+
+
+def resolve_deepep_mnnvl(
+    *, requested: bool, signature_parameters: Iterable[str], deepep_commit: str | None
+) -> tuple[dict[str, bool], str]:
+    """Pick the DeepEP MNNVL keyword arguments for one explicit request.
+
+    Returns ``({}, "not-requested")`` when MNNVL was not asked for, and
+    ``({"allow_mnnvl": True}, "explicit-allow-mnnvl")`` when it was asked for and
+    ``allow_mnnvl`` is among ``signature_parameters``.
+
+    Raises:
+        ContractError: MNNVL was requested but the signature has no ``allow_mnnvl``
+            parameter; there is no fallback to any other spelling.
+    """
+    if requested:
+        if "allow_mnnvl" not in set(signature_parameters):
+            raise ContractError(
+                f"requested DeepEP MNNVL is unsupported by commit {deepep_commit or 'unknown'}"
+            )
+        return {"allow_mnnvl": True}, "explicit-allow-mnnvl"
+    return {}, "not-requested"
+
+
+def collective_kernel_generation(collective_library: Any) -> str:
+    """Return the public NCCL/RCCL implementation lineage.
+
+    Args:
+        collective_library: value to validate; only the strings ``"nccl"`` and
+            ``"rccl"`` are accepted.
+
+    Returns:
+        ``collective_library`` unchanged.
+
+    Raises:
+        ContractError: the value is anything other than ``"nccl"`` or ``"rccl"``.
+    """
+    # Check the type before the set lookup: membership of an unhashable value (for
+    # example a list decoded from JSON) raises TypeError rather than the contract error.
+    if not isinstance(collective_library, str) or collective_library not in {"nccl", "rccl"}:
+        raise ContractError("reference collective library must be nccl or rccl")
+    return collective_library
+
+
+def project_resource_profile(provenance: dict[str, Any]) -> dict[str, Any]:
+    """Translate backend provenance into the shared cross-backend resource record.
+
+    The communication unit is ``"sm"`` when ``num_sms`` is present, ``"cu_block"`` when
+    ``block_num`` is present and ``kernel_type`` is not ``"AsyncLL"``, and otherwise
+    unset. The achieved fraction is configured units divided by device units (SMs,
+    falling back to CUs), rounded to four decimals, or ``None`` when either is missing
+    or zero. A ``tuned_source`` containing ``fixed-kernel`` marks the profile as a fixed
+    kernel; otherwise one containing ``default`` is ``backend-default`` and anything
+    else is ``best-known``.
+    """
+    tuned_source = str(provenance.get("tuned_source", ""))
+    is_fixed_kernel = "fixed-kernel" in tuned_source
+    total_units = provenance.get("device_sms") or provenance.get("device_cus")
+
+    unit_kind, unit_count = None, None
+    if provenance.get("num_sms") is not None:
+        unit_kind, unit_count = "sm", provenance["num_sms"]
+    elif (
+        provenance.get("block_num") is not None
+        and provenance.get("kernel_type") != "AsyncLL"
+    ):
+        unit_kind, unit_count = "cu_block", provenance["block_num"]
+
+    utilisation = None
+    if unit_count and total_units:
+        utilisation = unit_count / total_units
+
+    if is_fixed_kernel:
+        conformance = "not-applicable"
+    elif "default" in tuned_source:
+        conformance = "backend-default"
+    else:
+        conformance = "best-known"
+
+    # First non-falsy buffer size wins, in this order.
+    persistent_bytes = (
+        provenance.get("num_nvl_bytes")
+        or provenance.get("num_rdma_bytes")
+        or provenance.get("heap_size")
+    )
+
+    return {
+        "achieved_fraction": round(utilisation, 4) if utilisation else None,
+        "comm_units_kind": unit_kind,
+        "configured_units": unit_count,
+        "conformance_class": conformance,
+        "device_units": total_units,
+        "fixed_kernel": is_fixed_kernel,
+        "nonconforming": False,
+        "pareto_eligible": False,
+        "persistent_bytes": persistent_bytes,
+        "qps_per_rank": provenance.get("num_qps_per_rank"),
+        "requested_fraction": None,
+        "resource_class": "fixed-kernel" if is_fixed_kernel else "backend-tuned",
+        "target_achieved_within_tol": None,
+        "tolerance": 0.10,
+        "tuned_source": provenance.get("tuned_source"),
+        "warps_combine": provenance.get("combine_warps"),
+        "warps_dispatch": provenance.get("dispatch_warps"),
+    }
+
+
+def backend_version(provenance: dict[str, Any]) -> str | None:
+    """Pick the public backend version string out of implementation provenance.
+
+    Fields are consulted in a fixed preference order (version fields before commit
+    fields). The first value that is not ``None`` and not blank once stringified is
+    returned, truncated to 160 characters. Returns ``None`` when no field qualifies.
+    """
+    preference = (
+        "deepep_version", "uccl_version", "nccl_version",
+        "mori_commit", "deepep_commit",
+    )
+    candidates = (provenance.get(field) for field in preference)
+    return next(
+        (
+            str(candidate)[:160]
+            for candidate in candidates
+            if candidate is not None and str(candidate).strip()
+        ),
+        None,
+    )
+
+
+def public_series_config(
+    *, kernel_generation: Any, provenance: dict[str, Any],
+    resource_profile: dict[str, Any], resource_mode: Any, device_product: Any,
+) -> dict[str, Any]:
+    """Build the public configuration record with ``backend``, ``resource`` and ``system`` sections.
+
+    A ``kernel_generation`` of ``"n-a"`` is published as ``None``. The resource profile
+    is referenced by ``"profile-"`` plus the first 16 characters of its
+    ``_sha256_json`` digest, and the system label is the stringified
+    ``device_product`` truncated to 160 characters.
+    """
+    profile_label = "profile-" + _sha256_json(resource_profile)[:16]
+    if kernel_generation == "n-a":
+        published_generation = None
+    else:
+        published_generation = kernel_generation
+
+    backend_section = {
+        "generation": published_generation,
+        "version": backend_version(provenance),
+    }
+    resource_section = {
+        "mode": resource_mode,
+        "profile": profile_label,
+        "comm_units_kind": resource_profile.get("comm_units_kind"),
+        "configured_units": resource_profile.get("configured_units"),
+    }
+    system_section = {"label": str(device_product)[:160]}
+    return {
+        "backend": backend_section,
+        "resource": resource_section,
+        "system": system_section,
+    }
+
+
+def public_series_config_sha256(config: dict[str, Any]) -> str:
+    """Return the ``_sha256_json`` digest of a public configuration projection.
+
+    The digest is what ties the published configuration into series identity.
+    """
+    digest = _sha256_json(config)
+    return digest
+
+
+# Library roles whose entries are replaced by a name/role/source-tree record in
+# series_provenance.
+SOURCE_BUILT_LIBRARY_ROLES = frozenset({
+    "deepep-extension", "deepep-hybrid-extension",
+})
+
+
+def series_provenance(provenance: dict[str, Any]) -> dict[str, Any]:
+    """Reduce provenance to the fields that identify a build across runs.
+
+    ``jit_cache_key``, ``jit_shared_objects``, ``path`` and ``sm_fraction`` are dropped.
+    When ``loaded_libraries`` is a list, dict entries with a source-built role keep only
+    ``name`` and ``role`` and gain ``source_tree`` (the provenance ``deepep_tree``);
+    other entries pass through. When ``jit_cubins`` is a list, dict entries keep only
+    ``cache_key``, ``sass_sha256`` and ``source_sha256``. The input is not modified.
+    """
+    dropped = {"jit_cache_key", "jit_shared_objects", "path", "sm_fraction"}
+    projected: dict[str, Any] = {}
+    for key, value in provenance.items():
+        if key not in dropped:
+            projected[key] = value
+
+    libraries = provenance.get("loaded_libraries")
+    if isinstance(libraries, list):
+        stable_libraries = []
+        for entry in libraries:
+            if isinstance(entry, dict) and entry.get("role") in SOURCE_BUILT_LIBRARY_ROLES:
+                entry = {
+                    "name": entry.get("name"),
+                    "role": entry.get("role"),
+                    "source_tree": provenance.get("deepep_tree"),
+                }
+            stable_libraries.append(entry)
+        projected["loaded_libraries"] = stable_libraries
+
+    cubins = provenance.get("jit_cubins")
+    if isinstance(cubins, list):
+        kept_fields = ("cache_key", "sass_sha256", "source_sha256")
+        stable_cubins = []
+        for entry in cubins:
+            if isinstance(entry, dict):
+                entry = {field: entry.get(field) for field in kept_fields}
+            stable_cubins.append(entry)
+        projected["jit_cubins"] = stable_cubins
+    return projected
+
+
+def routing_implementation_control_sha256(implementation: dict[str, Any]) -> str:
+    """Digest the parts of an implementation that must match across routing cohorts.
+
+    The digest covers ``kernel_generation``, ``name``, ``resource_profile`` and the
+    ``series_provenance`` projection of the provenance with the treatment-dependent
+    fields removed.
+
+    Raises:
+        ContractError: ``implementation["provenance"]`` is missing or not a dict.
+    """
+    provenance = implementation.get("provenance")
+    if not isinstance(provenance, dict):
+        raise ContractError("implementation provenance is unavailable")
+
+    treatment_fields = (
+        "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_shared_objects",
+        "local_experts", "num_experts", "path", "realized_config", "sm_fraction",
+    )
+    static_provenance = dict(series_provenance(provenance))
+    for field in treatment_fields:
+        static_provenance.pop(field, None)
+
+    return _sha256_json({
+        "kernel_generation": implementation.get("kernel_generation"),
+        "name": implementation.get("name"),
+        "provenance": static_provenance,
+        "resource_profile": implementation.get("resource_profile"),
+    })
+
+
+def _resolved_provenance_value(field: str, value: Any) -> bool:
+    """Tell whether a provenance value is a real, resolved fact rather than a placeholder.
+
+    ``None``, empty containers, blank text, the usual "unknown" spellings and anything
+    mentioning ``capture-failed`` are unresolved. For fields whose name ends in
+    ``_commit``, bare branch or package names (and values ending in such a name) are
+    unresolved as well.
+    """
+    if value is None:
+        return False
+    if isinstance(value, (dict, list, tuple, set)) and not value:
+        return False
+
+    text = str(value).strip().lower()
+    placeholders = {"unknown", "none", "null", "n/a", "?", "capture-failed"}
+    if not text or text in placeholders or "capture-failed" in text:
+        return False
+
+    if not field.endswith("_commit"):
+        return True
+    looks_like_branch = (
+        text in {"main", "hybrid-ep", "uccl", "pkg-uccl"}
+        or text.endswith(("-unknown", "-none", "-main", "-hybrid-ep"))
+    )
+    return not looks_like_branch
+
+
+def _content_evidence_is_valid(value: Any, required_roles: set[str]) -> bool:
+    """Validate a list of ``{"name", "role", "sha256"}`` content-evidence records.
+
+    The list must be non-empty. Every record must have exactly those three keys, a
+    name of at most 160 and a role of at most 128 safe characters starting with an
+    alphanumeric, a 64-character lowercase hex digest, and a ``(role, name)`` pair not
+    seen earlier in the list. Finally every role in ``required_roles`` must occur.
+    """
+    def is_text_matching(candidate: Any, pattern: str) -> bool:
+        return isinstance(candidate, str) and re.fullmatch(pattern, candidate) is not None
+
+    if not isinstance(value, list) or not value:
+        return False
+
+    seen_pairs: set[tuple[str, str]] = set()
+    seen_roles: set[str] = set()
+    for record in value:
+        if not isinstance(record, dict) or set(record) != {"name", "role", "sha256"}:
+            return False
+        if not is_text_matching(record["name"], r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}"):
+            return False
+        if not is_text_matching(record["role"], r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}"):
+            return False
+        if not is_text_matching(record["sha256"], r"[0-9a-f]{64}"):
+            return False
+        pair = (record["role"], record["name"])
+        if pair in seen_pairs:
+            return False
+        seen_pairs.add(pair)
+        seen_roles.add(record["role"])
+    return required_roles <= seen_roles
+
+
+def _deepep_v2_jit_cubins_are_valid(value: Any) -> bool:
+    """Validate the DeepEP V2 JIT CUBIN evidence list.
+
+    The list must hold one record per name in ``DEEPEP_V2_JIT_KERNELS``. Each record
+    has exactly ``cache_key``, ``cubin_sha256``, ``sass_sha256`` and ``source_sha256``;
+    the cache key reads ``kernel.<name>.<32 hex>`` and each digest is 64 lowercase hex
+    characters. Cache keys must be unique and sorted, and the kernel names taken from
+    them must equal ``DEEPEP_V2_JIT_KERNELS``.
+    """
+    if not isinstance(value, list) or len(value) != len(DEEPEP_V2_JIT_KERNELS):
+        return False
+
+    expected_fields = {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"}
+    digest_fields = ("cubin_sha256", "sass_sha256", "source_sha256")
+    seen_keys = []
+    seen_kernels = set()
+    for record in value:
+        if not isinstance(record, dict) or set(record) != expected_fields:
+            return False
+        cache_key = record["cache_key"]
+        if not isinstance(cache_key, str):
+            return False
+        parsed = re.fullmatch(r"kernel\.([A-Za-z0-9_+-]+)\.[0-9a-f]{32}", cache_key)
+        if parsed is None:
+            return False
+        for field in digest_fields:
+            digest = record[field]
+            if not isinstance(digest, str) or not re.fullmatch(r"[0-9a-f]{64}", digest):
+                return False
+        seen_keys.append(cache_key)
+        seen_kernels.add(parsed.group(1))
+
+    keys_sorted_and_unique = seen_keys == sorted(set(seen_keys))
+    return keys_sorted_and_unique and seen_kernels == DEEPEP_V2_JIT_KERNELS
+
+
+# Exact key set a DeepEP-hybrid ``realized_config`` provenance object must carry;
+# _hybrid_realized_config_is_valid rejects any object with a missing or extra key.
+HYBRID_REALIZED_CONFIG_FIELDS = {
+ "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+ "num_of_ranks_per_node", "num_of_nodes", "pad_multiple",
+ "num_of_tokens_per_chunk_preprocessing_api",
+ "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api",
+ "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type",
+ "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api",
+ "num_of_in_flight_s2g_dispatch_api",
+ "num_of_in_flight_s2g_permute_block_dispatch_api",
+ "num_of_additional_in_flight_s2g_dispatch_api",
+ "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api",
+ "forward_dispatch_api", "device_side_sync_dispatch_api",
+ "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api",
+ "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api",
+ "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api",
+ "backward_combine_api", "device_side_sync_combine_api",
+}
+# Fields above whose values must be real booleans; apart from ``token_data_type``,
+# every other field is validated as a non-negative integer.
+HYBRID_REALIZED_BOOL_FIELDS = {
+ "forward_dispatch_api", "device_side_sync_dispatch_api", "backward_combine_api",
+ "device_side_sync_combine_api",
+}
+
+
+def _hybrid_realized_config_is_valid(value: Any) -> bool:
+ if not isinstance(value, dict) or set(value) != HYBRID_REALIZED_CONFIG_FIELDS:
+ return False
+ for field, field_value in value.items():
+ if field in HYBRID_REALIZED_BOOL_FIELDS:
+ if type(field_value) is not bool:
+ return False
+ elif field == "token_data_type":
+ if field_value not in {"UINT8", "UINT16"}:
+ return False
+ elif type(field_value) is not int or field_value < 0:
+ return False
+ return all(value[field] > 0 for field in (
+ "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+ "num_of_ranks_per_node", "num_of_nodes",
+ ))
+
+
+def _hybrid_kernel_keys_are_valid(value: Any) -> bool:
+ return (
+ isinstance(value, list)
+ and len(value) == 3
+ and len(set(value)) == 3
+ and value == sorted(value)
+ and all(
+ isinstance(key, str)
+ and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}", key)
+ for key in value
+ )
+ )
+
+
+def _hybrid_jit_evidence_is_valid(value: Any, kernel_keys: Any) -> bool:
+ if not _hybrid_kernel_keys_are_valid(kernel_keys) or not isinstance(value, list):
+ return False
+ if len(value) != len(kernel_keys):
+ return False
+ rank_sets = []
+ for expected_key, item in zip(kernel_keys, value):
+ if not isinstance(item, dict) or set(item) != {"kernel_key", "rank_artifacts"}:
+ return False
+ rank_artifacts = item["rank_artifacts"]
+ if item["kernel_key"] != expected_key or not isinstance(rank_artifacts, list):
+ return False
+ ranks = []
+ for artifact in rank_artifacts:
+ if not isinstance(artifact, dict) or set(artifact) != {"bytes", "rank", "sha256"}:
+ return False
+ rank, digest, size = artifact["rank"], artifact["sha256"], artifact["bytes"]
+ if (
+ type(rank) is not int
+ or rank < 0
+ or not isinstance(digest, str)
+ or not re.fullmatch(r"[0-9a-f]{64}", digest)
+ or type(size) is not int
+ or size <= 0
+ ):
+ return False
+ ranks.append(rank)
+ if not ranks or ranks != list(range(len(ranks))):
+ return False
+ rank_sets.append(ranks)
+ return all(ranks == rank_sets[0] for ranks in rank_sets)
+
+
+def backend_provenance_issues(backend: str, provenance: dict[str, Any]) -> list[str]:
+ unknown = [
+ field for field, value in provenance.items()
+ if isinstance(value, str) and value.strip().lower() == "unknown"
+ ]
+ unresolved = [
+ field for field in REQUIRED_BACKEND_PROVENANCE.get(backend, ())
+ if not _resolved_provenance_value(field, provenance.get(field))
+ ]
+ if backend == "deepep":
+ mode = provenance.get("mnnvl_comm")
+ allow = provenance.get("allow_mnnvl")
+ valid_modes = {
+ "not-requested": False,
+ "explicit-allow-mnnvl": True,
+ }
+ if type(allow) is not bool or valid_modes.get(mode) is not allow:
+ unresolved.append("mnnvl_comm")
+ if provenance.get("backend_lineage") != "deepep-v1":
+ unresolved.append("backend_lineage")
+ if backend == "deepep-v2":
+ for field in ("num_experts", "tuning_num_experts"):
+ if type(provenance.get(field)) is not int or provenance[field] <= 0:
+ unresolved.append(field)
+ if not _deepep_v2_jit_cubins_are_valid(provenance.get("jit_cubins")):
+ unresolved.append("jit_cubins")
+ if provenance.get("jit_random_seed") != "collectivex-deepep-v2-fa8a9b1":
+ unresolved.append("jit_random_seed")
+ unresolved.extend(
+ field for field, expected in DEEPEP_V2_V1_PROVENANCE.items()
+ if provenance.get(field) != expected
+ )
+ content_roles = {
+ "deepep-v2": {"deepep-extension", "nccl", "nvshmem"},
+ "deepep-hybrid": {"deepep-extension", "deepep-hybrid-extension"},
+ "uccl": {
+ "uccl-distribution", "uccl-wrapper", "intervaltree-distribution",
+ "sortedcontainers-distribution", "cuda-runtime",
+ },
+ }.get(backend)
+ if content_roles is not None and not _content_evidence_is_valid(
+ provenance.get("loaded_libraries"), content_roles
+ ):
+ unresolved.append("loaded_libraries")
+ if backend in {"deepep-v2", "deepep-hybrid"} and not re.fullmatch(
+ r"[0-9a-f]{40}", str(provenance.get("deepep_tree", ""))
+ ):
+ unresolved.append("deepep_tree")
+ if backend == "deepep-hybrid" and provenance.get("backend_lineage") != "deepep-hybrid":
+ unresolved.append("backend_lineage")
+ if backend == "deepep-hybrid":
+ if not _hybrid_realized_config_is_valid(provenance.get("realized_config")):
+ unresolved.append("realized_config")
+ if not _hybrid_kernel_keys_are_valid(provenance.get("jit_kernel_keys")):
+ unresolved.append("jit_kernel_keys")
+ if not _hybrid_jit_evidence_is_valid(
+ provenance.get("jit_shared_objects"), provenance.get("jit_kernel_keys")
+ ):
+ unresolved.append("jit_shared_objects")
+ if backend == "uccl" and provenance.get("backend_lineage") != "uccl":
+ unresolved.append("backend_lineage")
+ if backend == "uccl" and provenance.get("uccl_dependency_versions") != (
+ UCCL_DEPENDENCY_VERSIONS
+ ):
+ unresolved.append("uccl_dependency_versions")
+ if backend == "nccl-ep":
+ collective = provenance.get("collective_library")
+ if collective not in {"nccl", "rccl"}:
+ unresolved.append("collective_library")
+ if provenance.get("backend_lineage") != collective:
+ unresolved.append("backend_lineage")
+ return sorted(set(unknown + unresolved))
+
+
+def provenance_complete(
+ provenance: dict[str, Any], backend: str, git_run: dict[str, Any] | None,
+ *, image_digest: Any, image_verified: Any, squash_sha256: Any,
+) -> bool:
+ image = str(image_digest or "")
+ squash = str(squash_sha256 or "")
+ return (
+ not backend_provenance_issues(backend, provenance)
+ and image_verified is True
+ and bool(re.fullmatch(r"sha256:[0-9a-f]{64}", image))
+ and bool(re.fullmatch(r"[0-9a-f]{64}", squash))
+ and isinstance(git_run, dict)
+ and all(git_run.get(field) for field in GIT_RUN_FIELDS)
+ )
+
+
+def strict_load(path: str | os.PathLike[str]) -> Any:
+ """Load JSON while rejecting duplicate keys and non-finite constants."""
+ def pairs(items):
+ result = {}
+ for key, value in items:
+ if key in result:
+ raise ContractError(f"duplicate JSON key {key!r}")
+ result[key] = value
+ return result
+
+ def constant(value):
+ raise ContractError(f"non-finite JSON number {value}")
+
+ try:
+ with open(path) as handle:
+ return json.load(handle, object_pairs_hook=pairs, parse_constant=constant)
+ except (OSError, json.JSONDecodeError) as exc:
+ raise ContractError(f"invalid JSON {path}: {exc}") from exc
+
+
+def canonical_json_bytes(value: Any) -> bytes:
+ """Canonical finite JSON bytes for checksums and immutable artifacts."""
+ _finite_tree(value)
+ try:
+ return json.dumps(
+ value, allow_nan=False, ensure_ascii=False, sort_keys=True,
+ separators=(",", ":"),
+ ).encode("utf-8")
+ except (TypeError, ValueError) as exc:
+ raise ContractError(f"value is not canonical JSON: {exc}") from exc
+
+
+def content_manifest_evidence(
+ *, role: str, name: str, files: Iterable[tuple[str, str | os.PathLike[str]]]
+) -> dict[str, str]:
+ """Hash a labeled file set without exposing any host path in provenance."""
+ if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role):
+ raise ContractError("content evidence role is invalid")
+ if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name):
+ raise ContractError("content evidence name is invalid")
+ manifest: list[dict[str, Any]] = []
+ labels: set[str] = set()
+ for label, raw_path in files:
+ logical = PurePosixPath(label)
+ if (
+ not label
+ or logical.is_absolute()
+ or ".." in logical.parts
+ or label in labels
+ or any(ord(character) < 0x20 or ord(character) > 0x7E for character in label)
+ ):
+ raise ContractError("content evidence label is invalid or duplicated")
+ path = Path(raw_path)
+ if not path.is_file():
+ raise ContractError("content evidence source is not a file")
+ digest = hashlib.sha256()
+ size = 0
+ with path.open("rb") as handle:
+ for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+ digest.update(chunk)
+ size += len(chunk)
+ labels.add(label)
+ manifest.append({"bytes": size, "label": label, "sha256": digest.hexdigest()})
+ if not manifest:
+ raise ContractError("content evidence cannot be empty")
+ digest = hashlib.sha256(
+ canonical_json_bytes(sorted(manifest, key=lambda item: item["label"]))
+ ).hexdigest()
+ return {"name": name, "role": role, "sha256": digest}
+
+
+def _obj(value: Any, path: str) -> dict[str, Any]:
+ if not isinstance(value, dict):
+ raise ContractError(f"{path} must be an object")
+ return value
+
+
+def _keys(value: Any, expected: set[str], path: str) -> dict[str, Any]:
+ obj = _obj(value, path)
+ actual = set(obj)
+ if actual != expected:
+ raise ContractError(
+ f"{path} fields differ: missing={sorted(expected - actual)}, "
+ f"extra={sorted(actual - expected)}"
+ )
+ return obj
+
+
+def _text(value: Any, path: str, *, nullable: bool = False) -> str | None:
+ if nullable and value is None:
+ return None
+ if not isinstance(value, str) or not value:
+ raise ContractError(f"{path} must be a non-empty string")
+ return value
+
+
+def _integer(value: Any, path: str, *, minimum: int = 0) -> int:
+ if type(value) is not int or value < minimum:
+ raise ContractError(f"{path} must be an integer >= {minimum}")
+ return value
+
+
+def validate_conditioning_contract(value: Any, phase: str) -> dict[str, Any]:
+ """Validate the exact phase-specific v1 conditioning schedule."""
+ if phase not in V1_CONDITIONING_LADDERS:
+ raise ContractError("raw conditioning phase is invalid")
+ conditioning = _keys(
+ value, {"contract", "ladder", "roundtrips_per_shape"},
+ "raw.measurement.conditioning",
+ )
+ ladder = conditioning["ladder"]
+ if (
+ conditioning["contract"] != identity.V1_CASE_PROFILE["conditioning_contract"]
+ or type(ladder) is not list
+ or any(type(point) is not int for point in ladder)
+ or ladder != list(V1_CONDITIONING_LADDERS[phase])
+ or _integer(
+ conditioning["roundtrips_per_shape"],
+ "raw.measurement.conditioning.roundtrips_per_shape",
+ minimum=1,
+ ) != V1_CONDITIONING_ROUNDS_PER_SHAPE
+ ):
+ raise ContractError(f"raw {phase} conditioning contract differs")
+ return conditioning
+
+
+def _number(value: Any, path: str, *, minimum: float | None = None) -> float:
+ if isinstance(value, bool) or not isinstance(value, (int, float)) or not math.isfinite(value):
+ raise ContractError(f"{path} must be finite")
+ result = float(value)
+ if minimum is not None and result < minimum:
+ raise ContractError(f"{path} must be >= {minimum}")
+ return result
+
+
+def _finite_tree(value: Any, path: str = "$") -> None:
+ if isinstance(value, float) and not math.isfinite(value):
+ raise ContractError(f"{path} contains a non-finite number")
+ if isinstance(value, list):
+ for index, item in enumerate(value):
+ _finite_tree(item, f"{path}[{index}]")
+ elif isinstance(value, dict):
+ for key, item in value.items():
+ _finite_tree(item, f"{path}.{key}")
+
+
+def _typed(value: Any, kind: str, path: str) -> str:
+ if not identity.is_typed_id(value, kind):
+ raise ContractError(f"{path} is not a {kind} ID")
+ return value
+
+
+def _sha256_json(value: Any) -> str:
+ payload = json.dumps(
+ value, allow_nan=False, ensure_ascii=False, sort_keys=True, separators=(",", ":")
+ ).encode()
+ return hashlib.sha256(payload).hexdigest()
+
+
+@lru_cache(maxsize=None)
+def _expected_eplb_plan(
+ routing: str,
+ topk: int,
+ logical_experts: int,
+ physical_experts: int,
+ ep_size: int,
+ seed: int,
+ reference_tokens_per_rank: int,
+) -> dict[str, Any]:
+ indices, _ = workload_contract.canonical_routing_rows(
+ reference_tokens_per_rank * ep_size,
+ logical_experts,
+ topk,
+ routing,
+ seed,
+ )
+ load = [0] * logical_experts
+ for row in indices:
+ for expert in row:
+ load[expert] += 1
+ return eplb_contract.build_plan(load, physical_experts, ep_size)
+
+
+@lru_cache(maxsize=None)
+def _expected_canonical_trace(
+ routing: str,
+ hidden: int,
+ topk: int,
+ logical_experts: int,
+ physical_experts: int,
+ ep_size: int,
+ tokens_per_rank: int,
+ seed: int,
+ eplb_enabled: bool,
+ reference_tokens_per_rank: int,
+) -> tuple[str, dict[str, str], str, list[list[int]], list[list[float]]]:
+ member, checksums, indices, weights = workload_contract.canonical_member(
+ routing,
+ hidden,
+ topk,
+ logical_experts,
+ ep_size,
+ tokens_per_rank,
+ seed,
+ )
+ if eplb_enabled:
+ plan = _expected_eplb_plan(
+ routing,
+ topk,
+ logical_experts,
+ physical_experts,
+ ep_size,
+ seed,
+ reference_tokens_per_rank,
+ )
+ indices = eplb_contract.remap_rows(indices, plan)
+ routing_hash = workload_contract.trace_checksums(indices, weights)["trace"]
+ return member, checksums, routing_hash, indices, weights
+
+
+def _coefficient_of_variation(values: list[int]) -> float:
+ mean = sum(values) / len(values)
+ if mean == 0:
+ return 0.0
+ variance = sum((value - mean) ** 2 for value in values) / len(values)
+ return variance**0.5 / mean
+
+
+def _expected_routing_summary(
+ indices: list[list[int]],
+ weights: list[list[float]],
+ *,
+ physical_experts: int,
+ ep_size: int,
+ tokens_per_rank: int,
+ gpus_per_node: int,
+ scale_up_domain: int,
+) -> dict[str, Any]:
+ """Recompute every published routing/load statistic without torch.
+
+ ``indices`` holds one row of expert IDs per token. Row ``t`` is attributed to
+ source rank ``t // tokens_per_rank``, and expert ``e`` to destination rank
+ ``e // (physical_experts // ep_size)``. ``weights`` is used only for the trace hash.
+
+ NOTE(review): assumes ``indices`` is non-empty and routes at least one copy;
+ otherwise ``max``/``min`` and the locality fractions would fail. Confirm callers.
+ """
+ experts_per_rank = physical_experts // ep_size
+ expert_load = [0] * physical_experts
+ assignment_load = [0] * ep_size
+ payload_load = [0] * ep_size
+ fanouts: list[int] = []
+ local = same_node = same_domain = copies = 0
+ for token, row in enumerate(indices):
+ # Distinct destination ranks: a token is copied once per rank, not per expert.
+ destinations = {expert // experts_per_rank for expert in row}
+ source = token // tokens_per_rank
+ fanouts.append(len(destinations))
+ # Assignment load counts every (token, expert) pair.
+ for expert in row:
+ expert_load[expert] += 1
+ assignment_load[expert // experts_per_rank] += 1
+ # Payload load and locality count one copy per distinct destination rank.
+ for destination in destinations:
+ payload_load[destination] += 1
+ copies += 1
+ local += destination == source
+ same_node += destination // gpus_per_node == source // gpus_per_node
+ same_domain += destination // scale_up_domain == source // scale_up_domain
+ # Entry i counts the tokens whose fan-out is i + 1, for fan-outs 1..ep_size.
+ fanout_histogram = [fanouts.count(value) for value in range(1, ep_size + 1)]
+ expert_mean = sum(expert_load) / len(expert_load)
+ return {
+ "empty_expert_count": expert_load.count(0),
+ "empty_rank_count": payload_load.count(0),
+ "expert_assignment_rank_cv": _coefficient_of_variation(assignment_load),
+ "expert_assignments_per_rank": assignment_load,
+ "expert_load_cv": _coefficient_of_variation(expert_load),
+ "expert_load_max": max(expert_load),
+ "expert_load_mean": expert_mean,
+ "expert_load_min": min(expert_load),
+ "fanout_histogram": fanout_histogram,
+ "fanout_max": max(fanouts),
+ "fanout_mean": sum(fanouts) / len(fanouts),
+ "fanout_min": min(fanouts),
+ "hash": workload_contract.trace_checksums(indices, weights)["trace"],
+ "hotspot_ratio": max(expert_load) / expert_mean if expert_mean else 0.0,
+ "locality": {
+ "placement": "packed",
+ "local_rank_fraction": local / copies,
+ "same_node_fraction": same_node / copies,
+ "same_scaleup_domain_fraction": same_domain / copies,
+ "cross_node_fraction": 1 - same_node / copies,
+ "cross_domain_fraction": 1 - same_domain / copies,
+ "gpus_per_node": gpus_per_node,
+ "scale_up_domain": scale_up_domain,
+ "copies": copies,
+ },
+ "payload_copies_per_rank": payload_load,
+ "payload_rank_cv": _coefficient_of_variation(payload_load),
+ "routed_copies": copies,
+ # Source-token statistics are constants here: each of the ep_size ranks is
+ # reported as contributing exactly tokens_per_rank tokens.
+ "source_token_stats": {
+ "min": tokens_per_rank,
+ "mean": float(tokens_per_rank),
+ "max": tokens_per_rank,
+ "cv": 0.0,
+ "empty_ranks": 0,
+ "total": tokens_per_rank * ep_size,
+ "ranks": ep_size,
+ },
+ }
+
+
+def _expected_histogram(samples: list[float], bins: int = 40) -> dict[str, Any]:
+ low, high = min(samples), max(samples)
+ if high <= low:
+ return {"n": len(samples), "min": low, "max": high, "bins": bins, "counts": [len(samples)]}
+ counts = [0] * bins
+ span = high - low
+ for sample in samples:
+ index = min(bins - 1, int((sample - low) / span * bins))
+ counts[index] += 1
+ return {
+ "n": len(samples),
+ "min": round(low, 3),
+ "max": round(high, 3),
+ "bins": bins,
+ "counts": counts,
+ }
+
+
+def _expected_anomalies(
+ tokens: int, components: dict[str, Any]
+) -> list[dict[str, Any]]:
+ dispatch = components["dispatch"]["percentiles_us"]
+ combine = components["combine"]["percentiles_us"]
+ roundtrip = components["roundtrip"]["percentiles_us"]
+ isolated = components["isolated_sum"]["percentiles_us"]
+ anomalies: list[dict[str, Any]] = []
+ if isolated is not None and roundtrip["p99"] > 3.0 * isolated["p99"]:
+ anomalies.append({
+ "type": "roundtrip_gt_isolated_sum",
+ "T": tokens,
+ "roundtrip_p99": round(roundtrip["p99"], 2),
+ "isolated_sum_p99": round(isolated["p99"], 2),
+ "ratio": round(roundtrip["p99"] / isolated["p99"], 2),
+ "threshold": 3.0,
+ })
+ floor = max(dispatch["p50"], combine["p50"]) if dispatch and combine else None
+ if floor and roundtrip["p50"] < 0.95 * floor:
+ anomalies.append({
+ "type": "roundtrip_lt_component_floor",
+ "T": tokens,
+ "roundtrip_p50": round(roundtrip["p50"], 2),
+ "component_floor_p50": round(floor, 2),
+ })
+ return anomalies
+
+
+def _validate_canonical_workload(
+ workload: dict[str, Any],
+ scheduled_case: dict[str, Any],
+ rows: list[dict[str, Any]],
+ eplb: dict[str, Any],
+) -> None:
+ """Bind every canonical member and measured routing hash to its scheduled token row."""
+ profile = identity.V1_CASE_PROFILE
+ if eplb["enabled"]:
+ plan = _expected_eplb_plan(
+ scheduled_case["routing"],
+ scheduled_case["topk"],
+ scheduled_case["experts"],
+ eplb["num_physical_experts"],
+ scheduled_case["ep"],
+ profile["seed"],
+ profile["eplb_reference_tokens_per_rank"],
+ )
+ if eplb["mapping_hash"] != eplb_contract.mapping_hash(plan):
+ raise ContractError("raw EPLB mapping differs from the frozen canonical plan")
+
+ expected: dict[str, dict[str, str]] = {}
+ for index, row in enumerate(rows):
+ member, checksums, routing_hash, _, _ = _expected_canonical_trace(
+ scheduled_case["routing"],
+ scheduled_case["hidden"],
+ scheduled_case["topk"],
+ scheduled_case["experts"],
+ eplb["num_physical_experts"],
+ scheduled_case["ep"],
+ row["tokens_per_rank"],
+ profile["seed"],
+ eplb["enabled"],
+ profile["eplb_reference_tokens_per_rank"],
+ )
+ if row["routing"]["hash"] != routing_hash:
+ raise ContractError(
+ f"raw.measurement.rows[{index}].routing.hash differs from its canonical member"
+ )
+ expected[member] = checksums
+ if (
+ len(expected) != len(rows)
+ or workload["members"] != sorted(expected)
+ or workload["manifest_checksums"] != expected
+ ):
+ raise ContractError("raw canonical member set/checksums differ from scheduled rows")
+ expected_workload_id = identity.workload_id({
+ "members": [
+ {"checksums": expected[member], "workload_id": member}
+ for member in sorted(expected)
+ ]
+ })
+ if workload["workload_id"] != expected_workload_id:
+ raise ContractError("raw composite workload identity differs from scheduled rows")
+
+
+def _nearest_rank(samples: list[float], q: int) -> float:
+ ordered = sorted(samples)
+ return ordered[max(0, min(len(ordered) - 1, math.ceil(q / 100 * len(ordered)) - 1))]
+
+
+def _close(observed: Any, expected: float, path: str, tolerance: float = 1e-6) -> None:
+ value = _number(observed, path)
+ if not math.isclose(value, expected, rel_tol=tolerance, abs_tol=tolerance):
+ raise ContractError(f"{path}={value} differs from recomputed {expected}")
+
+
+def _equivalent(
+ observed: Any, expected: Any, path: str, *, tolerance: float = 1e-6
+) -> None:
+ """Compare a recomputed JSON subtree while allowing only float roundoff."""
+ if isinstance(expected, dict):
+ value = _keys(observed, set(expected), path)
+ for key, child in expected.items():
+ _equivalent(value[key], child, f"{path}.{key}", tolerance=tolerance)
+ return
+ if isinstance(expected, list):
+ if not isinstance(observed, list) or len(observed) != len(expected):
+ raise ContractError(f"{path} differs from recomputed evidence")
+ for index, child in enumerate(expected):
+ _equivalent(observed[index], child, f"{path}[{index}]", tolerance=tolerance)
+ return
+ if isinstance(expected, float):
+ _close(observed, expected, path, tolerance)
+ return
+ if type(observed) is not type(expected) or observed != expected:
+ raise ContractError(f"{path} differs from recomputed evidence")
+
+
+def _schema_equal(left: Any, right: Any) -> bool:
+ """JSON Schema equality: booleans are distinct from numbers."""
+ if isinstance(left, bool) or isinstance(right, bool):
+ return type(left) is type(right) and left == right
+ if isinstance(left, dict) and isinstance(right, dict):
+ return set(left) == set(right) and all(
+ _schema_equal(left[key], right[key]) for key in left
+ )
+ if isinstance(left, list) and isinstance(right, list):
+ return len(left) == len(right) and all(
+ _schema_equal(a, b) for a, b in zip(left, right, strict=True)
+ )
+ return left == right
+
+
+def _schema_ref(root: dict[str, Any], reference: str) -> dict[str, Any]:
+ if not reference.startswith("#/"):
+ raise ContractError("native artifact schema contains a non-local reference")
+ value: Any = root
+ for part in reference[2:].split("/"):
+ part = part.replace("~1", "/").replace("~0", "~")
+ if not isinstance(value, dict) or part not in value:
+ raise ContractError("native artifact schema contains a broken reference")
+ value = value[part]
+ if not isinstance(value, dict):
+ raise ContractError("native artifact schema reference is not an object")
+ return value
+
+
+def _schema_type_matches(value: Any, expected: str) -> bool:
+ if expected == "null":
+ return value is None
+ if expected == "boolean":
+ return type(value) is bool
+ if expected == "object":
+ return isinstance(value, dict)
+ if expected == "array":
+ return isinstance(value, list)
+ if expected == "string":
+ return isinstance(value, str)
+ if expected == "number":
+ return (
+ not isinstance(value, bool)
+ and isinstance(value, (int, float))
+ and math.isfinite(value)
+ )
+ if expected == "integer":
+ return (
+ not isinstance(value, bool)
+ and isinstance(value, (int, float))
+ and math.isfinite(value)
+ and float(value).is_integer()
+ )
+ raise ContractError(f"native artifact schema uses unsupported type {expected!r}")
+
+
+def _validate_schema_value(
+ value: Any, schema: dict[str, Any], root: dict[str, Any], path: str
+) -> None:
+ """Validate the bounded JSON Schema subset used by native artifact contracts."""
+ if "$ref" in schema:
+ _validate_schema_value(value, _schema_ref(root, schema["$ref"]), root, path)
+ return
+ if "oneOf" in schema:
+ matches = 0
+ for candidate in schema["oneOf"]:
+ try:
+ _validate_schema_value(value, candidate, root, path)
+ except ContractError:
+ continue
+ matches += 1
+ if matches != 1:
+ raise ContractError(f"{path} must match exactly one native schema alternative")
+ return
+ expected_type = schema.get("type")
+ if expected_type is not None and not _schema_type_matches(value, expected_type):
+ raise ContractError(f"{path} is not a schema {expected_type}")
+ if "const" in schema and not _schema_equal(value, schema["const"]):
+ raise ContractError(f"{path} differs from its schema constant")
+ if "enum" in schema and not any(_schema_equal(value, item) for item in schema["enum"]):
+ raise ContractError(f"{path} is outside its schema enum")
+
+ if isinstance(value, dict):
+ required = set(schema.get("required", ()))
+ properties = schema.get("properties", {})
+ missing = required - set(value)
+ if missing:
+ raise ContractError(f"{path} lacks schema fields {sorted(missing)}")
+ additional = schema.get("additionalProperties", True)
+ extra = set(value) - set(properties)
+ if additional is False and extra:
+ raise ContractError(f"{path} has extra schema fields {sorted(extra)}")
+ for key, item in value.items():
+ if key in properties:
+ _validate_schema_value(item, properties[key], root, f"{path}.{key}")
+ elif isinstance(additional, dict):
+ _validate_schema_value(item, additional, root, f"{path}.{key}")
+ property_names = schema.get("propertyNames")
+ if property_names is not None:
+ for key in value:
+ _validate_schema_value(key, property_names, root, f"{path}.")
+
+ if isinstance(value, list):
+ if len(value) < schema.get("minItems", 0):
+ raise ContractError(f"{path} has too few schema items")
+ maximum = schema.get("maxItems")
+ if maximum is not None and len(value) > maximum:
+ raise ContractError(f"{path} has too many schema items")
+ if schema.get("uniqueItems") and any(
+ _schema_equal(item, prior)
+ for index, item in enumerate(value)
+ for prior in value[:index]
+ ):
+ raise ContractError(f"{path} schema items are not unique")
+ if "items" in schema:
+ for index, item in enumerate(value):
+ _validate_schema_value(item, schema["items"], root, f"{path}[{index}]")
+
+ if isinstance(value, str):
+ if len(value) < schema.get("minLength", 0):
+ raise ContractError(f"{path} is shorter than its schema minimum")
+ maximum = schema.get("maxLength")
+ if maximum is not None and len(value) > maximum:
+ raise ContractError(f"{path} is longer than its schema maximum")
+ if "pattern" in schema and re.search(schema["pattern"], value) is None:
+ raise ContractError(f"{path} does not match its schema pattern")
+ if schema.get("format") == "date-time":
+ try:
+ parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
+ except ValueError as exc:
+ raise ContractError(f"{path} is not a schema date-time") from exc
+ if parsed.tzinfo is None:
+ raise ContractError(f"{path} schema date-time lacks a timezone")
+
+ if (
+ not isinstance(value, bool)
+ and isinstance(value, (int, float))
+ and math.isfinite(value)
+ ):
+ if "minimum" in schema and value < schema["minimum"]:
+ raise ContractError(f"{path} is below its schema minimum")
+ if "maximum" in schema and value > schema["maximum"]:
+ raise ContractError(f"{path} is above its schema maximum")
+
+
+def _validate_native_schema(name: str, value: Any) -> None:
+ schema = _SCHEMA_CACHE.get(name)
+ if schema is None:
+ loaded = strict_load(SCHEMA_DIR / name)
+ if not isinstance(loaded, dict):
+ raise ContractError(f"native artifact schema {name} is not an object")
+ schema = loaded
+ _SCHEMA_CACHE[name] = schema
+ _validate_schema_value(value, schema, schema, "$")
+
+
+def validate_samples_document(document: Any) -> dict[str, Any]:
+ """Validate a v1 samples document and return it.
+
+ Checks, in order: the native ``samples-v1.schema.json`` schema, the exact
+ top-level key set, format and schema version, the typed IDs, the fixed sampling
+ contract (8 iterations per trial, 64 trials), and every point. Each point needs a
+ unique ``tokens_per_rank``, typed point/evidence IDs, exactly the ``combine``,
+ ``dispatch`` and ``roundtrip`` components, and a ``sample_sha256`` matching the
+ hash of its components and token count.
+
+ Raises:
+ ContractError: On the first violation found.
+ """
+ _validate_native_schema("samples-v1.schema.json", document)
+ doc = _keys(
+ document,
+ {"allocation_id", "attempt_id", "case_id", "format", "points", "sampling",
+ "schema_version", "series_id"},
+ "samples",
+ )
+ if doc["format"] != SAMPLES_FORMAT or doc["schema_version"] != 1:
+ raise ContractError("samples format/schema differs from v1")
+ # Each identifier must be a typed ID of the matching kind.
+ for field, kind in (
+ ("allocation_id", "allocation"), ("attempt_id", "attempt"),
+ ("case_id", "case"), ("series_id", "series"),
+ ):
+ _typed(doc[field], kind, f"samples.{field}")
+ sampling = _keys(
+ doc["sampling"], {"iterations_per_trial", "reduction", "trials"}, "samples.sampling"
+ )
+ if (
+ _integer(sampling["iterations_per_trial"], "samples.sampling.iterations_per_trial", minimum=1) != 8
+ or _integer(sampling["trials"], "samples.sampling.trials", minimum=1) != 64
+ or sampling["reduction"] != identity.V1_CASE_PROFILE["rank_reduction"]
+ ):
+ raise ContractError("samples must use the fixed 8x64 cross-rank-max contract")
+ points = doc["points"]
+ if not isinstance(points, list) or not points:
+ raise ContractError("samples.points must be non-empty")
+ # Token counts already seen; each tokens_per_rank may appear only once.
+ seen = set()
+ for index, point_value in enumerate(points):
+ path = f"samples.points[{index}]"
+ point = _keys(
+ point_value,
+ {"components", "evidence_id", "point_id", "sample_sha256", "tokens_per_rank"},
+ path,
+ )
+ tokens = _integer(point["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1)
+ if tokens in seen:
+ raise ContractError(f"duplicate sample token point {tokens}")
+ seen.add(tokens)
+ _typed(point["point_id"], "point", f"{path}.point_id")
+ _typed(point["evidence_id"], "evidence", f"{path}.evidence_id")
+ components = _keys(point["components"], {"combine", "dispatch", "roundtrip"}, f"{path}.components")
+ for name, component_value in components.items():
+ component = _keys(
+ component_value, {"availability", "sample_count", "trials"},
+ f"{path}.components.{name}",
+ )
+ availability = component["availability"]
+ count = _integer(component["sample_count"], f"{path}.components.{name}.sample_count")
+ trials = component["trials"]
+ if availability == "unavailable":
+ # An unavailable component carries no samples; roundtrip may never be unavailable.
+ if count != 0 or trials is not None or name == "roundtrip":
+ raise ContractError(f"{path}.components.{name} has invalid unavailability")
+ continue
+ if availability != "measured" or not isinstance(trials, list) or len(trials) != 64:
+ raise ContractError(f"{path}.components.{name} must contain 64 measured trials")
+ if any(not isinstance(trial, list) or len(trial) != 8 for trial in trials):
+ raise ContractError(f"{path}.components.{name} trials must each contain 8 samples")
+ # Flattening also validates every sample as a finite number >= 0.
+ flattened = [
+ _number(sample, f"{path}.components.{name}.trials", minimum=0.0)
+ for trial in trials for sample in trial
+ ]
+ if count != 512 or len(flattened) != 512:
+ raise ContractError(f"{path}.components.{name} must contain 512 samples")
+ # The digest covers exactly the components and the token count.
+ sample_base = {"components": components, "tokens_per_rank": tokens}
+ if point["sample_sha256"] != _sha256_json(sample_base):
+ raise ContractError(f"{path}.sample_sha256 differs")
+ return doc
+
+
+def _validate_component(
+ component_value: Any,
+ sample_component: dict[str, Any] | None,
+ path: str,
+ *,
+ derived: bool = False,
+) -> None:
+ component = _keys(
+ component_value, {"availability", "origin", "percentiles_us", "sample_count"}, path
+ )
+ availability = component["availability"]
+ if availability == "unavailable":
+ if component != {
+ "availability": "unavailable", "origin": None,
+ "percentiles_us": None, "sample_count": 0,
+ }:
+ raise ContractError(f"{path} has invalid unavailable representation")
+ if sample_component and sample_component["availability"] != "unavailable":
+ raise ContractError(f"{path} disagrees with samples")
+ return
+ expected_availability = "derived" if derived else "measured"
+ expected_origin = "derived-percentile-sum" if derived else "measured"
+ if availability != expected_availability or component["origin"] != expected_origin:
+ raise ContractError(f"{path} has invalid availability/origin")
+ percentiles = _keys(component["percentiles_us"], set(PERCENTILES), f"{path}.percentiles_us")
+ if derived:
+ if component["sample_count"] != 0:
+ raise ContractError(f"{path}.sample_count must be zero for a derived value")
+ return
+ if sample_component is None or sample_component["availability"] != "measured":
+ raise ContractError(f"{path} lacks measured sample evidence")
+ flattened = [sample for trial in sample_component["trials"] for sample in trial]
+ if component["sample_count"] != len(flattened):
+ raise ContractError(f"{path}.sample_count differs from exact samples")
+ for name, percentile in zip(PERCENTILES, (50, 90, 95, 99), strict=True):
+ _close(percentiles[name], _nearest_rank(flattened, percentile), f"{path}.{name}")
+
+
+def _validate_oracle(value: Any, path: str) -> dict[str, Any]:
+ oracle = _keys(
+ value,
+ {"atol", "checks", "combine_weight_semantics", "contract", "dispatch_sha256",
+ "max_absolute_error", "max_elementwise_relative_error", "max_relative_error",
+ "max_weight_error", "order_sha256", "ordering_contract", "passed", "receive_count",
+ "rtol"},
+ path,
+ )
+ if oracle["contract"] != identity.V1_CASE_PROFILE["oracle_contract"]:
+ raise ContractError(f"{path}.contract differs")
+ checks = _keys(
+ oracle["checks"],
+ {"combine_values", "counts", "metadata", "multiplicity", "payload", "source_set",
+ "weights"},
+ f"{path}.checks",
+ )
+ if any(type(value) is not bool for value in checks.values()):
+ raise ContractError(f"{path}.checks must be boolean")
+ if type(oracle["passed"]) is not bool:
+ raise ContractError(f"{path}.passed must be boolean")
+ _integer(oracle["receive_count"], f"{path}.receive_count")
+ _text(oracle["ordering_contract"], f"{path}.ordering_contract")
+ if oracle["combine_weight_semantics"] != "unweighted-rank-sum":
+ raise ContractError(f"{path}.combine_weight_semantics differs from v1")
+ _close(oracle["rtol"], 5e-2, f"{path}.rtol")
+ _close(oracle["atol"], 2e-2, f"{path}.atol")
+ for field in ("dispatch_sha256", "order_sha256"):
+ digest = oracle[field]
+ if digest is not None and (
+ not isinstance(digest, str) or len(digest) != 64
+ or any(character not in "0123456789abcdef" for character in digest)
+ ):
+ raise ContractError(f"{path}.{field} is not a SHA-256 digest")
+ for field in (
+ "max_absolute_error", "max_elementwise_relative_error", "max_relative_error",
+ "max_weight_error",
+ ):
+ if oracle[field] is not None:
+ _number(oracle[field], f"{path}.{field}", minimum=0.0)
+ expected_pass = (
+ all(checks.values())
+ and oracle["max_relative_error"] is not None
+ and oracle["max_relative_error"] < 5e-2
+ )
+ if oracle["passed"] != expected_pass:
+ raise ContractError(f"{path}.passed differs from its evidence")
+ return oracle
+
+
+def validate_raw_document(document: Any, samples_document: Any) -> dict[str, Any]:
+ """Validate identities, exact samples, formulas, privacy, and the native raw shape."""
+ _validate_native_schema("raw-case-v1.schema.json", document)  # schema gate first; checks below raise ContractError
+ doc = _keys(  # _keys is defined elsewhere; presumably enforces exactly these keys — confirm
+ document,
+ {"case", "format", "generated_at", "identity", "implementation", "measurement",
+ "outcome", "provenance", "record_type", "runtime_fingerprint", "sample_artifact",
+ "schema_version", "topology", "workload"},
+ "raw",
+ )
+ _finite_tree(doc)  # presumably rejects non-finite numbers — confirm in _finite_tree
+ if doc["format"] != RAW_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "case-attempt":
+ raise ContractError("raw format/schema/record type differs from v1")
+ _text(doc["generated_at"], "raw.generated_at")
+ identifiers = _keys(
+ doc["identity"],
+ {"allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", "case_factors",
+ "case_id", "series_factors", "series_id"},
+ "raw.identity",
+ )
+ for field, kind in (
+ ("allocation_id", "allocation"), ("attempt_id", "attempt"),
+ ("case_id", "case"), ("series_id", "series"),
+ ):
+ _typed(identifiers[field], kind, f"raw.identity.{field}")
+ ordinal = _integer(identifiers["attempt_ordinal"], "raw.identity.attempt_ordinal", minimum=1)
+ allocation_factors = _keys(
+ identifiers["allocation_factors"], ALLOCATION_FACTOR_FIELDS,
+ "raw.identity.allocation_factors",
+ )
+ case_factors = _keys(
+ identifiers["case_factors"], {"case", "profile", "sku"},
+ "raw.identity.case_factors",
+ )
+ scheduled_case = _keys(
+ case_factors["case"], TERMINAL_CASE_FIELDS, "raw.identity.case_factors.case"
+ )
+ if case_factors["profile"] != identity.V1_CASE_PROFILE:
+ raise ContractError("raw case profile differs from CollectiveX v1")
+ _text(case_factors["sku"], "raw.identity.case_factors.sku")
+ series_factors = _keys(
+ identifiers["series_factors"],
+ {"backend", "case_id", "image_digest", "implementation_contract_sha256",
+ "public_config_sha256", "routing_control_sha256",
+ "runtime_fingerprint_sha256", "source_sha", "squash_sha256", "workload_id"},
+ "raw.identity.series_factors",
+ )  # below: each stored ID is recomputed from its factors and must match
+ if identity.allocation_id(identifiers["allocation_factors"]) != identifiers["allocation_id"]:
+ raise ContractError("allocation identity differs")
+ if identity.digest("case", identifiers["case_factors"]) != identifiers["case_id"]:
+ raise ContractError("case identity differs")
+ if identity.series_id(identifiers["series_factors"]) != identifiers["series_id"]:
+ raise ContractError("series identity differs")
+ if identity.attempt_id(
+ allocation=identifiers["allocation_id"], case=identifiers["case_id"], ordinal=ordinal
+ ) != identifiers["attempt_id"]:
+ raise ContractError("attempt identity differs")
+
+ samples = validate_samples_document(samples_document)  # samples must carry the same four IDs
+ for field in ("allocation_id", "attempt_id", "case_id", "series_id"):
+ if samples[field] != identifiers[field]:
+ raise ContractError(f"samples.{field} differs from raw identity")
+ sample_by_token = {point["tokens_per_rank"]: point for point in samples["points"]}
+
+ case = _keys(  # case section as recorded by the run
+ doc["case"],
+ {"attempt_ordinal", "backend", "eplb", "ep_size", "mode", "phase",
+ "required_publication", "resource_mode", "runner", "shape", "suite", "workload_name"},
+ "raw.case",
+ )
+ ep_size = _integer(case["ep_size"], "raw.case.ep_size", minimum=1)
+ if case["attempt_ordinal"] != ordinal:
+ raise ContractError("case attempt ordinal differs")
+ for field in ("backend", "mode", "phase", "required_publication", "resource_mode", "runner",
+ "suite", "workload_name"):
+ _text(case[field], f"raw.case.{field}")
+ shape = _keys(
+ case["shape"],
+ {"activation_profile", "dispatch_dtype", "eplb", "experts", "experts_per_rank",
+ "hidden", "kernel_gen", "num_logical_experts", "quant", "routing", "topk"},
+ "raw.case.shape",
+ )
+ hidden = _integer(shape["hidden"], "raw.case.shape.hidden", minimum=1)
+ topk = _integer(shape["topk"], "raw.case.shape.topk", minimum=1)
+ physical_experts = _integer(
+ shape["experts"], "raw.case.shape.experts", minimum=1
+ )
+ logical_experts = _integer(
+ shape["num_logical_experts"],
+ "raw.case.shape.num_logical_experts",
+ minimum=1,
+ )
+ experts_per_rank = _integer(
+ shape["experts_per_rank"], "raw.case.shape.experts_per_rank", minimum=1
+ )
+ quant = _keys(
+ shape["quant"],
+ {"combine_accum_dtype", "combine_input_dtype", "combine_output_dtype",
+ "combine_quant_mode", "scale_layout"},
+ "raw.case.shape.quant",
+ )
+ eplb = _keys(
+ case["eplb"],
+ {"enabled", "imbalance_after", "imbalance_before", "mapping_hash", "max_replicas",
+ "num_logical_experts", "num_physical_experts", "num_redundant", "planner",
+ "reference_tokens_per_rank", "replicated_experts"},
+ "raw.case.eplb",
+ )
+ if not isinstance(eplb["enabled"], bool):
+ raise ContractError("raw.case.eplb.enabled must be boolean")
+ expected_redundant = (  # redundancy comes from the frozen profile, only when EPLB is enabled
+ identity.V1_CASE_PROFILE["eplb_redundant_experts"] if eplb["enabled"] else 0
+ )
+ expected_physical = eplb_contract.physical_count(
+ scheduled_case["experts"], expected_redundant, ep_size
+ )
+ if (  # reported dimensions must agree with each other and with the scheduled case
+ shape["eplb"] != eplb["enabled"]
+ or logical_experts != scheduled_case["experts"]
+ or physical_experts != expected_physical
+ or experts_per_rank * ep_size != physical_experts
+ or eplb["num_logical_experts"] != logical_experts
+ or eplb["num_physical_experts"] != physical_experts
+ or eplb["num_redundant"] != expected_redundant
+ ):
+ raise ContractError("raw EPLB/shape dimensions differ from the frozen profile")
+ if eplb["enabled"]:  # recompute the EPLB plan and expect the record to match it
+ expected_plan = _expected_eplb_plan(
+ scheduled_case["routing"],
+ topk,
+ logical_experts,
+ physical_experts,
+ ep_size,
+ identity.V1_CASE_PROFILE["seed"],
+ identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"],
+ )
+ expected_eplb = {
+ "enabled": True,
+ "imbalance_after": expected_plan["imbalance_after"],
+ "imbalance_before": expected_plan["imbalance_before"],
+ "mapping_hash": eplb_contract.mapping_hash(expected_plan),
+ "max_replicas": expected_plan["max_replicas"],
+ "num_logical_experts": logical_experts,
+ "num_physical_experts": physical_experts,
+ "num_redundant": expected_redundant,
+ "planner": identity.V1_CASE_PROFILE["eplb_planner"],
+ "reference_tokens_per_rank": identity.V1_CASE_PROFILE[
+ "eplb_reference_tokens_per_rank"
+ ],
+ "replicated_experts": expected_plan["replicated_experts"],
+ }
+ else:  # EPLB disabled: plan-derived fields must be None and the counts zero
+ expected_eplb = {
+ "enabled": False,
+ "imbalance_after": None,
+ "imbalance_before": None,
+ "mapping_hash": None,
+ "max_replicas": None,
+ "num_logical_experts": logical_experts,
+ "num_physical_experts": physical_experts,
+ "num_redundant": 0,
+ "planner": None,
+ "reference_tokens_per_rank": None,
+ "replicated_experts": 0,
+ }
+ _equivalent(eplb, expected_eplb, "raw.case.eplb", tolerance=1e-9)
+ if case_factors["sku"] != case["runner"]:
+ raise ContractError("raw case runner differs from case identity")
+
+ workload = _keys(  # workload section
+ doc["workload"],
+ {"activation_generator", "activation_identity", "activation_profile",
+ "cross_rank_consistent", "manifest_checksums", "members", "routing_generator", "source",
+ "trace_hashes", "trace_signature", "workload_id"},
+ "raw.workload",
+ )
+ if workload["source"] not in {"canonical-serialized", "seeded-runtime"}:
+ raise ContractError("raw workload source is invalid")
+ if workload["source"] == "canonical-serialized":  # needs sorted, unique, typed members
+ _typed(workload["workload_id"], "workload", "raw.workload.workload_id")
+ members = workload["members"]
+ checksums = workload["manifest_checksums"]
+ if (
+ not isinstance(members, list)
+ or not members
+ or members != sorted(set(members))
+ or not all(identity.is_typed_id(member, "workload") for member in members)
+ or not isinstance(checksums, dict)
+ or set(checksums) != set(members)
+ ):
+ raise ContractError("raw canonical workload members/checksums are invalid")
+ for member, values in checksums.items():
+ if (
+ not isinstance(values, dict)
+ or set(values) != {"topk_idx", "topk_weights", "trace"}
+ or any(not re.fullmatch(r"[0-9a-f]{64}", str(value)) for value in values.values())
+ ):
+ raise ContractError(f"raw canonical workload checksums differ for {member}")
+ expected_workload_id = identity.workload_id({  # composite ID derived from members + checksums
+ "members": [
+ {"checksums": checksums[member], "workload_id": member}
+ for member in members
+ ]
+ })
+ if workload["workload_id"] != expected_workload_id:
+ raise ContractError("raw composite workload identity differs from its members")
+ elif any(workload[field] is not None for field in ("members", "manifest_checksums", "workload_id")):
+ raise ContractError("raw seeded workload cannot claim serialized members")
+ if workload["cross_rank_consistent"] is not True:
+ raise ContractError("raw workload is not consistent across ranks")
+
+ measurement = _keys(  # measurement section: contracts, sampling and per-point rows
+ doc["measurement"],
+ {"component_order_contract", "conditioning", "contract", "rows",
+ "sampling", "source_allocation"},
+ "raw.measurement",
+ )
+ validate_conditioning_contract(measurement["conditioning"], case["phase"])
+ sampling = _keys(
+ measurement["sampling"],
+ {"contract", "iterations_per_trial", "percentile_method", "reduction",
+ "samples_per_component", "trials", "warmup_iterations", "warmup_semantics"},
+ "raw.measurement.sampling",
+ )
+ expected_sampling = {  # fixed v1 sampling parameters; any deviation is rejected
+ "contract": identity.V1_CASE_PROFILE["sampling_contract"], "iterations_per_trial": 8,
+ "percentile_method": identity.V1_CASE_PROFILE["percentile_method"],
+ "reduction": identity.V1_CASE_PROFILE["rank_reduction"],
+ "samples_per_component": 512, "trials": 64, "warmup_iterations": 32,
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ }
+ if sampling != expected_sampling:
+ raise ContractError("raw sampling contract differs from fixed-512-v1")
+ profile = identity.V1_CASE_PROFILE  # frozen v1 profile used for the comparisons below
+ if (
+ case["mode"] != profile["mode"]
+ or case["resource_mode"] != profile["resource_mode"]
+ or measurement["contract"] != profile["contract"]
+ or measurement["component_order_contract"] != profile["component_order_contract"]
+ or measurement["source_allocation"] != "even"
+ or shape["activation_profile"] != profile["activation_profile"]
+ or shape["dispatch_dtype"] != profile["dtype"]
+ or quant["combine_input_dtype"] != profile["combine_dtype"]
+ or quant["combine_output_dtype"] != profile["combine_dtype"]
+ or quant["combine_quant_mode"] != profile["combine_quant_mode"]
+ or quant["scale_layout"] is not None
+ or workload["activation_generator"] != profile["activation_generator"]
+ or workload["activation_profile"] != profile["activation_profile"]
+ or workload["routing_generator"] != profile["routing_generator"]
+ ):
+ raise ContractError("raw case differs from the frozen v1 profile")
+ expected_activation = hashlib.sha256(  # digest over seed, hidden size and generator name
+ (
+ f"counter|seed={profile['seed']}|hidden={hidden}|"
+ f"gen={profile['activation_generator']}"
+ ).encode()
+ ).hexdigest()
+ if workload["activation_identity"] != expected_activation:
+ raise ContractError("raw activation identity differs from the frozen seed/profile")
+ rows = measurement["rows"]  # one row per tokens_per_rank point
+ if not isinstance(rows, list) or not rows:
+ raise ContractError("raw.measurement.rows must be non-empty")
+ seen_points = set()  # token counts already visited, to reject duplicates
+ row_tokens = []  # token counts in document order; checked for sortedness after the loop
+ recomputed_anomalies = 0  # total of expected anomalies; compared with validity.anomaly_free
+ for index, row_value in enumerate(rows):
+ path = f"raw.measurement.rows[{index}]"
+ row = _keys(
+ row_value,
+ {"anomalies", "components", "correctness", "evidence_id", "global_tokens",
+ "logical_bytes", "point_id", "receive", "routing",
+ "sample_histograms", "sample_sha256", "token_rate_at_latency_percentile",
+ "tokens_per_rank"},
+ path,
+ )
+ tokens = _integer(row["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1)
+ row_tokens.append(tokens)
+ if tokens in seen_points or tokens not in sample_by_token:
+ raise ContractError(f"{path} token point is duplicate or missing samples")
+ seen_points.add(tokens)
+ if row["global_tokens"] != tokens * ep_size:
+ raise ContractError(f"{path}.global_tokens formula differs")
+ sample_point = sample_by_token[tokens]
+ expected_point = identity.point_id(series=identifiers["series_id"], tokens_per_rank=tokens)
+ if row["point_id"] != expected_point or sample_point["point_id"] != expected_point:
+ raise ContractError(f"{path}.point_id differs")
+ expected_evidence = identity.evidence_id(
+ point=expected_point, allocation=identifiers["allocation_id"],
+ attempt=identifiers["attempt_id"], sample_sha256=sample_point["sample_sha256"],
+ )
+ if row["evidence_id"] != expected_evidence or sample_point["evidence_id"] != expected_evidence:
+ raise ContractError(f"{path}.evidence_id differs")
+ if row["sample_sha256"] != sample_point["sample_sha256"]:
+ raise ContractError(f"{path}.sample_sha256 differs")
+ components = _keys(
+ row["components"], {"combine", "dispatch", "isolated_sum", "roundtrip"},
+ f"{path}.components",
+ )
+ for name in ("combine", "dispatch", "roundtrip"):  # each checked against its sample component
+ _validate_component(
+ components[name], sample_point["components"][name], f"{path}.components.{name}"
+ )
+ _validate_component(  # isolated_sum is derived, so no sample component is passed
+ components["isolated_sum"], None, f"{path}.components.isolated_sum", derived=True
+ )
+ _, _, _, expected_indices, expected_weights = _expected_canonical_trace(
+ scheduled_case["routing"],
+ hidden,
+ topk,
+ logical_experts,
+ physical_experts,
+ ep_size,
+ tokens,
+ profile["seed"],
+ eplb["enabled"],
+ profile["eplb_reference_tokens_per_rank"],
+ )
+ expected_routing = _expected_routing_summary(
+ expected_indices,
+ expected_weights,
+ physical_experts=physical_experts,
+ ep_size=ep_size,
+ tokens_per_rank=tokens,
+ gpus_per_node=scheduled_case["gpus_per_node"],
+ scale_up_domain=scheduled_case["scale_up_domain"],
+ )
+ _equivalent(
+ row["routing"], expected_routing, f"{path}.routing", tolerance=1e-5
+ )
+ expected_payload_counts = expected_routing["payload_copies_per_rank"]
+ throughput = _keys(
+ row["token_rate_at_latency_percentile"], set(PERCENTILES),
+ f"{path}.token_rate_at_latency_percentile",
+ )
+ for percentile in PERCENTILES:  # rate = global_tokens / (roundtrip latency_us * 1e-6)
+ latency = components["roundtrip"]["percentiles_us"][percentile]
+ if latency <= 0:
+ raise ContractError(f"{path} roundtrip latency must be positive")
+ _close(
+ throughput[percentile], row["global_tokens"] / (latency * 1e-6),
+ f"{path}.token_rate_at_latency_percentile.{percentile}", 1e-9,
+ )
+ correctness = _keys(
+ row["correctness"],
+ {"contract", "max_relative_error", "passed", "rank_evidence", "scope"},
+ f"{path}.correctness",
+ )
+ if (
+ correctness["contract"] != identity.V1_CASE_PROFILE["oracle_contract"]
+ or correctness["scope"] != "dispatch-metadata-and-transformed-combine"
+ or type(correctness["passed"]) is not bool
+ ):
+ raise ContractError(f"{path}.correctness contract differs")
+ _number(
+ correctness["max_relative_error"],
+ f"{path}.correctness.max_relative_error",
+ minimum=0.0,
+ )
+ rank_evidence = correctness["rank_evidence"]
+ if not isinstance(rank_evidence, list) or len(rank_evidence) != ep_size:
+ raise ContractError(f"{path}.correctness.rank_evidence must cover every rank")
+ ranks = set()  # ranks seen; compared with range(ep_size) after the loop
+ observed_max_error = 0.0  # running max of pre/post max_relative_error over all ranks
+ evidence_passed = True  # AND of per-rank stability flags and both oracle outcomes
+ for evidence_index, evidence_value in enumerate(rank_evidence):
+ evidence_path = f"{path}.correctness.rank_evidence[{evidence_index}]"
+ evidence = _keys(
+ evidence_value,
+ {"input_unchanged", "order_stable", "post_timing", "pre_timing", "rank"},
+ evidence_path,
+ )
+ evidence_rank = _integer(evidence["rank"], f"{evidence_path}.rank")
+ if evidence_rank >= ep_size:  # NOTE(review): no lower bound unless _integer defaults to minimum=0 — confirm
+ raise ContractError(f"{evidence_path}.rank is outside the EP group")
+ ranks.add(evidence_rank)
+ if type(evidence["input_unchanged"]) is not bool or type(evidence["order_stable"]) is not bool:
+ raise ContractError(f"{evidence_path} stability fields must be boolean")
+ pre = _validate_oracle(evidence["pre_timing"], f"{evidence_path}.pre_timing")
+ post = _validate_oracle(evidence["post_timing"], f"{evidence_path}.post_timing")
+ if (
+ pre["receive_count"] != expected_payload_counts[evidence_rank]
+ or post["receive_count"] != expected_payload_counts[evidence_rank]
+ ):
+ raise ContractError(
+ f"{evidence_path}.receive_count differs from canonical routing"
+ )
+ expected_stability = all(  # stable means pre/post agree on contract and both digests
+ pre[field] == post[field]
+ for field in ("ordering_contract", "order_sha256", "dispatch_sha256")
+ )
+ if evidence["order_stable"] != expected_stability:
+ raise ContractError(f"{evidence_path}.order_stable differs from the evidence")
+ errors = [
+ oracle["max_relative_error"]
+ for oracle in (pre, post)
+ if oracle["max_relative_error"] is not None
+ ]
+ observed_max_error = max([observed_max_error, *errors])
+ evidence_passed = evidence_passed and all(
+ (evidence["input_unchanged"], evidence["order_stable"], pre["passed"], post["passed"])
+ )
+ if ranks != set(range(ep_size)) or correctness["passed"] != evidence_passed:
+ raise ContractError(f"{path}.correctness rank coverage or outcome differs")
+ _close(
+ correctness["max_relative_error"], observed_max_error,
+ f"{path}.correctness.max_relative_error",
+ )
+ if components["dispatch"]["availability"] == "measured":  # isolated_sum = dispatch + combine
+ for percentile in PERCENTILES:
+ expected = (
+ components["dispatch"]["percentiles_us"][percentile]
+ + components["combine"]["percentiles_us"][percentile]
+ )
+ _close(
+ components["isolated_sum"]["percentiles_us"][percentile], expected,
+ f"{path}.components.isolated_sum.{percentile}",
+ )
+ routed_copies = expected_routing["routed_copies"]
+ expected_bytes = routed_copies * hidden * 2  # NOTE(review): 2 looks like bytes per element — confirm dtype
+ expected_logical = {
+ "combine": expected_bytes,
+ "dispatch": expected_bytes,
+ "roundtrip": expected_bytes * 2,
+ }
+ _equivalent(row["logical_bytes"], expected_logical, f"{path}.logical_bytes")
+
+ max_receive = max(expected_payload_counts)  # receive stats are recomputed from expected routing
+ expected_receive = {
+ "max": max_receive,
+ "mean": sum(expected_payload_counts) / ep_size,
+ "min": min(expected_payload_counts),
+ "total": sum(expected_payload_counts),
+ }
+ _equivalent(row["receive"], expected_receive, f"{path}.receive")
+ expected_histograms = {  # rebuilt from the flattened raw samples; None when not measured
+ name: (
+ _expected_histogram([
+ sample
+ for trial in sample_point["components"][name]["trials"]
+ for sample in trial
+ ])
+ if sample_point["components"][name]["availability"] == "measured"
+ else None
+ )
+ for name in ("dispatch", "combine", "roundtrip")
+ }
+ _equivalent(
+ row["sample_histograms"], expected_histograms, f"{path}.sample_histograms"
+ )
+ expected_anomalies = _expected_anomalies(tokens, components)
+ _equivalent(row["anomalies"], expected_anomalies, f"{path}.anomalies")
+ recomputed_anomalies += len(expected_anomalies)
+ if seen_points != set(sample_by_token):  # every sample point needs a row and vice versa
+ raise ContractError("raw rows and sample points differ")
+ if row_tokens != sorted(row_tokens):  # rows must be in ascending token order
+ raise ContractError("raw rows must follow the scheduled token ladder")
+ expected_trace_hashes = sorted(row["routing"]["hash"] for row in rows)
+ if workload["trace_hashes"] != expected_trace_hashes:
+ raise ContractError("raw workload trace hashes differ from measured rows")
+ expected_trace_signature = hashlib.sha256(
+ "|".join(expected_trace_hashes).encode()
+ ).hexdigest()
+ if workload["trace_signature"] != expected_trace_signature:
+ raise ContractError("raw workload trace signature differs from measured rows")
+
+ implementation = _keys(  # implementation section and backend-specific provenance rules
+ doc["implementation"], {"kernel_generation", "name", "provenance", "resource_profile"},
+ "raw.implementation",
+ )
+ if (
+ implementation["name"] != case["backend"]
+ or implementation["kernel_generation"] != shape["kernel_gen"]
+ ):
+ raise ContractError("raw implementation identity differs from the case")
+ provenance_fields = _obj(implementation["provenance"], "raw.implementation.provenance")
+ unknown = set(provenance_fields) - PROVENANCE_KEYS
+ if unknown:
+ raise ContractError(f"raw implementation provenance has unknown fields {sorted(unknown)}")
+ if (
+ implementation["name"] == "deepep-v2"
+ and provenance_fields.get("deterministic") is not False
+ ):
+ raise ContractError("DeepEP V2 deterministic mode differs from the v1 kernel contract")
+ if implementation["name"] == "deepep-v2" and (
+ _integer(
+ provenance_fields.get("tuning_num_experts"),
+ "raw.implementation.provenance.tuning_num_experts",
+ minimum=1,
+ ) != logical_experts
+ or _integer(
+ provenance_fields.get("num_experts"),
+ "raw.implementation.provenance.num_experts",
+ minimum=1,
+ ) != physical_experts
+ ):
+ raise ContractError("DeepEP V2 expert-count provenance differs from the case")
+ if implementation["name"] == "deepep-hybrid":
+ realized_config = provenance_fields.get("realized_config")
+ jit_kernel_keys = provenance_fields.get("jit_kernel_keys")
+ jit_shared_objects = provenance_fields.get("jit_shared_objects")
+ if (  # validity helpers run first; the indexing after them relies on `or` short-circuiting
+ not _hybrid_realized_config_is_valid(realized_config)
+ or not _hybrid_jit_evidence_is_valid(jit_shared_objects, jit_kernel_keys)
+ or realized_config["hidden_dim"] != shape["hidden"]
+ or realized_config["num_of_experts_per_rank"] * ep_size != physical_experts
+ or realized_config["num_of_ranks_per_node"] != ep_size
+ or realized_config["num_of_nodes"] != 1
+ or realized_config["token_data_type"] != "UINT16"
+ or any(
+ len(artifact["rank_artifacts"]) != ep_size
+ for artifact in jit_shared_objects
+ )
+ ):
+ raise ContractError("DeepEP Hybrid realized config/JIT evidence differs from the case")
+ if implementation["name"] == "nccl-ep" and implementation["kernel_generation"] != (
+ collective_kernel_generation(provenance_fields.get("collective_library"))
+ ):
+ raise ContractError("NCCL/RCCL kernel generation differs from collective lineage")
+ resource_profile = _obj(
+ implementation["resource_profile"], "raw.implementation.resource_profile"
+ )
+ expected_resource_profile = project_resource_profile(provenance_fields)  # recomputed from provenance
+ if resource_profile != expected_resource_profile:
+ raise ContractError("raw resource profile differs from implementation provenance")
+ topology = _keys(  # topology section
+ doc["topology"],
+ {"device_count", "device_product", "gpus_per_node", "nodes", "placement",
+ "realized_placement", "scale_up_domain", "topology_class", "transport", "world_size"},
+ "raw.topology",
+ )
+ for field in ("device_count", "gpus_per_node", "nodes", "scale_up_domain", "world_size"):
+ _integer(topology[field], f"raw.topology.{field}", minimum=1)
+ realized = _keys(
+ topology["realized_placement"],
+ {"gpus_per_node", "nodes", "ranks_per_node", "unique_local_ranks", "valid"},
+ "raw.topology.realized_placement",
+ )
+ if realized != {  # realized placement must mirror the requested topology exactly
+ "gpus_per_node": topology["gpus_per_node"],
+ "nodes": topology["nodes"],
+ "ranks_per_node": topology["gpus_per_node"],
+ "unique_local_ranks": True,
+ "valid": True,
+ }:
+ raise ContractError("raw realized placement differs from requested topology")
+ if (
+ topology["world_size"] != ep_size
+ or topology["nodes"] * topology["gpus_per_node"] != ep_size
+ or topology["device_count"] != topology["gpus_per_node"]
+ or topology["placement"] != profile["placement"]
+ or topology["scale_up_domain"] < ep_size
+ ):
+ raise ContractError("raw topology dimensions differ from the case")
+ if implementation["name"] == "deepep-v2":
+ if (
+ provenance_fields.get("allow_hybrid_mode"),
+ provenance_fields.get("gin_enabled"),
+ provenance_fields.get("communication_backend"),
+ ) != (False, False, "nccl-device-lsa"):
+ raise ContractError("DeepEP V2 communication policy differs from the v1 contract")
+ lsa_topology = tuple(
+ _integer(
+ provenance_fields.get(field),
+ f"raw.implementation.provenance.{field}",
+ minimum=1,
+ )
+ for field in (
+ "physical_rdma_ranks", "physical_nvlink_ranks",
+ "logical_scaleout_ranks", "logical_scaleup_ranks",
+ )
+ )
+ if lsa_topology != (1, ep_size, 1, ep_size):
+ raise ContractError("DeepEP V2 no-GIN provenance is outside one realized LSA domain")
+ runtime = _keys(  # runtime fingerprint section
+ doc["runtime_fingerprint"],
+ {"accelerator_runtime", "collective_library", "device", "driver_version", "framework",
+ "machine", "python_version", "vendor"},
+ "raw.runtime_fingerprint",
+ )
+ for field in ("machine", "python_version", "vendor"):
+ _text(runtime[field], f"raw.runtime_fingerprint.{field}")
+ runtime_device = _keys(
+ runtime["device"], {"arch", "compute_units", "memory_bytes", "product", "warp_size"},
+ "raw.runtime_fingerprint.device",
+ )
+ if topology["device_product"] != runtime_device["product"]:
+ raise ContractError("raw topology and runtime device products differ")
+ platform = capability.PLATFORMS.get(case["runner"])  # None for runners not in PLATFORMS
+ if platform is not None:  # platform cross-checks apply only to known runners
+ identity_issues = capability.runtime_identity_issues(
+ case["runner"], vendor=runtime["vendor"], arch=runtime_device["arch"],
+ machine=runtime["machine"], device_name=runtime_device["product"],
+ device_count=topology["device_count"], world_size=topology["world_size"],
+ )
+ expected_topology_class = (
+ f"{case['runner']}-nvl72-mnnvl"
+ if case["runner"] in {"gb200", "gb300"}
+ else f"{case['runner']}-xgmi"
+ if platform["vendor"] == "amd"
+ else f"{platform['product']}-nvlink-island"
+ )
+ if identity_issues or (
+ topology["transport"] != platform["transport"]
+ or topology["gpus_per_node"] != platform["gpus_per_node"]
+ or topology["scale_up_domain"] != platform["scale_up_domain"]
+ or topology["topology_class"] != expected_topology_class
+ ):
+ raise ContractError(
+ "raw runtime/topology differs from the scheduled SKU: "
+ + "; ".join(identity_issues)
+ )
+ raw_provenance = _keys(  # provenance section: image, git run, redaction
+ doc["provenance"], {"command", "distributed_launcher", "git_run", "image", "redaction"},
+ "raw.provenance",
+ )
+ image = _keys(
+ raw_provenance["image"],
+ {"arch", "digest", "digest_verified", "reference", "squash_sha256"},
+ "raw.provenance.image",
+ )
+ if (
+ image["digest_verified"] is not True
+ or not isinstance(image["digest"], str)
+ or not re.fullmatch(r"sha256:[0-9a-f]{64}", image["digest"])
+ ):
+ raise ContractError("raw image digest was not registry-verified")
+ if raw_provenance["redaction"] != "sanitized-v1":
+ raise ContractError("raw provenance redaction contract differs")
+ git_run = raw_provenance["git_run"]
+ if git_run is not None:  # git_run may be null; its fields are then expected as None below
+ git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run")
+ expected_provenance_complete = provenance_complete(
+ provenance_fields,
+ case["backend"],
+ git_run,
+ image_digest=image["digest"],
+ image_verified=image["digest_verified"],
+ squash_sha256=image["squash_sha256"],
+ )
+
+ actual_scheduled_case = {  # scheduled case rebuilt from the measured sections
+ "backend": case["backend"],
+ "canonical": workload["source"] == "canonical-serialized",
+ "eplb": eplb["enabled"],
+ "ep": ep_size,
+ "experts": shape["num_logical_experts"],
+ "gpus_per_node": topology["gpus_per_node"],
+ "hidden": hidden,
+ "ladder": " ".join(map(str, row_tokens)),
+ "nodes": topology["nodes"],
+ "phase": case["phase"],
+ "required_publication": case["required_publication"],
+ "routing": shape["routing"],
+ "samples_per_point": sampling["samples_per_component"],
+ "scale_up_domain": topology["scale_up_domain"],
+ "suite": case["suite"],
+ "timing": (
+ f"{sampling['iterations_per_trial']}:{sampling['trials']}:"
+ f"{sampling['warmup_iterations']}"
+ ),
+ "topk": shape["topk"],
+ "warmup_semantics": sampling["warmup_semantics"],
+ "workload": case["workload_name"],
+ }
+ if scheduled_case != actual_scheduled_case:  # on mismatch, name the differing fields
+ mismatches = sorted(
+ field for field in scheduled_case
+ if scheduled_case[field] != actual_scheduled_case[field]
+ )
+ raise ContractError(f"raw data differs from scheduled case fields {mismatches}")
+
+ if workload["source"] == "canonical-serialized":
+ _validate_canonical_workload(workload, scheduled_case, rows, eplb)
+
+ expected_series = {  # series factors recomputed from implementation, image and runtime
+ "backend": case["backend"],
+ "case_id": identifiers["case_id"],
+ "image_digest": image["digest"],
+ "implementation_contract_sha256": _sha256_json({
+ "kernel_generation": implementation["kernel_generation"],
+ "name": implementation["name"],
+ "provenance": series_provenance(provenance_fields),
+ "resource_profile": resource_profile,
+ }),
+ "public_config_sha256": public_series_config_sha256(public_series_config(
+ kernel_generation=implementation["kernel_generation"],
+ provenance=provenance_fields,
+ resource_profile=resource_profile,
+ resource_mode=case["resource_mode"],
+ device_product=topology["device_product"],
+ )),
+ "routing_control_sha256": routing_implementation_control_sha256(implementation),
+ "runtime_fingerprint_sha256": _sha256_json(runtime),
+ "source_sha": git_run["source_sha"] if git_run is not None else None,
+ "squash_sha256": image["squash_sha256"],
+ "workload_id": workload["workload_id"] or workload["trace_signature"],  # falls back to the trace signature
+ }
+ if series_factors != expected_series:
+ raise ContractError("raw series factors differ from measured implementation/runtime")
+ expected_allocation = {  # allocation factors recomputed from git_run and the runner
+ "artifact": git_run["artifact"] if git_run is not None else None,
+ "execution_id": allocation_factors["execution_id"],  # copied through; not cross-checked here
+ "job": git_run["job"] if git_run is not None else None,
+ "repo": git_run["repo"] if git_run is not None else None,
+ "run_attempt": git_run["run_attempt"] if git_run is not None else None,
+ "run_id": git_run["run_id"] if git_run is not None else None,
+ "runner": case["runner"],
+ "source_sha": git_run["source_sha"] if git_run is not None else None,
+ }
+ if allocation_factors != expected_allocation:
+ raise ContractError("raw allocation factors differ from provenance")
+ artifact = _keys(doc["sample_artifact"], {"bytes", "format", "path", "sha256"}, "raw.sample_artifact")
+ if artifact["format"] != SAMPLES_FORMAT or Path(artifact["path"]).name != artifact["path"]:
+ raise ContractError("raw.sample_artifact format/path is invalid")
+ if not isinstance(artifact["sha256"], str) or len(artifact["sha256"]) != 64:
+ raise ContractError("raw.sample_artifact.sha256 is invalid")
+ _integer(artifact["bytes"], "raw.sample_artifact.bytes", minimum=1)
+ outcome = _keys(doc["outcome"], {"publication_status", "reasons", "status", "validity"}, "raw.outcome")
+ if outcome["status"] not in {"success", "invalid"} or outcome["publication_status"] not in {"diagnostic", "invalid"}:
+ raise ContractError("raw outcome status is invalid")
+ if not isinstance(outcome["reasons"], list) or not all(isinstance(x, str) for x in outcome["reasons"]):
+ raise ContractError("raw outcome reasons must be strings")
+ validity = _keys(
+ outcome["validity"],
+ {"anomaly_free", "execution_status", "measurement_conformance", "provenance_complete",
+ "resource_conformance", "sampling_conformance", "semantic_correctness",
+ "workload_identity", "workload_source"},
+ "raw.outcome.validity",
+ )
+ correctness_passed = all(row["correctness"]["passed"] for row in rows)
+ workload_consistent = workload["cross_rank_consistent"] is True  # always True here; a non-True value raised earlier
+ expected_status = "success" if correctness_passed and workload_consistent else "invalid"
+ expected_publication = "diagnostic" if expected_status == "success" else "invalid"
+ if (
+ outcome["status"] != expected_status
+ or outcome["publication_status"] != expected_publication
+ or bool(outcome["reasons"]) == (expected_status == "success")  # reasons required iff not success
+ or validity["execution_status"] != "complete"
+ or validity["semantic_correctness"] != ("pass" if correctness_passed else "fail")
+ or validity["workload_identity"] != (
+ "consistent-across-ranks" if workload_consistent else "inconsistent"
+ )
+ or validity["workload_source"] != workload["source"]
+ or validity["measurement_conformance"] != "conformant"
+ or validity["sampling_conformance"] != "conformant"
+ or validity["resource_conformance"] != resource_profile["conformance_class"]
+ or validity["anomaly_free"] != (recomputed_anomalies == 0)
+ or validity["provenance_complete"] is not expected_provenance_complete
+ ):
+ raise ContractError("raw outcome differs from its measurement evidence")
+ artifact_safety.assert_publication_safe([doc])  # final gate; helper defined elsewhere
+ return doc  # the checked top-level mapping returned by _keys
+
+
+def make_terminal_document(
+    *,
+    allocation_factors: dict[str, Any],
+    attempt_ordinal: int,
+    case: dict[str, Any],
+    case_factors: dict[str, Any],
+    control_sha256: str | None,
+    failure_mode: str,
+    generated_at: str,
+    git_run: dict[str, Any],
+    reason: str,
+    return_code: int,
+    source: str,
+    status: str,
+    expected_case_id: str | None = None,
+) -> dict[str, Any]:
+    """Assemble one terminal-outcome document and return it after validation.
+
+    The case, allocation, and attempt IDs are derived from the supplied
+    factors through the ``identity`` module. When ``expected_case_id`` is
+    given it must equal the derived case ID; otherwise ``ContractError`` is
+    raised before anything else is computed. The assembled document is handed
+    to ``validate_terminal_document`` and that call's result is returned.
+    """
+    derived_case_id = identity.digest("case", case_factors)
+    if expected_case_id is not None:
+        if expected_case_id != derived_case_id:
+            raise ContractError(
+                f"scheduled case ID differs from terminal factors: {expected_case_id} != {derived_case_id}"
+            )
+    derived_allocation_id = identity.allocation_id(allocation_factors)
+    derived_attempt_id = identity.attempt_id(
+        allocation=derived_allocation_id, case=derived_case_id, ordinal=attempt_ordinal
+    )
+    # Each section is built on its own, then stitched together in the same
+    # key order the document has always used.
+    identity_section = {
+        "allocation_factors": allocation_factors,
+        "allocation_id": derived_allocation_id,
+        "attempt_id": derived_attempt_id,
+        "attempt_ordinal": attempt_ordinal,
+        "case_factors": case_factors,
+        "case_id": derived_case_id,
+    }
+    provenance_section = {
+        "git_run": git_run,
+        "control_sha256": control_sha256,
+        "redaction": "sanitized-v1",
+        "source": source,
+    }
+    outcome_section = {
+        "status": status,
+        "failure_mode": failure_mode,
+        "reason": reason,
+        "return_code": return_code,
+    }
+    return validate_terminal_document(
+        {
+            "format": TERMINAL_FORMAT,
+            "schema_version": 1,
+            "record_type": "terminal-outcome",
+            "generated_at": generated_at,
+            "identity": identity_section,
+            "case": case,
+            "provenance": provenance_section,
+            "outcome": outcome_section,
+        }
+    )
+
+
+def validate_terminal_document(document: Any) -> dict[str, Any]:
+    """Validate one terminal-outcome document and return it.
+
+    The checks run in a fixed order: native schema, top-level keys and format
+    markers, identity fields, agreement between the recorded IDs and the IDs
+    recomputed from their factors, provenance, outcome, the registered
+    source/outcome combinations, agreement between allocation factors and
+    ``git_run``, and finally the publication-safety assertion.
+
+    Returns:
+        The value produced by ``_keys`` for the top-level document.
+
+    Raises:
+        ContractError: when any check written directly in this function fails.
+            NOTE(review): the ``_validate_native_schema``, ``_keys``, ``_typed``,
+            ``_integer``, ``_text``, ``identity`` and ``artifact_safety`` helpers
+            are defined elsewhere and presumably raise on their own failures;
+            confirm their exception types.
+    """
+    _validate_native_schema("terminal-outcome-v1.schema.json", document)
+    doc = _keys(
+        document,
+        {"case", "format", "generated_at", "identity", "outcome", "provenance", "record_type",
+         "schema_version"},
+        "terminal",
+    )
+    if doc["format"] != TERMINAL_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "terminal-outcome":
+        raise ContractError("terminal format/schema/record type differs from v1")
+    ids = _keys(doc["identity"], {
+        "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal",
+        "case_factors", "case_id",
+    }, "terminal.identity")
+    for field, kind in (("allocation_id", "allocation"), ("attempt_id", "attempt"), ("case_id", "case")):
+        _typed(ids[field], kind, f"terminal.identity.{field}")
+    ordinal = _integer(ids["attempt_ordinal"], "terminal.identity.attempt_ordinal", minimum=1)
+    case = _keys(doc["case"], TERMINAL_CASE_FIELDS, "terminal.case")
+    factors = _keys(ids["case_factors"], {"case", "profile", "sku"}, "terminal.identity.case_factors")
+    # The case embedded in the identity factors must be the document's own case.
+    if factors["case"] != case or factors["profile"] != identity.V1_CASE_PROFILE:
+        raise ContractError("terminal case factors differ from the scheduled case/profile")
+    _text(factors["sku"], "terminal.identity.case_factors.sku")
+    allocation = _keys(
+        ids["allocation_factors"], ALLOCATION_FACTOR_FIELDS,
+        "terminal.identity.allocation_factors",
+    )
+    # Recompute all three IDs from the factors and require the recorded IDs to match.
+    expected_case = identity.digest("case", factors)
+    expected_allocation = identity.allocation_id(allocation)
+    expected_attempt = identity.attempt_id(
+        allocation=expected_allocation, case=expected_case, ordinal=ordinal
+    )
+    if (ids["case_id"], ids["allocation_id"], ids["attempt_id"]) != (
+        expected_case, expected_allocation, expected_attempt
+    ):
+        raise ContractError("terminal typed identities do not match their factors")
+    provenance = _keys(
+        doc["provenance"], {"git_run", "control_sha256", "redaction", "source"},
+        "terminal.provenance",
+    )
+    git_run = _keys(provenance["git_run"], GIT_RUN_FIELDS, "terminal.provenance.git_run")
+    control = provenance["control_sha256"]
+    # control_sha256 may be None; otherwise it must be 64 lowercase hex characters.
+    if control is not None and (
+        not isinstance(control, str) or len(control) != 64
+        or any(char not in "0123456789abcdef" for char in control)
+    ):
+        raise ContractError("terminal control_sha256 is invalid")
+    if provenance["redaction"] != "sanitized-v1":
+        raise ContractError("terminal redaction contract differs")
+    source = _text(provenance["source"], "terminal.provenance.source")
+    outcome = _keys(
+        doc["outcome"], {"failure_mode", "reason", "return_code", "status"}, "terminal.outcome"
+    )
+    if outcome["status"] not in {"failed", "invalid", "unsupported"}:
+        raise ContractError("terminal outcome status is invalid")
+    failure_mode = _text(outcome["failure_mode"], "terminal.outcome.failure_mode")
+    reason = _text(outcome["reason"], "terminal.outcome.reason")
+    _integer(outcome["return_code"], "terminal.outcome.return_code")
+    # Each registered source fixes the expected runner and the permitted
+    # status/failure-mode/reason combination.
+    if source == "runtime-emitter":
+        expected_runner = factors["sku"]
+        expected_reason = RUNTIME_FAILURE_REASONS.get(failure_mode)
+        valid_outcome = outcome["status"] == "failed" and reason == expected_reason
+    elif source == "post-emit-command":
+        expected_runner = factors["sku"]
+        expected_reason = POST_EMIT_FAILURE_REASONS.get(failure_mode)
+        valid_outcome = outcome["status"] == "failed" and reason == expected_reason
+    elif source == "matrix-capability-resolver":
+        expected_runner = "capability-resolver"
+        valid_outcome = (
+            outcome["status"] == "unsupported"
+            and failure_mode == "capability"
+            and reason in CAPABILITY_FAILURE_REASONS
+        )
+    else:
+        raise ContractError("terminal provenance source is not registered")
+    if not valid_outcome:
+        raise ContractError("terminal source and outcome are not registered")
+    # From here on expected_allocation is the expected factor mapping, not the
+    # allocation ID computed above. Only execution_id is taken from the
+    # allocation factors themselves; every other field must equal git_run or
+    # the source-specific runner.
+    expected_allocation = {
+        "artifact": git_run["artifact"],
+        "execution_id": allocation["execution_id"],
+        "job": git_run["job"],
+        "repo": git_run["repo"],
+        "run_attempt": git_run["run_attempt"],
+        "run_id": git_run["run_id"],
+        "runner": expected_runner,
+        "source_sha": git_run["source_sha"],
+    }
+    if allocation != expected_allocation:
+        raise ContractError("terminal allocation factors differ from provenance or source")
+    artifact_safety.assert_publication_safe([doc])
+    return doc
+
+
+def load_raw_attempt(path: str | os.PathLike[str]) -> dict[str, Any]:
+    """Load a raw attempt with its sibling sample artifact and validate both.
+
+    The sample file is looked up next to ``path`` under the name recorded in
+    ``sample_artifact.path``. Its byte length and SHA-256 digest must match
+    the recorded values before it is parsed; otherwise ``ContractError`` is
+    raised. Returns the result of ``validate_raw_document``.
+    """
+    raw_document = strict_load(path)
+    descriptor = _obj(
+        _obj(raw_document, "raw").get("sample_artifact"), "raw.sample_artifact"
+    )
+    anchor = Path(path)
+    sample_file = anchor.with_name(
+        _text(descriptor.get("path"), "raw.sample_artifact.path")
+    )
+    blob = sample_file.read_bytes()
+    # The digest is computed only when the recorded size already matches.
+    intact = (
+        len(blob) == descriptor.get("bytes")
+        and hashlib.sha256(blob).hexdigest() == descriptor.get("sha256")
+    )
+    if not intact:
+        raise ContractError("sample artifact bytes or digest differ")
+    return validate_raw_document(raw_document, strict_load(sample_file))
+
+
+def load_attempt(path: str | os.PathLike[str]) -> dict[str, Any]:
+    """Load one native attempt and validate it according to its ``format``.
+
+    Raw attempts are delegated to ``load_raw_attempt`` and terminal attempts
+    to ``validate_terminal_document``. Anything else raises ``ContractError``.
+    """
+    document = strict_load(path)
+    if isinstance(document, dict):
+        if document.get("format") == RAW_FORMAT:
+            return load_raw_attempt(path)
+        if document.get("format") == TERMINAL_FORMAT:
+            return validate_terminal_document(document)
+    raise ContractError("unknown native attempt format")
+
+
+def quarantine_invalid_attempt(path: str | os.PathLike[str]) -> bool:
+    """Move an invalid attempt and its basename-safe sample outside JSON upload globs.
+
+    Args:
+        path: Location of the attempt document to check.
+
+    Returns:
+        ``False`` when ``path`` is not a regular file or the attempt validates;
+        ``True`` after the attempt (and its sample, when one is named by a bare
+        file name and exists) has been renamed with a ``.quarantine`` suffix.
+
+    Raises:
+        OSError: if a rename fails.
+    """
+    attempt_path = Path(path)
+    if not attempt_path.is_file():
+        return False
+    try:
+        load_attempt(attempt_path)
+        return False
+    except (ContractError, OSError, ValueError):
+        # NOTE(review): main() also treats identity.IdentityError as a contract
+        # failure; confirm whether it should trigger quarantine here as well.
+        pass
+    # Best-effort re-read to find the paired sample. json.loads() on bytes
+    # raises UnicodeDecodeError for non-UTF data, which is a ValueError but not
+    # a JSONDecodeError, so the broader ValueError is caught to make sure an
+    # undecodable file is still quarantined.
+    try:
+        document = json.loads(attempt_path.read_bytes())
+    except (OSError, ValueError):
+        document = {}
+    artifact = document.get("sample_artifact") if isinstance(document, dict) else None
+    sample_name = artifact.get("path") if isinstance(artifact, dict) else None
+    # Only a bare file name is honoured, so a crafted path cannot move files
+    # outside the attempt's directory.
+    if isinstance(sample_name, str) and Path(sample_name).name == sample_name:
+        sample_path = attempt_path.with_name(sample_name)
+        if sample_path.is_file():
+            os.replace(sample_path, sample_path.with_name(sample_path.name + ".quarantine"))
+    os.replace(attempt_path, attempt_path.with_name(attempt_path.name + ".quarantine"))
+    return True
+
+
+def normalize_attempt(document: dict[str, Any]) -> dict[str, Any]:
+    """Project a validated attempt onto the flat shape the publisher consumes.
+
+    Raw attempts contribute their measurement rows, runtime fingerprint, and
+    series ID. Terminal attempts carry an empty point list and ``None`` for
+    the other two. Any other ``format`` raises ``ContractError``.
+    """
+    declared = document.get("format")
+    is_raw = declared == RAW_FORMAT
+    if not is_raw and declared != TERMINAL_FORMAT:
+        raise ContractError("unknown attempt format")
+    ids = document["identity"]
+    return {
+        "allocation_id": ids["allocation_id"],
+        "attempt_id": ids["attempt_id"],
+        "case": document["case"],
+        "case_id": ids["case_id"],
+        "generated_at": document["generated_at"],
+        "outcome": document["outcome"],
+        "points": document["measurement"]["rows"] if is_raw else [],
+        "runtime_fingerprint": document["runtime_fingerprint"] if is_raw else None,
+        "series_id": ids["series_id"] if is_raw else None,
+    }
+
+
+def _env_integer(name: str, default: int) -> int:
+    """Read environment variable ``name`` as an int, falling back to ``default``.
+
+    The fallback applies when the variable is unset and when its value cannot
+    be parsed by ``int``.
+    """
+    text = os.environ.get(name, str(default))
+    try:
+        parsed = int(text)
+    except ValueError:
+        parsed = default
+    return parsed
+
+
+def _env_enabled(name: str) -> bool:
+    """Report whether ``name`` is set to ``1``, ``true``, or ``yes`` (any letter case)."""
+    setting = os.environ.get(name, "")
+    truthy = ("1", "true", "yes")
+    return setting.lower() in truthy
+
+
+def _terminal_case_from_environment(backend: str, phase: str) -> dict[str, Any]:
+    """Rebuild the case coordinate mapping from ``CX_*`` environment variables.
+
+    Unset (and, for most string fields, empty) variables fall back to the
+    defaults written below. The token ladder default depends on ``phase``.
+    """
+    env = os.environ
+    ep = _env_integer("CX_EP", _env_integer("CX_NGPUS", 1))
+    gpus_per_node = _env_integer("CX_GPUS_PER_NODE", ep)
+    if phase == "decode":
+        fallback_ladder = "1 2 4 8 16 32 64 128"
+    else:
+        fallback_ladder = "128 256 512 1024 2048 4096"
+    ladder = env.get("CX_TOKENS_LADDER", "") or fallback_ladder
+    # iterations:trials:warmup, in that order.
+    timing = ":".join(
+        str(_env_integer(variable, fallback))
+        for variable, fallback in (("CX_ITERS", 8), ("CX_TRIALS", 64), ("CX_WARMUP", 32))
+    )
+    warmup_semantics = env.get(
+        "CX_WARMUP_SEMANTICS",
+        "full-roundtrip-before-each-component-trial-point-v1",
+    )
+    nodes = _env_integer("CX_NODES", _env_integer("SLURM_NNODES", 1))
+    return {
+        "suite": env.get("CX_SUITE") or "manual",
+        "workload": env.get("CX_WORKLOAD_NAME") or "manual",
+        "required_publication": env.get("CX_REQUIRED_PUBLICATION") or "diagnostic",
+        "backend": backend,
+        "routing": env.get("CX_ROUTING", "uniform"),
+        "phase": phase,
+        "ep": ep,
+        "eplb": _env_enabled("CX_EPLB"),
+        "hidden": _env_integer("CX_HIDDEN", 7168),
+        "topk": _env_integer("CX_TOPK", 8),
+        "experts": _env_integer("CX_EXPERTS", 256),
+        "samples_per_point": _env_integer("CX_SAMPLES_PER_POINT", 512),
+        "warmup_semantics": warmup_semantics,
+        "ladder": ladder,
+        "timing": timing,
+        "canonical": _env_enabled("CX_CANONICAL"),
+        "nodes": nodes,
+        "gpus_per_node": gpus_per_node,
+        "scale_up_domain": _env_integer("CX_SCALE_UP_DOMAIN", gpus_per_node),
+    }
+
+
+def _git_run_from_environment() -> dict[str, Any]:
+    """Collect workflow-run coordinates from the environment.
+
+    Unset and empty variables are both reported as ``None``. ``ref`` and
+    ``source_sha`` each try a preferred variable before a fallback one.
+    """
+    env = os.environ
+
+    def first_set(*names: str) -> str | None:
+        # Return the first non-empty variable among ``names``, else None.
+        for candidate in names:
+            found = env.get(candidate)
+            if found:
+                return found
+        return None
+
+    return {
+        "run_id": first_set("GITHUB_RUN_ID"),
+        "run_attempt": first_set("GITHUB_RUN_ATTEMPT"),
+        "ref": first_set("GITHUB_REF_NAME", "GITHUB_REF"),
+        "source_sha": first_set("COLLECTIVEX_SOURCE_SHA", "GITHUB_SHA"),
+        "repo": first_set("GITHUB_REPOSITORY"),
+        "job": first_set("GITHUB_JOB"),
+        "artifact": first_set("COLLECTIVEX_ARTIFACT_NAME"),
+    }
+
+
+def _allocation_factors_from_environment(
+    runner: str, git_run: dict[str, Any]
+) -> dict[str, Any]:
+    """Combine ``git_run`` fields, ``runner``, and the execution ID into allocation factors.
+
+    ``execution_id`` comes from ``COLLECTIVEX_EXECUTION_ID``; an unset or
+    empty variable becomes ``None``.
+    """
+    local_values = {
+        "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID") or None,
+        "runner": runner,
+    }
+    field_order = (
+        "artifact", "execution_id", "job", "repo",
+        "run_attempt", "run_id", "runner", "source_sha",
+    )
+    # Fields not supplied locally are copied straight from git_run.
+    return {
+        field: local_values[field] if field in local_values else git_run[field]
+        for field in field_order
+    }
+
+
+def make_terminal_from_environment(
+    *, backend: str, phase: str, return_code: int, failure_mode: str | None = None
+) -> dict[str, Any]:
+    """Create a ``runtime-emitter`` terminal document from environment state.
+
+    The failure mode is ``failure_mode`` when given, otherwise it is looked up
+    from ``return_code`` with ``"execution"`` as the fallback. A mode without a
+    registered runtime reason raises ``ContractError``. The case, run
+    coordinates, and allocation factors are all read from the environment.
+    """
+    if failure_mode:
+        mode = failure_mode
+    else:
+        mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution")
+    reason = RUNTIME_FAILURE_REASONS.get(mode)
+    if reason is None:
+        raise ContractError("runtime failure mode is not registered")
+    env = os.environ
+    runner = env.get("CX_RUNNER", "")
+    case = _terminal_case_from_environment(backend, phase)
+    git_run = _git_run_from_environment()
+    arguments = {
+        "allocation_factors": _allocation_factors_from_environment(runner, git_run),
+        "attempt_ordinal": _env_integer("CX_ATTEMPT_ID", 1),
+        "case": case,
+        "case_factors": {"case": case, "profile": identity.V1_CASE_PROFILE, "sku": runner},
+        "control_sha256": env.get("COLLECTIVEX_CONTROL_SHA256") or None,
+        "failure_mode": mode,
+        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "git_run": git_run,
+        "reason": reason,
+        "return_code": return_code,
+        "source": "runtime-emitter",
+        "status": "failed",
+        "expected_case_id": env.get("CX_CASE_ID") or None,
+    }
+    return make_terminal_document(**arguments)
+
+
+def _write_document(path: str | os.PathLike[str], document: dict[str, Any]) -> None:
+    """Write ``document`` as indented, key-sorted JSON by way of a ``.tmp`` sibling.
+
+    Parent directories are created as needed. The text goes to
+    ``<name>.tmp`` first and is then renamed onto ``path`` with ``os.replace``.
+    """
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    staging = target.with_name(f"{target.name}.tmp")
+    serialized = json.dumps(document, indent=2, sort_keys=True)
+    staging.write_text(serialized + "\n")
+    os.replace(staging, target)
+
+
+def demote_raw_attempt(path: str | os.PathLike[str], return_code: int) -> dict[str, Any]:
+    """Replace a rank-zero raw result when the distributed command later fails.
+
+    The raw attempt's own identity factors and ``git_run`` are reused to build
+    a ``post-emit-command`` terminal document. That document overwrites the
+    raw file, and the paired sample artifact is deleted when it is named by a
+    bare file name.
+
+    Args:
+        path: Location of the raw attempt document.
+        return_code: Exit status of the failed command; mapped to a failure
+            mode with ``"execution"`` as the fallback.
+
+    Returns:
+        The validated terminal document that was written.
+
+    Raises:
+        ContractError: if the file is not a native raw attempt, its identity
+            lacks the terminal factors, or the failure mode has no registered
+            post-emit reason.
+    """
+    destination = Path(path)
+    raw = strict_load(destination)
+    if not isinstance(raw, dict) or raw.get("format") != RAW_FORMAT:
+        raise ContractError("only a native raw attempt can be demoted")
+    ids = _obj(raw.get("identity"), "raw.identity")
+    required = {
+        "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal",
+        "case_factors", "case_id",
+    }
+    if not required.issubset(ids):
+        raise ContractError("raw identity lacks terminal factors")
+    # A malformed case_factors value must surface as a contract error instead
+    # of a TypeError/KeyError that main() does not catch.
+    case_factors = _obj(ids["case_factors"], "raw.identity.case_factors")
+    if "case" not in case_factors:
+        raise ContractError("raw identity lacks terminal factors")
+    mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution")
+    # Mirror the runtime emitter: an unregistered mode is a contract error,
+    # not an uncaught KeyError.
+    reason = POST_EMIT_FAILURE_REASONS.get(mode)
+    if reason is None:
+        raise ContractError("post-emit failure mode is not registered")
+    git_run = _obj(_obj(raw.get("provenance"), "raw.provenance").get("git_run"), "raw.provenance.git_run")
+    terminal = make_terminal_document(
+        allocation_factors=ids["allocation_factors"],
+        attempt_ordinal=ids["attempt_ordinal"],
+        case=case_factors["case"],
+        case_factors=case_factors,
+        control_sha256=os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None,
+        failure_mode=mode,
+        generated_at=dt.datetime.now(dt.timezone.utc).isoformat(),
+        git_run=git_run,
+        reason=reason,
+        return_code=return_code,
+        source="post-emit-command",
+        status="failed",
+        expected_case_id=ids["case_id"],
+    )
+    # Tolerate a non-mapping sample_artifact: there is simply nothing to delete.
+    artifact = raw.get("sample_artifact")
+    sample_name = artifact.get("path") if isinstance(artifact, dict) else None
+    # Only a bare file name is honoured, so nothing outside the attempt's
+    # directory can be deleted.
+    if isinstance(sample_name, str) and Path(sample_name).name == sample_name:
+        destination.with_name(sample_name).unlink(missing_ok=True)
+    _write_document(destination, terminal)
+    return terminal
+
+
+def validate_attempt_paths(paths: list[str]) -> int:
+    """Fully validate a result directory's attempts and paired sample artifacts.
+
+    Args:
+        paths: Result files to validate: raw attempts, terminal attempts, and
+            sample artifacts.
+
+    Returns:
+        The number of raw plus terminal attempts validated.
+
+    Raises:
+        ContractError: if ``paths`` is empty or names the same file twice, a
+            file has an unknown format, the sample files supplied differ from
+            the samples referenced by raw attempts, or no attempt is present.
+    """
+    # Resolve before checking uniqueness so that two spellings of one file
+    # ("a.json" and "./a.json") cannot be validated and counted twice.
+    resolved_paths = [Path(raw_path).resolve() for raw_path in paths]
+    if not resolved_paths or len(resolved_paths) != len(set(resolved_paths)):
+        raise ContractError("validate-many requires unique result paths")
+    sample_paths: set[Path] = set()
+    referenced_samples: set[Path] = set()
+    attempt_count = 0
+    for path in resolved_paths:
+        document = strict_load(path)
+        if isinstance(document, dict) and document.get("format") == RAW_FORMAT:
+            document = load_raw_attempt(path)
+            referenced_samples.add(path.with_name(document["sample_artifact"]["path"]))
+            attempt_count += 1
+        elif isinstance(document, dict) and document.get("format") == TERMINAL_FORMAT:
+            validate_terminal_document(document)
+            attempt_count += 1
+        elif isinstance(document, dict) and document.get("format") == SAMPLES_FORMAT:
+            validate_samples_document(document)
+            sample_paths.add(path)
+        else:
+            raise ContractError(f"unknown result artifact {path.name}")
+    # Every supplied sample must be referenced by a raw attempt, and vice versa.
+    if sample_paths != referenced_samples:
+        raise ContractError("sample artifacts are missing, orphaned, or outside the validated set")
+    if attempt_count == 0:
+        raise ContractError("result set contains no native attempts")
+    return attempt_count
+
+
+def validate_delivery(
+    paths: list[str], source_path: str, *, disposition: str | None = None
+) -> int:
+    """Reconcile a shard or matrix disposition with its complete native attempt set.
+
+    Args:
+        paths: Result files belonging to the delivery.
+        source_path: The matrix document (``collectivex.matrix.v1``) or shard
+            control document (a mapping with a ``cases`` list) that scheduled
+            the cases.
+        disposition: Required for a matrix source; selects the
+            ``requested_cases`` entries with that disposition.
+
+    Returns:
+        The number of raw plus terminal attempts in the delivery.
+
+    Raises:
+        ContractError: if the source is unusable, coverage is empty,
+            duplicated, incomplete or contains extra cases, an attempt differs
+            from its scheduled case or from the workflow environment, a
+            terminal outcome does not reference the source document's digest,
+            attempt ordinals are not contiguous, or a delivery that must share
+            one allocation does not.
+    """
+    source_file = Path(source_path).resolve()
+    source = strict_load(source_file)
+    if isinstance(source, dict) and source.get("format") == "collectivex.matrix.v1":
+        if disposition is None:
+            raise ContractError("matrix delivery validation requires a disposition")
+        wrappers = [
+            item for item in source.get("requested_cases", [])
+            if isinstance(item, dict) and item.get("disposition") == disposition
+        ]
+        expected = {
+            item["case"]["case_id"]: (item["sku"], item["case"])
+            for item in wrappers
+        }
+        expected_count = len(wrappers)
+        require_one_allocation = disposition == "unsupported"
+    elif isinstance(source, dict) and isinstance(source.get("cases"), list):
+        expected = {
+            case["case_id"]: (source.get("sku"), case)
+            for case in source["cases"]
+        }
+        expected_count = len(source["cases"])
+        require_one_allocation = True
+    else:
+        raise ContractError("delivery source is not a matrix or shard control")
+    # A shorter mapping than the case list means two cases shared a case_id.
+    if not expected or len(expected) != expected_count:
+        raise ContractError("delivery source has empty or duplicate case coverage")
+
+    validate_attempt_paths(paths)
+    attempts = []
+    for raw_path in paths:
+        document = strict_load(raw_path)
+        if isinstance(document, dict) and document.get("format") in {RAW_FORMAT, TERMINAL_FORMAT}:
+            attempts.append(load_attempt(raw_path))
+
+    def workflow_value(name: str) -> str | None:
+        # Emitters record unset and empty variables as None; do the same here
+        # so an empty variable is "unknown" rather than an expected "".
+        return os.environ.get(name) or None
+
+    # Loop-invariant: the workflow coordinates every attempt must agree with.
+    expected_environment = {
+        "artifact": workflow_value("COLLECTIVEX_ARTIFACT_NAME"),
+        "execution_id": workflow_value("COLLECTIVEX_EXECUTION_ID"),
+        "job": workflow_value("GITHUB_JOB"),
+        "repo": workflow_value("GITHUB_REPOSITORY"),
+        "run_attempt": workflow_value("GITHUB_RUN_ATTEMPT"),
+        "run_id": workflow_value("GITHUB_RUN_ID"),
+        "source_sha": workflow_value("COLLECTIVEX_SOURCE_SHA") or workflow_value("GITHUB_SHA"),
+    }
+    by_case: dict[str, list[dict[str, Any]]] = {}
+    attempt_ids = set()
+    allocation_ids = set()
+    source_sha256 = hashlib.sha256(source_file.read_bytes()).hexdigest()
+    for document in attempts:
+        ids = document["identity"]
+        case_id = ids["case_id"]
+        if case_id not in expected or ids["attempt_id"] in attempt_ids:
+            raise ContractError("delivery contains an extra case or duplicate attempt")
+        attempt_ids.add(ids["attempt_id"])
+        allocation_ids.add(ids["allocation_id"])
+        sku, scheduled = expected[case_id]
+        # Case factors carry the scheduled case without its own case_id.
+        scheduled_case = {key: value for key, value in scheduled.items() if key != "case_id"}
+        if ids["case_factors"] != {
+            "case": scheduled_case, "profile": identity.V1_CASE_PROFILE, "sku": sku
+        }:
+            raise ContractError("delivery attempt differs from its scheduled case")
+        factors = ids["allocation_factors"]
+        expected_runner = (
+            "capability-resolver"
+            if document["format"] == TERMINAL_FORMAT
+            and document["provenance"]["source"] == "matrix-capability-resolver"
+            else sku
+        )
+        # Only coordinates the environment actually provides are compared.
+        if any(
+            value is not None and factors[field] != value
+            for field, value in expected_environment.items()
+        ) or factors["runner"] != expected_runner:
+            raise ContractError("delivery allocation factors differ from the workflow")
+        if document["format"] == TERMINAL_FORMAT:
+            control = document["provenance"]["control_sha256"]
+            if control != source_sha256:
+                raise ContractError("terminal outcome does not reference its exact control document")
+        by_case.setdefault(case_id, []).append(document)
+    if set(by_case) != set(expected):
+        raise ContractError("delivery case coverage is incomplete")
+    for case_id, documents in by_case.items():
+        ordinals = sorted(document["identity"]["attempt_ordinal"] for document in documents)
+        if ordinals != list(range(1, len(ordinals) + 1)):
+            raise ContractError(f"delivery attempt ordinals are not contiguous for {case_id}")
+    if require_one_allocation and len(allocation_ids) != 1:
+        raise ContractError("one shard must use exactly one allocation identity")
+    return len(attempts)
+
+
+def main() -> int:
+    """Parse the command line, run the chosen subcommand, and return an exit status.
+
+    Returns 0 on success. Contract, identity, OS, and value errors are printed
+    to stderr and reported as 1. ``probe --status`` also returns 1 when the
+    attempt is not a complete raw attempt with the requested status.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX native attempt contracts")
+    commands = parser.add_subparsers(dest="command", required=True)
+
+    probe_cmd = commands.add_parser("probe")
+    probe_cmd.add_argument("path")
+    probe_cmd.add_argument("--status", choices=("success", "invalid"))
+
+    emit_cmd = commands.add_parser("emit-terminal")
+    emit_cmd.add_argument("--out", required=True)
+    emit_cmd.add_argument("--backend", required=True)
+    emit_cmd.add_argument("--phase", required=True, choices=("decode", "prefill"))
+    emit_cmd.add_argument("--return-code", required=True, type=int)
+    emit_cmd.add_argument("--failure-mode")
+
+    demote_cmd = commands.add_parser("demote")
+    demote_cmd.add_argument("path")
+    demote_cmd.add_argument("--return-code", required=True, type=int)
+
+    many_cmd = commands.add_parser("validate-many")
+    many_cmd.add_argument("paths", nargs="+")
+
+    quarantine_cmd = commands.add_parser("quarantine-invalid")
+    quarantine_cmd.add_argument("path")
+
+    delivery_cmd = commands.add_parser("validate-delivery")
+    delivery_cmd.add_argument("--source", required=True)
+    delivery_cmd.add_argument("--disposition")
+    delivery_cmd.add_argument("paths", nargs="+")
+
+    args = parser.parse_args()
+
+    def run() -> int:
+        # Execute the selected subcommand; errors propagate to the handler below.
+        selected = args.command
+        if selected == "probe":
+            attempt = load_attempt(args.path)
+            if args.status is None:
+                return 0
+            if attempt.get("format") != RAW_FORMAT:
+                return 1
+            outcome = attempt["outcome"]
+            validity = outcome.get("validity")
+            matches = (
+                isinstance(validity, dict)
+                and validity.get("execution_status") == "complete"
+                and outcome.get("status") == args.status
+            )
+            return 0 if matches else 1
+        if selected == "emit-terminal":
+            terminal = make_terminal_from_environment(
+                backend=args.backend,
+                phase=args.phase,
+                return_code=args.return_code,
+                failure_mode=args.failure_mode,
+            )
+            _write_document(args.out, terminal)
+            print(f"preserved terminal outcome ({terminal['outcome']['failure_mode']})")
+        elif selected == "validate-many":
+            print(f"validated {validate_attempt_paths(args.paths)} native attempts")
+        elif selected == "quarantine-invalid":
+            quarantine_invalid_attempt(args.path)
+        elif selected == "validate-delivery":
+            delivered = validate_delivery(
+                args.paths, args.source, disposition=args.disposition
+            )
+            print(f"validated {delivered} delivery attempts")
+        else:
+            # The only remaining subcommand is "demote".
+            demote_raw_attempt(args.path, args.return_code)
+        return 0
+
+    try:
+        return run()
+    except (ContractError, identity.IdentityError, OSError, ValueError) as exc:
+        print(f"terminal contract error: {exc}", file=sys.stderr)
+        return 1
+
+
+# Script entry point: exit with the status returned by main().
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md
new file mode 100644
index 0000000000..f68ef89a5c
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology.md
@@ -0,0 +1,256 @@
+# CollectiveX EP v1 Contract
+
+
+
+**English** | [中文](./methodology_zh.md)
+
+
+
+This document defines the contract for new CollectiveX results. Historical run notes are evidence, not contract.
+
+## Product Boundary
+
+CollectiveX is a communication microbenchmark for:
+
+- comparing EP libraries on one chip/topology;
+- comparing EP latency and logical payload bandwidth across systems under the same workload; and
+- exposing unsupported, failed, invalid, and unstable evidence without contaminating decisions.
+
+It does not predict serving throughput without a separate correlation study.
+
+## Matrix
+
+The promoted workload is `deepseek-v3-v1`: hidden 7168, top-k 8, 256 routed experts, BF16 dispatch
+and combine, normal mode, packed placement, backend-tuned resources, and
+`layout-and-dispatch-v1`.
+
+- `ep-core-v1`: uniform routing; decode T=1..128 powers of two; prefill T=256/512.
+- `ep-routing-v1`: Zipf with EPLB off/on; decode T=128; prefill T=512.
+- Current runnable surface: 38 cells, 228 cases, 532 token points before repeats.
+
+Unsupported combinations are terminal outcomes, not silently skipped coverage. DeepEP V2 is the
+`ElasticBuffer` introduced by PR #605, pinned with upstream PR #630's minimal pure-scale-up fix.
+Current V2 cases request NCCL Device API LSA because their world size does not exceed the declared
+scale-up domain, then fail closed unless NCCL's realized LSA team covers the full EP world. GIN is
+reserved for a separately identified true scale-out cohort. Source-declared NVIDIA capabilities
+remain unvalidated until GPU outcomes pass the native oracle and publisher gates. Removed axes
+include `[cl]`, `[rv]`, LL,
+quantization, alternate activation/routing profiles,
+uneven allocation, placement permutations, model envelopes, and scaling.
+FlashInfer is excluded from v1 after repeatable intermittent execution failures; those failures are
+not converted into planned-unsupported coverage.
+MoRI AsyncLL and intranode paths publish distinct kernel generations rather than masquerading as the
+same controlled implementation in cross-chip cohorts.
+
+## Workload Identity
+
+One canonical workload is generated over the global token batch and sliced by source rank. Expert
+indices and gate weights are serialized. Activations use a versioned integer counter formula whose
+BF16 values are exact across runtimes; its full identity is bound into the manifest. The manifest
+also binds shape/EP coordinates and oracle version. SHA-256 covers canonical bytes and parameters;
+library RNG regeneration is not proof of identity.
+
+Routing traffic distinguishes:
+
+- token-expert assignments, which determine expert compute load; and
+- rank-deduplicated token payload copies, which determine EP activation traffic.
+
+Adapters may not generate routing or reinterpret one quantity as the other.
+
+## Measurement
+
+`layout-and-dispatch-v1` times dispatch layout plus communication. Expert-output staging is outside
+isolated combine timing and inside measured paired roundtrip. Each component declares availability,
+origin, start/end states, stage scope, and sample count. A paired-only API reports null isolated
+components. Combine is activation-only for every adapter: dispatch gate weights are verified but are
+not returned over the timed combine path. `isolated_sum` is derived and never used for throughput or
+recommendations.
+
+Every measured component uses `fixed-512-v1`:
+
+- 64 trials x 8 timed iterations = 512 observations;
+- 32 synchronized full dispatch-stage-combine warmups before each available measured component at
+ every trial/point;
+- roundtrip first, then isolated dispatch and combine, with a fixed per-phase conditioning ladder; and
+- per-iteration maximum latency across ranks before nearest-rank p50/p90/p95/p99.
+
+Measured roundtrip p99 is the headline latency. Retries remain separate attempts; a later success
+does not erase earlier failures. Decode and prefill identify the serving regime represented by one
+MoE-layer collective; they do not change the timed primitive at an otherwise identical shape.
+
+The NCCL/RCCL reference is an end-to-end Python adapter, not a bare fabric primitive. Its dispatch
+boundary includes layout, count exchange, a device-to-host split synchronization, fresh receive
+allocation, and four payload/metadata all-to-all calls; activation-only combine adds one all-to-all plus
+scatter/reduction. Its p99 therefore measures the complete reference-adapter boundary and can be
+host/scheduler-sensitive. It is useful for portable system controls but must not be labeled fabric,
+link, bus, or single-collective latency.
+
+The versioned conditioning and EPLB planner contracts (reference trace, redundant count, and
+placement/remap version) are part of scheduled and evidence identity.
+
+Logical payload bandwidth is:
+
+`logical_payload_bytes / measured_latency_seconds`
+
+Payload bytes use rank-deduplicated activations plus required scale bytes at the named boundary.
+They exclude expert metadata, padding, and backend buffer capacity. Algorithm bandwidth, bus
+bandwidth, wire utilization, and physical-link utilization are not published without a defined
+primitive model or transport counters. Logical bandwidth must never be labeled physical bandwidth.
+Published payload and token rates are named `rate_at_latency_percentile`: bytes or tokens divided by
+the matching latency percentile. They are lower-tail service rates at p99 latency, not the 99th
+percentile of an inverted rate distribution.
+
+## Correctness
+
+An implementation-independent oracle uses an expert-specific deterministic transform so wrong
+expert routing cannot pass an identity roundtrip. For every rank and point it verifies:
+
+1. destination rank/expert, source token, multiplicity, gate weight, and receive counts;
+2. dispatched payload and metadata before timing;
+3. combined output before timing;
+4. unchanged semantic inputs through all timed samples; and
+5. dispatched payload/metadata and combined output again after timing.
+
+Every v1 adapter uses activation-only, unweighted rank-sum combine. The oracle builds each rank's
+gate-weighted expert aggregate before combine, independently derives `sum(gate * expert(token))`,
+and checks every element with recorded `rtol=0.05` and `atol=0.02`. Any failed rank or point makes
+the case ineligible.
+Pre/post dispatch evidence is hashed in canonical source-token order. Native receive slots may be
+assigned nondeterministically, so physical receive order is not treated as a correctness property.
+
+## Native Result
+
+One raw case document uses `format: "collectivex.ep.v1"`, rejects unknown fields, and contains:
+
+- `case`: stable case ID, suite, required tier, and coordinate;
+- `workload`: canonical identity and logical MoE shape;
+- `measurement`: sampling, component states, timing, and byte accounting;
+- `implementation`: instantiated class/API, pinned source, loaded libraries, and resources;
+- `topology`: requested and realized SKU, devices, placement, scale-up domain, and transport;
+- `provenance`: source SHA, image/squash hashes, allocation, run, and attempt;
+- `rows`: point latency, byte accounting, token rate, correctness, load, fanout, and anomaly evidence; and
+- `outcome`: raw status `success` or `invalid`, published as `diagnostic` or `invalid`, with reasons; `failed` and `unsupported` attempts use terminal-outcome documents instead.
+
+Raw result documents and exact samples pass through transient GitHub delivery artifacts before the
+publisher archives them in the private bundle; they never enter the public tree. Private environment
+details remain in local mode-0600 logs and ignored operator notes; they are never archived or
+published. Every expected case has exactly one selected terminal outcome, and every attempt is retained.
+
+## Identity And Comparisons
+
+Canonical JSON produces three full SHA-256 IDs:
+
+- `series_id`: all locked factors except token coordinate and repeat allocation;
+- `point_id`: `series_id` plus token coordinate; and
+- `evidence_id`: `point_id` plus allocation/run/attempt/sample checksum.
+
+Locked factors include workload bytes, measurement and sampling contract, resources, realized
+topology, implementation/build, loaded libraries, image/squash, runtime, and source SHA.
+Deferred code generation is captured before measurement and recaptured afterward. DeepEP V2 uses a
+fixed NVCC random seed and binds final cache keys plus generated-source and executable-SASS hashes;
+raw CUBIN bytes remain private diagnostics. Hybrid binds its realized auto-tuned config and complete
+kernel-key set while retaining rank-local shared-object hashes as private diagnostics. Locally built
+extension hashes are diagnostic; their pinned source trees, build recipe, runtime, and dependencies
+remain series-bound.
+The series identity includes the case ID, which binds the complete scheduled token ladder and the
+frozen percentile, rank-reduction, conditioning, warmup, and correctness semantics.
+
+A controlled comparison declares one contrast:
+
+- `library`: backend implementation and its tuned resource profile may differ; the realized system,
+ workload, EP, resource policy, source, and measurement remain matched;
+- `chip`: a controlled platform contrast. The full realized system/topology and tuned resource
+ profile may differ while workload, EP, placement class, resource policy, backend lineage, source,
+ and measurement remain matched. It is not a silicon-only comparison;
+- `system`: all hardware/backend differences stay visible while workload, EP, and measurement match;
+- `routing`: routing distribution/EPLB differs while the static implementation build/generator,
+ system, model shape, resource profile, and measurement remain matched. Uniform and Zipf without
+ EPLB reuse the same generated implementation; EPLB's physical-expert/JIT configuration remains an
+ explicit treatment difference.
+
+Any undeclared mismatch rejects the overlay. Chip/system results describe measured systems, not
+silicon alone.
+
+## Evidence Policy
+
+Capability declarations say what may be attempted; artifacts determine evidence status. Promotion
+requires exact expected coverage with no missing, extra, duplicate, malformed, or heterogeneous
+case. Public coverage preserves each matrix disposition; promotion requires every runnable case to
+succeed and every planned-unsupported case to remain unsupported in every selected run. Only the
+pinned canonical full-v1 matrix, with a decision-grade library, chip, system, and routing cohort,
+may advance `dev-latest`; partial matrices remain diagnostic. The full-matrix digest intentionally
+pins the exact workflow shard grouping as well as the requested cases, so changing `--max-cases`
+or the SKU round-robin scheduling order produces diagnostic-only runs even when case coverage is
+unchanged. Superseded retries,
+planned-unsupported outcomes, and unstable comparison cohorts may render diagnostically but cannot
+rank or recommend; every successful required series in a promoted dataset remains decision-grade.
+Any failed, invalid, or diagnostic retry of a runnable case blocks promotion even if a later retry
+succeeds. Routing cohorts are comparable-experimental sensitivities and never produce configuration
+recommendations; official library/platform/system cohorts own actionable recommendations.
+
+A point becomes decision-grade only after three independent workflow runs, each with its own
+allocation ID, pass correctness, identity, provenance, tail gates, p50/p99 repeat-stability
+thresholds, and stable ordering. The publisher, not the frontend, computes eligibility, controlled
+cohorts, sensitivity pairs, and recommendations.
+
+## Isolated Artifact Store
+
+Development uses one self-hosted persistent filesystem. There is no Vercel storage, GCP, Neon,
+managed database, or managed object store.
+
+```text
+$COLLECTIVEX_STORE_ROOT/
+ private/incoming/ # write-once downloaded GHA attempts
+ private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums
+ private/quarantine/ # rejected attempts plus machine-readable reasons
+ public/datasets/<dataset-digest>/ # immutable sanitized frontend datasets
+ public/channels/ # small atomic pointers: latest-attempt, dev-latest
+ locks/
+```
+
+Private and public trees use separate permissions. JSON manifests and checksums are authoritative;
+a rebuildable catalog is only an index. GitHub artifacts are transient delivery input.
+
+Container tags are checked against pinned registry digests. Enroot imports use a fixed
+`SOURCE_DATE_EPOCH` and versioned cache generation; every mounted squash is freshly hashed into
+series identity. Image-provided DeepEP is also checked against exact per-architecture wheel and
+installed-file fingerprints, so a stale cache cannot inherit the pinned source identity.
+Source-built DeepEP V2 uses a separate mode-0700 cluster-local cache mounted only as `/cx-cache`.
+Its content key binds a versioned build recipe, verified image digest, CPU/GPU architecture,
+upstream source trees, and pinned build dependencies. The cache is never an artifact or publisher
+input; per-execution source/results stages remain isolated and disposable, and marker plus runtime
+probes fail closed before reuse. The runner UID is inside the trusted cluster boundary: this cache
+guards against stale or accidental mutation, not hostile same-UID jobs. Only an unpublished partial
+build may be reset automatically; a published cache that fails integrity or runtime checks is left
+intact and rejected so a concurrent allocation cannot lose files it is using.
+
+Publication is fail-closed:
+
+1. acquire an exclusive filesystem lock and stage on the destination filesystem;
+2. archive source bytes before parsing;
+3. require the exact matrix-declared artifact set and reject every unconsumed archive member;
+4. validate strict schemas, privacy, checksums, identities, timing, and exact matrix outcomes;
+5. write checksums and `COMPLETE`, fsync, then atomically rename the private bundle;
+6. build and validate the sanitized content-addressed dataset, fsync, then atomically rename it;
+7. atomically replace `dev-latest.json` only when every promotion gate passes.
+
+Rejected attempts may update `latest-attempt` but never `dev-latest`. Channel responses use
+`no-cache`; immutable datasets use content hashes and long-lived caching. A same-host read-only HTTP
+route in the InferenceX frontend exposes only the two channel documents and digest-addressed
+datasets under `public/`; it rejects incomplete objects, directory listing, and client-supplied
+filesystem paths.
+
+`publisher.py ingest` accepts the exact matrix plus one `--artifact` directory or ZIP per GitHub
+artifact. `promote` accepts explicit immutable bundle IDs. Default `verify` requires
+`latest-attempt`; it also verifies `dev-latest` when present, while an explicit
+`--channel dev-latest` requires it. The frontend process receives the same absolute,
+non-symlinked `COLLECTIVEX_STORE_ROOT` and performs the only HTTP serving.
+
+The frontend fetches the channel pointer, validates it at runtime, resolves the immutable dataset,
+verifies its digest/format, and renders terminal coverage. It never invents missing values, selects
+retries, or recomputes decision eligibility.
+
+## Legacy Data
+
+Numeric schemas 3-5 are outside the v1 publisher and frontend reader. They remain historical
+diagnostic evidence and cannot seed `dev-latest` or drive v1 decisions.
diff --git a/experimental/CollectiveX/docs/methodology_zh.md b/experimental/CollectiveX/docs/methodology_zh.md
new file mode 100644
index 0000000000..c9124dc4e4
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology_zh.md
@@ -0,0 +1,247 @@
+# CollectiveX EP v1 契约
+
+
+
+[English](./methodology.md) | **中文**
+
+
+
+本文档定义新的 CollectiveX 结果。历史运行笔记是 evidence,不是 contract。
+
+## 产品边界
+
+CollectiveX 是通信 microbenchmark,用于:
+
+- 在同一 chip/topology 上比较 EP libraries;
+- 在相同 workload 下比较不同系统的 EP latency 和 logical payload bandwidth;
+- 展示 unsupported、failed、invalid 和 unstable evidence,同时避免污染决策。
+
+若没有单独的 correlation study,它不能预测 serving throughput。
+
+## 矩阵
+
+Promoted workload 为 `deepseek-v3-v1`:hidden 7168、top-k 8、256 routed experts、BF16
+dispatch 和 combine、normal mode、packed placement、backend-tuned resources,以及
+`layout-and-dispatch-v1`。
+
+- `ep-core-v1`:uniform routing;decode T=1..128 的 2 次幂;prefill T=256/512。
+- `ep-routing-v1`:Zipf,EPLB off/on;decode T=128;prefill T=512。
+- 当前可运行范围:38 cells、228 cases、重复前 532 token points。
+
+Unsupported combinations 是 terminal outcomes,不会被静默跳过。DeepEP V2 指 PR #605
+引入的 `ElasticBuffer`,并固定使用 upstream PR #630 的最小纯 scale-up 修复。当前 V2 cases
+的 world size 均未超过声明的 scale-up domain,因此请求 NCCL Device API LSA;若 NCCL
+实际建立的 LSA team 未覆盖整个 EP world,则直接失败。GIN 只用于单独标识的真正 scale-out
+cohort。其 source 声明的 NVIDIA capabilities 在 GPU outcomes 通过 native oracle 和 publisher
+gates 前仍为 unvalidated。已移除的轴包括 `[cl]`、`[rv]`、LL、
+quantization、alternate activation/routing profiles、uneven allocation、placement
+permutations、model envelopes 和 scaling。
+FlashInfer 因可重复出现的间歇性执行失败而排除在 v1 外;这些失败不会转为 planned-unsupported
+coverage。
+MoRI AsyncLL 和 intranode paths 发布不同的 kernel generations,不会在 cross-chip cohorts 中
+伪装成相同的 controlled implementation。
+
+## Workload 身份
+
+一个 canonical workload 在 global token batch 上生成,再按 source rank 切分。Expert indices
+和 gate weights 会序列化。Activations 使用带版本的整数计数器公式,其 BF16 值在不同 runtime
+中精确一致;完整身份绑定到 manifest。Manifest 还绑定 shape/EP coordinates 和 oracle version。
+SHA-256 覆盖 canonical bytes 和 parameters;重新生成 library RNG 不能证明身份一致。
+
+Routing traffic 区分:
+
+- token-expert assignments,决定 expert compute load;
+- rank-deduplicated token payload copies,决定 EP activation traffic。
+
+Adapters 不得生成 routing,也不得将两种量相互解释。
+
+## 测量
+
+`layout-and-dispatch-v1` 计时 dispatch layout 加 communication。Expert-output staging 不计入
+isolated combine timing,但计入被测 paired roundtrip。每个 component 声明 availability、origin、
+start/end states、stage scope 和 sample count。仅有 paired API 时,isolated components 报 null。
+所有 adapter 的 combine 均采用 activation-only 边界:dispatch gate weights 会接受校验,但不会
+通过被测 combine 路径返回。`isolated_sum` 为派生值,不用于 throughput 或 recommendations。
+
+每个被测 component 均使用 `fixed-512-v1`:
+
+- 64 trials x 8 timed iterations = 512 observations;
+- 每个 trial/point 的每个可用被测 component 前,执行 32 次同步完整
+ dispatch-stage-combine warmups;
+- 先测 roundtrip,再测 isolated dispatch 和 combine,并使用固定的 per-phase conditioning ladder;
+- 每次 iteration 先取跨 rank 最大 latency,再以 nearest-rank 计算 p50/p90/p95/p99。
+
+被测 roundtrip p99 是 headline latency。Retries 保持为独立 attempts;后续成功不会抹除早期失败。
+Decode 和 prefill 表示一个 MoE-layer collective 所代表的 serving regime;在其他 shape 相同时,
+它们不会改变 timed primitive。
+
+NCCL/RCCL reference 是 end-to-end Python adapter,而不是 bare fabric primitive。其 dispatch
+boundary 包含 layout、count exchange、device-to-host split synchronization、fresh receive
+allocation,以及四次 payload/metadata all-to-all;activation-only combine 还包含一次 all-to-all 和
+scatter/reduction。因此其 p99 测量完整 reference-adapter boundary,可能对 host/scheduler 敏感。
+它可作为 portable system control,但不得标记为 fabric、link、bus 或 single-collective latency。
+
+带版本的 conditioning 和 EPLB planner contracts(reference trace、redundant count 和
+placement/remap version)属于 scheduled 和 evidence identity。
+
+Logical payload bandwidth 为:
+
+`logical_payload_bytes / measured_latency_seconds`
+
+Payload bytes 使用命名边界上的 rank-deduplicated activations 加必需 scale bytes,不包含 expert
+metadata、padding 和 backend buffer capacity。若没有定义 primitive model 或 transport counters,
+不发布 algorithm bandwidth、bus bandwidth、wire utilization 或 physical-link utilization。
+Logical bandwidth 绝不能标为 physical bandwidth。已发布 payload 和 token rates 命名为
+`rate_at_latency_percentile`:bytes 或 tokens 除以对应 latency percentile。它们是 p99 latency
+下的 lower-tail service rates,不是 inverted rate distribution 的 p99 percentiles。
+
+## 正确性
+
+与实现无关的 oracle 使用 expert-specific deterministic transform,使错误 expert routing 无法
+通过 identity roundtrip。它对每个 rank 和 point 验证:
+
+1. destination rank/expert、source token、multiplicity、gate weight 和 receive counts;
+2. timing 前的 dispatched payload 和 metadata;
+3. timing 前的 combined output;
+4. 所有 timed samples 期间 semantic inputs 不变;
+5. timing 后再次验证 dispatched payload/metadata 和 combined output。
+
+v1 的所有 adapter 均使用 activation-only、unweighted rank-sum combine。Oracle 在 combine 前
+构造每个 rank 的 gate-weighted expert aggregate,独立计算 `sum(gate * expert(token))`,
+并使用已记录的 `rtol=0.05` 和 `atol=0.02` 检查每个 element。任一 rank 或
+point 失败都会使 case 不合格。Pre/post dispatch evidence 按
+canonical source-token order 计算 hash。Native receive slots 可能非确定性分配,因此 physical
+receive order 不作为 correctness property。
+
+## Native 结果
+
+单个 raw case document 使用 `format: "collectivex.ep.v1"`,拒绝未知 fields,并包含:
+
+- `case`:稳定 case ID、suite、required tier 和 coordinate;
+- `workload`:canonical identity 和 logical MoE shape;
+- `measurement`:sampling、component states、timing 和 byte accounting;
+- `implementation`:实例化 class/API、固定 source、loaded libraries 和 resources;
+- `topology`:requested 和 realized SKU、devices、placement、scale-up domain 和 transport;
+- `provenance`:source SHA、image/squash hashes、allocation、run 和 attempt;
+- `rows`:point latency、byte accounting、token rate、correctness、load、fanout 和 anomaly evidence;
+- `outcome`:`success`、`failed`、`invalid`、`diagnostic` 或 `unsupported`,以及 reasons。
+
+Raw result documents 和 exact samples 会先经过临时 GitHub delivery artifacts,再由 publisher
+归档到 private bundle;它们不会进入 public tree。Private environment details 只保留在本地
+mode-0600 logs 和忽略的 operator notes 中;不会归档或发布。每个 expected case 有一个 terminal
+selected outcome,同时保留每次 attempt。
+
+## 身份与比较
+
+Canonical JSON 生成三个完整 SHA-256 IDs:
+
+- `series_id`:除 token coordinate 和 repeat allocation 外的所有 locked factors;
+- `point_id`:`series_id` 加 token coordinate;
+- `evidence_id`:`point_id` 加 allocation/run/attempt/sample checksum。
+
+Locked factors 包括 workload bytes、measurement 和 sampling contract、resources、realized
+topology、implementation/build、loaded libraries、image/squash、runtime 和 source SHA。
+Deferred code generation 会在 measurement 前捕获,并在之后再次捕获。DeepEP V2 使用固定的
+NVCC random seed,并绑定最终 cache keys、generated-source hashes 与 executable-SASS hashes;
+raw CUBIN bytes 仅保留为 private diagnostics。Hybrid 绑定实际自动调优配置与完整 kernel-key
+set,同时将各 rank 的 shared-object hashes 仅保留为 private diagnostics。本地构建的 extension
+hashes 属于 diagnostic;其固定 source trees、build recipe、runtime 与 dependencies 仍绑定到
+series。
+Series identity 包含 case ID;case ID 绑定完整 scheduled token ladder,以及固定的 percentile、
+rank-reduction、conditioning、warmup 和 correctness semantics。
+
+Controlled comparison 只声明一个 contrast:
+
+- `library`:backend implementation 及其 tuned resource profile 可以不同;realized system、
+ workload、EP、resource policy、source 和 measurement 必须匹配;
+- `chip`:受控 platform contrast。完整 realized system/topology 和 tuned resource profile 可以不同,
+ 但 workload、EP、placement class、resource policy、backend lineage、source 和 measurement 必须
+ 匹配。它不是 silicon-only comparison;
+- `system`:保留所有 hardware/backend 差异,同时匹配 workload、EP 和 measurement;
+- `routing`:routing distribution/EPLB 可以不同,但 static implementation build/generator、system、
+ model shape、resource profile 和 measurement 必须匹配。未启用 EPLB 的 Uniform 和 Zipf 复用
+ 同一 generated implementation;EPLB 的 physical-expert/JIT configuration 是显式 treatment
+ difference。
+
+任何未声明的 mismatch 都会拒绝 overlay。Chip/system results 描述 measured systems,而非仅描述
+silicon。
+
+## Evidence 策略
+
+Capability declarations 说明可以尝试什么;artifacts 决定 evidence status。Promotion 要求完整的
+expected coverage,不能有 missing、extra、duplicate、malformed 或 heterogeneous case。Public
+coverage 保留每个 matrix disposition;promotion 要求每个 runnable case 在所有 selected runs 中
+成功,且每个 planned-unsupported case 始终为 unsupported。只有固定 canonical full-v1 matrix,
+且具有 decision-grade library、chip、system 和 routing cohort,才能推进 `dev-latest`;partial
+matrices 仍为 diagnostic。Full-matrix digest 有意绑定精确 workflow shard grouping 和 requested
+cases,因此即使 case coverage 不变,修改 `--max-cases` 或 SKU round-robin scheduling order 也只
+会产生 diagnostic-only runs。Superseded retries、planned-unsupported outcomes 和 unstable
+comparison cohorts 可以用于诊断展示,但不能排名或推荐;promoted dataset 中每个成功的 required
+series 都必须保持 decision-grade。Runnable case 的任何 failed、invalid 或 diagnostic retry 都会
+阻止 promotion,即使后续 retry 成功。Routing cohorts 是 comparable-experimental sensitivities,
+不会产生 configuration recommendations;official library/platform/system cohorts 才能产生可执行
+recommendations。
+
+一个 point 只有在三个独立 workflow runs 和 allocation IDs 均通过 correctness、identity、
+provenance、tail gates、p50/p99 repeat-stability thresholds 和 stable ordering 后才成为
+decision-grade。Eligibility、controlled cohorts、sensitivity pairs 和 recommendations 由
+publisher 而非 frontend 计算。
+
+## 隔离产物存储
+
+开发阶段使用一个 self-hosted persistent filesystem,不使用 Vercel storage、GCP、Neon、
+managed database 或 managed object store。
+
+```text
+$COLLECTIVEX_STORE_ROOT/
+ private/incoming/ # write-once downloaded GHA attempts
+ private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums
+ private/quarantine/ # rejected attempts plus machine-readable reasons
+ public/datasets/<dataset-digest>/ # immutable sanitized frontend datasets
+ public/channels/ # small atomic pointers: latest-attempt, dev-latest
+ locks/
+```
+
+Private 和 public trees 使用不同 permissions。JSON manifests 和 checksums 是权威记录;可重建
+catalog 仅为 index。GitHub artifacts 是临时 delivery input。
+
+Container tags 会与固定 registry digests 核对。Enroot imports 使用固定
+`SOURCE_DATE_EPOCH` 和 versioned cache generation;每个 mounted squash 都重新计算 hash 并纳入
+series identity。Image-provided DeepEP 也按精确 per-architecture wheel 和 installed-file
+fingerprints 检查,因此 stale cache 不能继承固定 source identity。
+Source-built DeepEP V2 使用独立的 mode-0700 cluster-local cache,并且只以 `/cx-cache` 挂载。
+其 content key 绑定版本化 build recipe、verified image digest、CPU/GPU architecture、
+upstream source trees 和固定 build dependencies。该 cache 既不是 artifact,也不是 publisher
+input;每次执行的 source/results stage 仍然隔离且可丢弃,并在复用前以 marker 和 runtime probe
+fail closed。Runner UID 属于受信任的 cluster boundary:该 cache 用于防止 stale 或意外修改,
+不防御恶意的同 UID job。只有从未发布的 partial build 才能自动重置;已发布 cache 一旦未通过
+integrity 或 runtime 检查,将保持原样并被拒绝,避免并发 allocation 正在使用的文件被删除。
+
+Publication 采用 fail-closed:
+
+1. 获取 exclusive filesystem lock,并在 destination filesystem 上 stage;
+2. 解析前归档 source bytes;
+3. 要求精确 matrix-declared artifact set,并拒绝每个未消费 archive member;
+4. 验证 strict schemas、privacy、checksums、identities、timing 和精确 matrix outcomes;
+5. 写入 checksums 和 `COMPLETE`,fsync,然后原子 rename private bundle;
+6. 构建并验证 sanitized content-addressed dataset,fsync,然后原子 rename;
+7. 仅在全部 promotion gates 通过后原子替换 `dev-latest.json`。
+
+Rejected attempts 可以更新 `latest-attempt`,但不能更新 `dev-latest`。Channel responses 使用
+`no-cache`;immutable datasets 使用 content hashes 和 long-lived caching。InferenceX 前端中的
+same-host read-only HTTP route 只暴露 `public/` 下两个 channel documents 和 digest-addressed
+datasets;它拒绝 incomplete objects、directory listing 和 client-supplied filesystem paths。
+
+`publisher.py ingest` 接受精确 matrix,并为每个 GitHub artifact 接受一个 `--artifact` directory
+或 ZIP。`promote` 接受显式 immutable bundle IDs。默认 `verify` 要求 `latest-attempt`;若存在
+`dev-latest` 也会验证,而显式 `--channel dev-latest` 则要求其存在。Frontend process 接收相同的
+absolute、non-symlinked `COLLECTIVEX_STORE_ROOT`,并执行唯一的 HTTP serving。
+
+Frontend 获取 channel pointer,在 runtime 验证它,解析 immutable dataset,验证其
+digest/format,并渲染 terminal coverage。它不会虚构 missing values、选择 retries,或重新计算
+decision eligibility。
+
+## 历史数据
+
+Numeric schemas 3-5 不在 v1 publisher 和 frontend reader 范围内。它们仍是 historical
+diagnostic evidence,不能作为 `dev-latest` 初始数据或驱动 v1 decisions。
diff --git a/experimental/CollectiveX/identity.py b/experimental/CollectiveX/identity.py
new file mode 100644
index 0000000000..f3cec953a3
--- /dev/null
+++ b/experimental/CollectiveX/identity.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""Canonical, cross-runtime identities for CollectiveX v1."""
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from typing import Any
+
+IDENTITY_VERSION = 1  # embedded as "version" in every body hashed by digest()
+MAX_SAFE_INTEGER = (1 << 53) - 1  # 2**53 - 1; _validate rejects integers of larger magnitude
+PREFIXES = {  # identity kind -> prefix that digest() prepends to the SHA-256 hex digest
+ "case": "cxcase-v1-",
+ "workload": "cxwork-v1-",
+ "series": "cxseries-v1-",
+ "point": "cxpoint-v1-",
+ "evidence": "cxevidence-v1-",
+ "allocation": "cxallocation-v1-",
+ "attempt": "cxattempt-v1-",
+}
+V1_CASE_PROFILE = {  # v1 profile constants; unused in this module (presumably case_id's profile - confirm)
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67,
+}
+
+
+class IdentityError(ValueError):
+ """Raised when an identity payload cannot be represented consistently across runtimes."""
+
+
+def _validate(value: Any, path: str = "$") -> None:
+ if value is None or isinstance(value, bool):
+ return
+ if isinstance(value, str):
+ if any(ord(character) < 0x20 or ord(character) > 0x7E for character in value):
+ raise IdentityError(f"{path}: string must contain printable ASCII only")
+ return
+ if type(value) is int:
+ if abs(value) > MAX_SAFE_INTEGER:
+ raise IdentityError(f"{path}: integer exceeds the cross-runtime safe range")
+ return
+ if isinstance(value, list):
+ for index, item in enumerate(value):
+ _validate(item, f"{path}[{index}]")
+ return
+ if isinstance(value, dict):
+ for key, item in value.items():
+ if not isinstance(key, str):
+ raise IdentityError(f"{path}: object key is not a string")
+ if any(ord(character) < 0x20 or ord(character) > 0x7E for character in key):
+ raise IdentityError(f"{path}: object key must contain printable ASCII only")
+ _validate(item, f"{path}.{key}")
+ return
+ raise IdentityError(f"{path}: unsupported identity value {type(value).__name__}")
+
+
+def canonical_bytes(value: Any) -> bytes:
+ """Return compact UTF-8 JSON after enforcing the portable value subset."""
+ _validate(value)
+ return json.dumps(
+ value,
+ ensure_ascii=False,
+ allow_nan=False,
+ sort_keys=True,
+ separators=(",", ":"),
+ ).encode("utf-8")
+
+
+def digest(kind: str, value: Any) -> str:
+ """Hash a typed v1 identity payload and return its typed identifier."""
+ try:
+ prefix = PREFIXES[kind]
+ except KeyError as exc:
+ raise IdentityError(f"unknown identity kind {kind!r}") from exc
+ body = {"kind": kind, "value": value, "version": IDENTITY_VERSION}
+ return prefix + hashlib.sha256(canonical_bytes(body)).hexdigest()
+
+
+def is_typed_id(value: Any, kind: str) -> bool:
+ prefix = PREFIXES.get(kind)
+ return bool(
+ isinstance(value, str)
+ and prefix
+ and re.fullmatch(re.escape(prefix) + r"[0-9a-f]{64}", value)
+ )
+
+
+def case_id(*, sku: str, profile: dict[str, Any], case: dict[str, Any]) -> str:
+ return digest("case", {"case": case, "profile": profile, "sku": sku})
+
+
+def workload_id(value: dict[str, Any]) -> str:
+ return digest("workload", value)
+
+
+def series_id(value: dict[str, Any]) -> str:
+ return digest("series", value)
+
+
+def point_id(*, series: str, tokens_per_rank: int) -> str:
+ return digest("point", {"series_id": series, "tokens_per_rank": tokens_per_rank})
+
+
+def allocation_id(value: dict[str, Any]) -> str:
+ return digest("allocation", value)
+
+
+def attempt_id(*, allocation: str, case: str, ordinal: int) -> str:
+ return digest(
+ "attempt", {"allocation_id": allocation, "case_id": case, "ordinal": ordinal}
+ )
+
+
+def evidence_id(
+ *, point: str, allocation: str, attempt: str, sample_sha256: str
+) -> str:
+ return digest(
+ "evidence",
+ {
+ "allocation_id": allocation,
+ "attempt_id": attempt,
+ "point_id": point,
+ "sample_sha256": sample_sha256,
+ },
+ )
+
+
+IDENTITY_TEST_VECTOR = {  # pinned payload and the series ID verify_test_vector() expects for it
+ "payload": {"backend": "deepep", "ep": 8, "shape": [7168, 8, 256]},
+ "series_id": "cxseries-v1-a79bf758488e3edd50f5531f3af825f371bf42aae7c4097e461fd2a32615af81",
+}
+
+
+def verify_test_vector() -> None:
+ observed = series_id(IDENTITY_TEST_VECTOR["payload"])
+ if observed != IDENTITY_TEST_VECTOR["series_id"]:
+ raise IdentityError(
+ f"identity implementation differs: {observed} != {IDENTITY_TEST_VECTOR['series_id']}"
+ )
+
+
+if __name__ == "__main__":  # run as a script: self-check, then print the pinned series ID
+ verify_test_vector()
+ print(IDENTITY_TEST_VECTOR["series_id"])
diff --git a/experimental/CollectiveX/launchers/launch_gb-nv.sh b/experimental/CollectiveX/launchers/launch_gb-nv.sh
new file mode 100644
index 0000000000..97d0377e00
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb-nv.sh
@@ -0,0 +1,335 @@
+#!/usr/bin/env bash
+# CollectiveX shared GB200/GB300 NVL72 (aarch64) launcher.
+#
+# Two paths by CX_NODES:
+# CX_NODES<=1 (EP4): single NVL72 tray, 4 GPU. Hands off to run_in_container.sh (torchrun -g 4).
+# CX_NODES==2 (EP8): 2 trays, 8 GPU over the MNNVL NVLink domain. run_in_container's single-node
+# torchrun can't span nodes, so this path runs run_ep.py DIRECTLY across 8 srun tasks (1 rank
+# each), per-rank RANK/LOCAL_RANK from SLURM_*, MASTER_ADDR=first node — the intranode NVLink
+# path works across <=8 ranks on MNNVL (no internode/NVSHMEM). One allocation runs the shard.
+#
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+# Fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline.
+set -euo pipefail
+# HERE is this launcher's directory, CX_DIR its parent, and REPO_ROOT two levels above CX_DIR.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# The product is taken from CX_SHARD_SKU, then CX_GB_PRODUCT, then CX_PUBLIC_RUNNER; only gb200
+# and gb300 are accepted.
+PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-${CX_PUBLIC_RUNNER:-}}}"
+case "$PRODUCT" in
+ gb200|gb300) ;;
+ *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to gb200 or gb300" ;;
+esac
+RUNNER="$PRODUCT"
+# The backend defaults to deepep when CX_BENCH is not set by the caller.
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}"
+export CX_IMAGE_PLATFORM=linux/arm64
+# JOB_ID starts empty; it is required to be non-empty after the salloc step further down.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+# Topology defaults: one node, four GPUs per node, a scale-up domain of 72, and a world size of
+# nodes x GPUs per node. Each can be overridden from the environment but is validated below.
+NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-4}"
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}"
+EXPECTED_WORLD=$((NODES * GPN))
+NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}"
+# Default value for salloc --time: 30 for gb200, 90 otherwise; CX_TIME overrides it.
+if [ "$PRODUCT" = gb200 ]; then default_time=30; else default_time=90; fi
+TIME_MIN="${CX_TIME:-$default_time}"
+# Reject anything other than 1 or 2 nodes of 4 GPUs, a domain of 72, and a matching world size.
+[ "$NODES" = 1 ] || [ "$NODES" = 2 ] || cx_die "$PRODUCT supports one or two four-GPU trays"
+[ "$GPN" = 4 ] || cx_die "$PRODUCT requires four GPUs per tray"
+[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "$PRODUCT requires the NVL72 scale-up domain"
+[ "$NGPUS" = "$EXPECTED_WORLD" ] || cx_die "$PRODUCT world size must equal nodes x GPUs per tray"
+cx_apply_timing_profile
+# CX_IMAGE is a Docker tag; cx_ensure_squash derives the local squash filename.
+IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}"
+# UTC timestamp used in result file names (colons replaced by dashes).
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+# Publish the validated values to child processes (CX_RUNNER is re-exported with the same value).
+export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="${PRODUCT}-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_NODES="$NODES" CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN"
+export CX_NGPUS="$NGPUS"
+# Backends this launcher accepts.
+case "$CX_BENCH" in
+ deepep|deepep-v2|deepep-hybrid|nccl-ep) ;;
+ *) cx_die "unsupported $PRODUCT EP backend: $CX_BENCH" ;;
+esac
+cx_validate_shard_control "$CX_DIR"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR
+# gb300 additionally requires CX_ENROOT_CACHE_PATH; whenever it is set (on either product) it is
+# exported as ENROOT_CACHE_PATH.
+[ "$PRODUCT" != gb300 ] || cx_require_vars CX_ENROOT_CACHE_PATH
+PARTITION="$CX_PARTITION"; ACCOUNT="$CX_ACCOUNT"; SQUASH_DIR="$CX_SQUASH_DIR"
+[ -z "${CX_ENROOT_CACHE_PATH:-}" ] || export ENROOT_CACHE_PATH="$CX_ENROOT_CACHE_PATH"
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "$PRODUCT runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH phase=${CX_PHASE:-decode}"
+# A dry run stops here, before registry verification, repository staging, or any allocation.
+[ "${CX_DRYRUN:-0}" = "1" ] && { cx_log "DRYRUN"; exit 0; }
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+cx_set_failure_stage repository-stage
+# The staged repository copy is what gets mounted at /ix inside the container.
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "$CX_STAGE_DIR")"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+CONTAINER_MOUNTS="$MOUNT_SRC:/ix"
+# deepep-v2 and deepep-hybrid stage pinned backend sources; the container sees them under
+# /ix/experimental/CollectiveX/.cx_sources.
+if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then
+ cx_set_failure_stage backend-setup
+ cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \
+ || cx_die "cannot stage the pinned backend source"
+ export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources
+fi
+# deepep-v2 additionally mounts the prepared backend cache at /cx-cache.
+if [ "$CX_BENCH" = deepep-v2 ]; then
+ cx_prepare_backend_cache "$CX_SQUASH_DIR" \
+ || cx_die "cannot prepare the isolated backend cache"
+ BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE"
+ CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$BACKEND_CACHE:/cx-cache"
+ export CX_BACKEND_CACHE_ROOT=/cx-cache
+fi
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found"
+
+# Single-node runs request an exclusive allocation with 72 CPUs per task; two-node runs request
+# NODES nodes with one task per GPU and 35 CPUs per task.
+if [ "$NODES" -le 1 ]; then
+ cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" \
+ --gres=gpu:"$GPN" --exclusive --mem=0 --cpus-per-task=72 \
+ --time="$TIME_MIN" --job-name="$RUNNER"
+else
+ cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+ --gres=gpu:"$GPN" --ntasks-per-node="$GPN" --exclusive --mem=0 --cpus-per-task=35 \
+ --time="$TIME_MIN" --job-name="$RUNNER"
+fi
+# NOTE(review): cx_salloc_jobid (runtime/common.sh) presumably sets JOB_ID; an empty value is fatal.
+[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+cx_set_failure_stage container-import
+SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$SQUASH_DIR" "$IMAGE")"
+cx_set_failure_stage container-hash
+cx_export_squash_identity "$SQUASH_FILE"
+cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \
+ "${CX_SHARD_FILE:-}"
+
+if [ "$NODES" -le 1 ]; then # ---- EP4: single tray, run_in_container (torchrun -g 4) ----
+ run_rc=0
+ cx_set_failure_stage container-launch
+ runtime_log="$(cx_private_log_path runtime-ep4)"
+ # A non-zero srun status is captured in run_rc instead of tripping errexit, so results are still
+ # collected after a failed run. All container output goes to runtime_log.
+ srun --jobid="$JOB_ID" --chdir=/tmp --container-image="$SQUASH_FILE" \
+ --container-mounts="$CONTAINER_MOUNTS" \
+ --no-container-mount-home --container-writable --container-remap-root \
+ --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \
+ --export="$(cx_container_exports)" bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \
+ >"$runtime_log" 2>&1 || run_rc=$?
+ cx_adopt_runtime_stage "$MOUNT_SRC"
+ # NOTE(review): CX_FAILSAFE_MODE is not assigned in this file; presumably a helper above sets it.
+ [ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true
+ collect_rc=0
+ cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+ # Only a clean run followed by a failed collection is attributed to artifact-collection.
+ [ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+ # Exit with the run status when it is non-zero, otherwise with the collection status.
+ final_rc="$run_rc"
+ [ "$final_rc" != 0 ] || final_rc="$collect_rc"
+ exit "$final_rc"
+fi
+
+# ---- EP8: 2 trays, run_ep.py directly across 8 ranks (no torchrun; MNNVL intranode path) ----
+cx_set_failure_stage scheduler-allocation
+# MA is the first hostname of the job's node list; MP is the rendezvous port (default 29551).
+MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)" 2>/dev/null | head -1)"
+MP="${CX_MASTER_PORT:-29551}"
+# Both values are validated before use: MA must look like a hostname, MP must be 1..65535.
+[[ "$MA" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]] \
+ || cx_die "could not resolve the allocated primary node"
+[[ "$MP" =~ ^[1-9][0-9]*$ ]] && [ "$MP" -le 65535 ] \
+ || cx_die "invalid distributed rendezvous port"
+mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+# Restore process-local loader/import paths and exact backend build identity from build-only.
+# SOURCE_BACKEND_ENV is a shell snippet evaluated inside the container. It exits 66 unless
+# SLURM_NODEID is numeric, the env directory is a real (non-symlink) directory with mode 700, and
+# node-<SLURM_NODEID>.sh is a readable regular non-symlink file with mode 600 owned by the same
+# uid as the directory; it then sources that file.
+SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66'
+# BACKEND_PROBE sources the env, then checks that deep_ep exposes ElasticBuffer (deepep-v2) or
+# HybridEPBuffer (deepep-hybrid); other backends are not probed.
+BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; esac'
+# WRAP sources the env, maps SLURM_PROCID/SLURM_NTASKS/SLURM_LOCALID to RANK/WORLD_SIZE/LOCAL_RANK,
+# and execs tests/run_ep.py with the forwarded arguments.
+WRAP="$SOURCE_BACKEND_ENV"'; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+# Prepare the backend once per node in the persistent container reused by every case.
+# CNAME is the container name shared by every EP8 srun in this allocation; CMOUNT holds the
+# container options those sruns have in common.
+CNAME="cxep8_${JOB_ID}"
+CMOUNT=(--container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home
+  --container-writable --container-remap-root
+  --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint)
+cx_log "EP backend preparation: bench=$CX_BENCH"
+cx_set_failure_stage backend-setup
+build_log="$(cx_private_log_path backend-prepare)"
+# errexit is suspended so a failing srun is reported through build_rc instead of aborting here.
+set +e
+# Build step: one task per node runs run_in_container.sh with CX_BUILD_ONLY=1. Output has to be
+# redirected INTO the log: without the `>` the log path is handed to the script as a positional
+# argument and nothing is captured. stdin is detached so srun never forwards the launcher's input.
+srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \
+  --container-name="$CNAME" --container-image="$SQUASH_FILE" "${CMOUNT[@]}" \
+  --export="$(cx_container_exports),CX_BUILD_ONLY=1" \
+  bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \
+  </dev/null >"$build_log" 2>&1
+build_rc=$?
+if [ "$build_rc" = 0 ]; then
+  # Probe step: re-enter the named container on every node and run BACKEND_PROBE. This truncates
+  # the build log, so after a probe failure the log holds the probe output only.
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \
+    --container-name="$CNAME" "${CMOUNT[@]}" --export="$(cx_container_exports)" \
+    bash -c "$BACKEND_PROBE" \
+    >"$build_log" 2>&1
+  build_rc=$?
+fi
+set -e
+# On failure: record the stage, emit setup-failure documents, still collect results, and exit
+# with the build/probe status.
+if [ "$build_rc" != 0 ]; then
+  cx_fail_stage backend-setup "$build_log" || true
+  cx_log "ERROR: EP backend preparation failed rc=$build_rc"
+  cx_emit_setup_failures "$CX_DIR" "$MOUNT_SRC/experimental/CollectiveX/results" \
+    "$CX_BENCH" "$build_rc"
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || true
+  exit "$build_rc"
+fi
+cx_set_failure_stage execution
+
+# The EP8 case list as pipe-delimited records. SWEEP (CX_SHARD_FILE set): one line per shard case,
+# so the rack-scale EP8 path sweeps EVERY case of its shard (parity with run_in_container's single-
+# node SHARD loop). MANUAL (no shard file) emits one line per requested phase.
+# Record layout (14 fields, matching the `read` in the case loop):
+# phase|routing|eplb|hidden|topk|experts|ladder|suite|workload|required_publication|canonical|
+# case_id|ep|timing, where eplb/canonical are "1" or empty and timing is "iters:trials:warmup".
+# Returns 1 when a shard file is named but no such file exists.
+cx_ep8_cases() {
+ # CX_SHARD_FILE is workflow-relative (a .json file under .shards/, written by the Extract step
+ # with working-directory=experimental/CollectiveX). This EP8 path runs on the SUBMIT HOST where
+ # cwd is the repo root, so resolve it against $CX_DIR (=experimental/CollectiveX) when not found
+ # as-is — else the SHARD branch is skipped and only ONE default case runs instead of the shard's N.
+ local sf="${CX_SHARD_FILE:-}"
+ [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf"
+ if [ -n "$sf" ]; then
+ [ -f "$sf" ] || { cx_log "ERROR: shard control disappeared"; return 1; }
+ # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` would collapse consecutive tabs and
+ # swallow empty fields like a false eplb, shifting every column. No case field contains '|'.)
+ # Missing, null, or empty JSON values fall back to the per-field defaults passed to g().
+ python3 - "$sf" <<'PY'
+import json, sys
+d = json.load(open(sys.argv[1]))
+for c in d["cases"]:
+ g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv)
+ print("|".join([g("phase","decode"), g("routing","uniform"),
+ ("1" if c.get("eplb") else ""), g("hidden","7168"), g("topk","8"), g("experts","256"),
+ g("ladder",""), g("suite",""), g("workload",""), g("required_publication",""),
+ ("1" if c.get("canonical") else ""), g("case_id",""), g("ep",""),
+ g("timing","8:64:32")]))
+PY
+ else
+ # CX_PHASE=both expands to one record for decode and one for prefill.
+ local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"
+ local ph; local -a fields
+ for ph in $phases; do
+ # NOTE(review): ${CX_EPLB:+1} and ${CX_CANONICAL:+1} yield "1" for ANY non-empty value,
+ # including "0" — confirm callers only ever set these to "1" or leave them empty.
+ fields=("$ph" "${CX_ROUTING:-uniform}" "${CX_EPLB:+1}"
+ "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+ "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" "${CX_REQUIRED_PUBLICATION:-}"
+ "${CX_CANONICAL:+1}" "${CX_CASE_ID:-}" "$NGPUS"
+ "${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}")
+ # The subshell keeps the IFS change local; "${fields[*]}" joins on its first character.
+ (IFS='|'; printf '%s\n' "${fields[*]}")
+ done
+ fi
+}
+
+# Per-rank env for the EP8 case sruns. DeepEP main's Buffer gates multi-tray NVLink behind allow_mnnvl, which defaults
+# False -> DeepEP then sets NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer takes the intranode-only CUDA-IPC
+# peer path, faulting across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). CX_ALLOW_MNNVL=1
+# makes tests/ep_deepep.py pass allow_mnnvl=True so the NVL buffer spans both trays over the fabric API.
+# The pinned V1 exposes this flag explicitly; the adapter fails closed if that API changes.
+EP8_EXPORTS="$(cx_container_exports),MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1"
+# CX_ALLOW_MNNVL=1 is added for the deepep backend only.
+[ "$CX_BENCH" = "deepep" ] && EP8_EXPORTS="$EP8_EXPORTS,CX_ALLOW_MNNVL=1"
+
+# ci counts the records the loop has processed; failed_cases counts those that did not succeed.
+ci=0
+failed_cases=0
+# The case records are written to a temp file first so they can be counted before the loop
+# runs and compared with the number actually processed afterwards.
+cases_file="$(mktemp)"
+if ! cx_ep8_cases > "$cases_file"; then
+ rm -f "$cases_file"
+ cx_die "could not enumerate validated shard cases"
+fi
+expected_cases="$(wc -l < "$cases_file" | tr -d ' ')"
+[ "$expected_cases" -gt 0 ] || { rm -f "$cases_file"; cx_die "case list is empty"; }
+while IFS='|' read -r ph routing eplb hidden topk experts lad suite workload required_pub \
+ canonical case_id ep timing; do
+ [ -n "$ph" ] || continue
+ ci=$((ci+1))
+ case_stem="${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")"
+ IFS=':' read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}"
+ case_iters="${case_iters:-8}"; case_trials="${case_trials:-64}"; case_warmup="${case_warmup:-32}"
+ ep="${ep:-$NGPUS}"
+ export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+ export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep"
+ export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_TOKENS_LADDER="$lad"
+ export CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts"
+ export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup"
+ export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))"
+ export CX_WARMUP_SEMANTICS="full-roundtrip-before-each-component-trial-point-v1"
+ cx_log "EP${NGPUS}[$ci] id=${case_id:-manual} $ph $CX_BENCH routing=$routing eplb=${eplb:-0}"
+ if [ "$ep" != "$NGPUS" ]; then
+ cx_log "ERROR: case EP$ep does not match allocated world size $NGPUS"
+ export CX_ATTEMPT_ID=1
+ failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+ cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5
+ failed_cases=$((failed_cases + 1))
+ continue
+ fi
+
+ workload_dir=""
+ if [ -n "$canonical" ]; then
+ workload_dir=".cx_workloads/ep${ep}_${routing}"
+ workload_ladder="$lad"
+ [ -n "$workload_ladder" ] || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+ workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir" --routing "$routing"
+ --ep "$ep" --hidden "$hidden" --topk "$topk" --experts "$experts"
+ --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder")
+ workload_log="$(cx_private_log_path "workload-c$(printf '%03d' "$ci")")"
+ stage_rc=0
+ set +e
+ srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \
+ --container-name="$CNAME" "${CMOUNT[@]}" \
+ --export="$EP8_EXPORTS" "${workload_args[@]}" \
+ "$workload_log" 2>&1
+ stage_rc=$?
+ set -e
+ if [ "$stage_rc" != 0 ]; then
+ cx_log "ERROR: canonical workload staging failed rc=$stage_rc"
+ export CX_ATTEMPT_ID=1
+ failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+ cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc"
+ failed_cases=$((failed_cases + 1))
+ continue
+ fi
+ fi
+
+ ep_args=(--backend "$CX_BENCH" --phase "$ph" --routing "$routing"
+ --gpus-per-node "$GPN" --scale-up-domain "$SCALE_UP_DOMAIN"
+ --tokens-ladder "$lad"
+ --hidden "$hidden" --topk "$topk" --experts "$experts"
+ --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials"
+ --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$CX_TOPO"
+ --transport "$CX_TRANSPORT" --case-id "$case_id" --suite "$suite"
+ --workload-name "$workload" --required-publication "$required_pub")
+ [ -n "$eplb" ] && ep_args+=(--eplb)
+ [ -n "$workload_dir" ] && ep_args+=(--workload-dir "$workload_dir")
+ attempt=1
+ case_ok=0
+ export CX_ATTEMPT_ID="$attempt"
+ attempt_tag="a01"
+ out="results/${case_stem}_${attempt_tag}.json"
+ failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json"
+ runtime_log="$(cx_private_log_path "runtime-c$(printf '%03d' "$ci")-$attempt_tag")"
+ set +e
+ timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" \
+ --ntasks="$NGPUS" --chdir=/tmp \
+ --ntasks-per-node="$GPN" --container-name="$CNAME" "${CMOUNT[@]}" \
+ --export="$EP8_EXPORTS" \
+ bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" \
+ "$runtime_log" 2>&1
+ run_rc=$?
+ set -e
+ expected_out="$MOUNT_SRC/experimental/CollectiveX/$out"
+ if [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" success; then
+ case_ok=1
+ elif [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" invalid; then
+ cx_log "ERROR: EP${NGPUS}[$ci] completed with invalid semantic evidence"
+ else
+ [ "$run_rc" = 0 ] && run_rc=1
+ if cx_has_result_doc "$expected_out"; then
+ cx_demote_result_doc "$expected_out" "$run_rc" \
+ || { cx_quarantine_result_doc "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; }
+ else
+ cx_quarantine_result_doc "$expected_out"
+ cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"
+ fi
+ fi
+ if [ "$case_ok" = 0 ]; then
+ failed_cases=$((failed_cases + 1))
+ cx_log "ERROR: EP${NGPUS}[$ci] failed"
+ fi
+done < "$cases_file"
+rm -f "$cases_file"
+# Guard against a truncated sweep: every enumerated record must have been processed by the loop.
+[ "$ci" -eq "$expected_cases" ] || cx_die "enumerated $expected_cases cases but executed $ci"
+run_rc=0
+# Any failed case marks the shard as failed (rc 1) and records a one-line summary.
+if [ "$failed_cases" -ne 0 ]; then
+ summary_log="$(cx_private_log_path shard-summary)"
+ printf 'SHARD done: %s/%s case(s) failed\n' "$failed_cases" "$expected_cases" > "$summary_log"
+ cx_fail_stage execution "$summary_log" || true
+ run_rc=1
+fi
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+# Only a clean run followed by a failed collection is attributed to artifact-collection.
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+# Exit with the run status when it is non-zero, otherwise with the collection status.
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/launchers/launch_mi-amds.sh b/experimental/CollectiveX/launchers/launch_mi-amds.sh
new file mode 100644
index 0000000000..5f3de33078
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi-amds.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# CollectiveX shared MI325X/MI355X AMD Slurm launcher.
+#
+# The ROCm path imports its squash in the allocation and uses writable/remapped
+# pyxis containers. Scheduling, exclusions, node pins, and storage come from the
+# runner-local config.
+# Fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline.
+set -euo pipefail
+
+# HERE is this launcher's directory, CX_DIR its parent, and REPO_ROOT two levels above CX_DIR.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# The SKU comes from CX_SHARD_SKU, then CX_PUBLIC_RUNNER. Each SKU fixes the CPUs requested per
+# task; mi325x also bind-mounts /dev/kfd and /dev/dri into the container, mi355x adds no mounts.
+RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}"
+case "$RUNNER" in
+ mi325x) CPUS_PER_TASK=256; DEVICE_MOUNTS=",/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" ;;
+ mi355x) CPUS_PER_TASK=128; DEVICE_MOUNTS="" ;;
+ *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to mi325x or mi355x" ;;
+esac
+# The backend defaults to mori when CX_BENCH is not set by the caller.
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-mori}"
+export CX_IMAGE_PLATFORM=linux/amd64
+# JOB_ID starts empty; it is required to be non-empty after the salloc step further down.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-60}" # generous default: leaves room for a cold enroot import of the large ROCm image
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}"
+# Optional node pin overrides the exclusion list.
+NODELIST="${CX_NODELIST:-}"
+# In-container mount point of the staged repository.
+MOUNT_DIR=/ix
+# UTC timestamp exported below as CX_TS (colons replaced by dashes).
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# AMD EP backends: MoRI and the portable NCCL/RCCL all-to-all reference.
+case "$CX_BENCH" in
+ mori|nccl-ep) ;;
+ *) cx_die "unsupported AMD EP backend: $CX_BENCH" ;;
+esac
+# mi325x only: default the MoRI tuning and log-level variables unless the caller set them; for
+# the mori backend also default the image to CX_IMAGE_AMD_MORI_MI325 and the kernel type to asyncll.
+# NOTE(review): CX_IMAGE_AMD_MORI_MI325 is not assigned in this file; under `set -u` it must be
+# provided elsewhere (presumably runtime/common.sh or the operator config) — confirm.
+if [ "$RUNNER" = mi325x ]; then
+ export MORI_DISABLE_AUTO_XGMI="${MORI_DISABLE_AUTO_XGMI:-0}"
+ export MORI_ENABLE_SDMA="${MORI_ENABLE_SDMA:-1}"
+ export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-info}"
+ export MORI_SHMEM_LOG_LEVEL="${MORI_SHMEM_LOG_LEVEL:-info}"
+ export MORI_IO_LOG_LEVEL="${MORI_IO_LOG_LEVEL:-info}"
+ if [ "$CX_BENCH" = mori ]; then
+ export CX_IMAGE="${CX_IMAGE:-$CX_IMAGE_AMD_MORI_MI325}"
+ export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-asyncll}"
+ fi
+fi
+# Resolve the image only now: CX_BENCH and RUNNER are final and the mi325x branch above may have
+# defaulted CX_IMAGE.
+IMAGE="${CX_IMAGE:-$(cx_default_image "$RUNNER")}"
+# Single node: every GPU is on one node, and the scale-up domain equals the GPU count.
+export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES=1 CX_GPUS_PER_NODE="$NGPUS"
+export CX_SCALE_UP_DOMAIN="$NGPUS" CX_TS="$TS"
+# topology_class is part of comparison_key; label the actual SKU when the MI325X wrapper calls this.
+case "${RUNNER}" in
+ mi325x*) export CX_TOPO="mi325x-xgmi" ;;
+ *) export CX_TOPO="mi355x-xgmi" ;;
+esac
+export CX_TRANSPORT="xgmi"
+# Allow a longer per-phase guard for large MoRI prefill points.
+export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}"
+cx_validate_shard_control "$CX_DIR"
+cx_require_vars CX_PARTITION CX_SQUASH_DIR
+PARTITION="$CX_PARTITION"
+SQUASH_DIR="$CX_SQUASH_DIR"
+cx_log "runner=$RUNNER ngpus=$NGPUS bench=$CX_BENCH"
+cx_set_failure_stage repository-stage
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+
+# NOTE(review): the dry-run exit comes AFTER repository staging here, whereas the GB and
+# single-node launchers exit before staging — confirm that staging during a dry run is intended.
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER"
+
+# Pin to specific nodes when configured, otherwise apply the optional exclusion list.
+# The base request is an exclusive allocation of NGPUS GPUs with the per-SKU CPU count.
+allocation=(--partition="$PARTITION" --gres=gpu:"$NGPUS" --exclusive
+ --cpus-per-task="$CPUS_PER_TASK"
+ --time="$TIME_MIN" --job-name="$RUNNER")
+if [ -n "$NODELIST" ]; then
+ cx_log "using configured node pin"
+ allocation+=(--nodelist="$NODELIST")
+elif [ -n "$EXCLUDE_NODES" ]; then
+ allocation+=(--exclude="$EXCLUDE_NODES")
+fi
+cx_salloc_jobid "${allocation[@]}"
+# NOTE(review): cx_salloc_jobid (runtime/common.sh) presumably sets JOB_ID; an empty value is fatal.
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+
+cx_set_failure_stage container-import
+SQUASH_FILE="$(cx_ensure_squash_on_job \
+ "$JOB_ID" "$SQUASH_DIR" "$IMAGE" "${CX_LOCK_DIR:-}")"
+cx_set_failure_stage container-hash
+import_log="$(cx_private_log_path image-hash)"
+# Hash the squash file with one task inside the allocation. awk keeps the first field of the
+# first output line (the digest); stderr is appended to the import log. With pipefail active a
+# failing srun fails the whole substitution.
+if ! COLLECTIVEX_SQUASH_SHA256="$(
+ srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \
+ --export="$(cx_host_exports)" \
+ sha256sum "$SQUASH_FILE" \
+ 2>>"$import_log" | awk 'NR==1 {print $1}'
+)"; then
+ cx_fail_stage container-hash "$import_log"
+fi
+# Anything that is not exactly 64 lowercase hex characters is treated as a hashing failure.
+[[ "$COLLECTIVEX_SQUASH_SHA256" =~ ^[0-9a-f]{64}$ ]] \
+ || cx_fail_stage container-hash "$import_log"
+export COLLECTIVEX_SQUASH_SHA256
+cx_preflight_allocation "$JOB_ID" 1 "$MOUNT_SRC" "$SQUASH_FILE" "${CX_SHARD_FILE:-}"
+
+run_rc=0
+cx_set_failure_stage container-launch
+runtime_log="$(cx_private_log_path runtime)"
+srun --jobid="$JOB_ID" --chdir=/tmp \
+ --container-image="$SQUASH_FILE" \
+ --container-mounts="$MOUNT_SRC:$MOUNT_DIR$DEVICE_MOUNTS" \
+ --container-writable --container-remap-root --no-container-mount-home \
+ --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+ --no-container-entrypoint --export="$(cx_container_exports)" \
+ bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" \
+ >"$runtime_log" 2>&1 || run_rc=$?
+
+cx_adopt_runtime_stage "$MOUNT_SRC"
+[ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the
+# next checkout on this runner is clean.
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+cx_log "done — result artifacts collected"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/launchers/launch_single-slurm.sh b/experimental/CollectiveX/launchers/launch_single-slurm.sh
new file mode 100644
index 0000000000..b9b1ef9e8d
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_single-slurm.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# CollectiveX shared single-node NVIDIA Slurm launcher.
+# Fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline.
+set -euo pipefail
+
+# HERE is this launcher's directory, CX_DIR its parent, and REPO_ROOT two levels above CX_DIR.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Per-SKU table. Each SKU sets the product name used for the default image, the topology label,
+# the default salloc --time value, and whether CX_ACCOUNT is mandatory. ALLOC_EXTRA and SRUN_EXTRA
+# hold extra salloc / srun options; LOCAL_IMPORT=1 imports the image before allocating.
+RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}"
+ALLOC_EXTRA=(); SRUN_EXTRA=(); LOCAL_IMPORT=0
+case "$RUNNER" in
+ h100-dgxc) PRODUCT=h100; TOPO=h100-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 ;;
+ h200-dgxc)
+ PRODUCT=h200; TOPO=h200-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=0
+ SRUN_EXTRA=(--container-remap-root)
+ ;;
+ b200-dgxc)
+ PRODUCT=b200; TOPO=b200-nvlink-island; DEFAULT_TIME=30; REQUIRE_ACCOUNT=1
+ ALLOC_EXTRA=(--mem=0)
+ ;;
+ b300)
+ PRODUCT=b300; TOPO=b300-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1
+ ALLOC_EXTRA=(-N 1 --mem=0)
+ SRUN_EXTRA=(--mpi=none --container-remap-root)
+ LOCAL_IMPORT=1
+ ;;
+ *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to a registered single-node SKU" ;;
+esac
+# The backend defaults to deepep when CX_BENCH is not set by the caller.
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}"
+export CX_IMAGE_PLATFORM=linux/amd64
+# JOB_ID starts empty; it is required to be non-empty after the salloc step further down.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+
+# Defaults: 8 GPUs, the per-SKU time limit, and the product's default image.
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-$DEFAULT_TIME}"
+IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}"
+# UTC timestamp exported below as CX_TS (colons replaced by dashes).
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# Single node: every GPU is on one node, and the scale-up domain equals the GPU count.
+export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES=1 CX_GPUS_PER_NODE="$NGPUS"
+export CX_SCALE_UP_DOMAIN="$NGPUS" CX_TS="$TS" CX_TOPO="$TOPO" CX_TRANSPORT=nvlink
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export NCCL_CUMEM_ENABLE=1
+cx_validate_shard_control "$CX_DIR"
+# CX_ACCOUNT is required unless the SKU opted out; b300 also requires CX_STAGE_DIR.
+cx_require_vars CX_PARTITION CX_SQUASH_DIR
+[ "$REQUIRE_ACCOUNT" = 0 ] || cx_require_vars CX_ACCOUNT
+[ "$RUNNER" != b300 ] || cx_require_vars CX_STAGE_DIR
+
+cx_log "runner=$RUNNER ngpus=$NGPUS bench=$CX_BENCH"
+# A dry run stops here, before registry verification, image import, staging, or any allocation.
+[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; }
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+SQUASH_FILE=""
+# LOCAL_IMPORT SKUs import and hash the squash here, before allocating; the others do so inside
+# the allocation after salloc.
+if [ "$LOCAL_IMPORT" = 1 ]; then
+ cx_set_failure_stage container-import
+ SQUASH_FILE="$(CX_ENROOT_LOCAL_IMPORT=1 \
+ cx_ensure_squash "$CX_SQUASH_DIR" "$IMAGE")"
+ cx_set_failure_stage container-hash
+ cx_export_squash_identity "$SQUASH_FILE"
+fi
+cx_set_failure_stage repository-stage
+# The staged repository copy is what gets mounted at /ix inside the container.
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+CONTAINER_MOUNTS="$MOUNT_SRC:/ix"
+# deepep-v2 and deepep-hybrid stage pinned backend sources; the container sees them under
+# /ix/experimental/CollectiveX/.cx_sources.
+if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then
+ cx_set_failure_stage backend-setup
+ cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \
+ || cx_die "cannot stage the pinned backend source"
+ export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources
+fi
+# deepep-v2 additionally mounts the prepared backend cache at /cx-cache.
+if [ "$CX_BENCH" = deepep-v2 ]; then
+ cx_prepare_backend_cache "$CX_SQUASH_DIR" \
+ || cx_die "cannot prepare the isolated backend cache"
+ BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE"
+ CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$BACKEND_CACHE:/cx-cache"
+ export CX_BACKEND_CACHE_ROOT=/cx-cache
+fi
+
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER"
+
+# Exclusive allocation of NGPUS GPUs plus the per-SKU extras; the account and exclusion list are
+# appended only when configured.
+# NOTE(review): ALLOC_EXTRA is empty for h100-dgxc and h200-dgxc; expanding an empty array under
+# `set -u` is an "unbound variable" error on bash older than 4.4 — confirm the runners' bash.
+allocation=(--partition="$CX_PARTITION" --gres=gpu:"$NGPUS" --exclusive
+ --time="$TIME_MIN" --job-name="$RUNNER" "${ALLOC_EXTRA[@]}")
+[ -z "${CX_ACCOUNT:-}" ] || allocation+=(--account="$CX_ACCOUNT")
+[ -z "${CX_EXCLUDE_NODES:-}" ] || allocation+=(--exclude="$CX_EXCLUDE_NODES")
+cx_salloc_jobid "${allocation[@]}"
+# NOTE(review): cx_salloc_jobid (runtime/common.sh) presumably sets JOB_ID; an empty value is fatal.
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+# SKUs that did not import the image before allocating import and hash it inside the allocation.
+if [ "$LOCAL_IMPORT" = 0 ]; then
+ cx_set_failure_stage container-import
+ SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$CX_SQUASH_DIR" "$IMAGE")"
+ cx_set_failure_stage container-hash
+ cx_export_squash_identity "$SQUASH_FILE"
+fi
+cx_preflight_allocation "$JOB_ID" 1 "$MOUNT_SRC" "$SQUASH_FILE" "${CX_SHARD_FILE:-}"
+
+run_rc=0
+cx_set_failure_stage container-launch
+runtime_log="$(cx_private_log_path runtime)"
+srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" \
+ --container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home \
+ --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \
+ "${SRUN_EXTRA[@]}" --export="$(cx_container_exports)" \
+ bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \
+ >"$runtime_log" 2>&1 || run_rc=$?
+cx_adopt_runtime_stage "$MOUNT_SRC"
+[ "$run_rc" = 0 ] || cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+cx_log "done - result artifacts collected"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/publisher.py b/experimental/CollectiveX/publisher.py
new file mode 100644
index 0000000000..a90dc99970
--- /dev/null
+++ b/experimental/CollectiveX/publisher.py
@@ -0,0 +1,3167 @@
+#!/usr/bin/env python3
+"""Fail-closed filesystem publisher for CollectiveX EP v1 artifacts."""
+from __future__ import annotations
+
+import argparse
+import contextlib
+import datetime as dt
+import fcntl
+import hashlib
+import json
+import math
+import os
+from pathlib import Path, PurePosixPath
+import re
+import shutil
+import stat
+import statistics
+import sys
+import tempfile
+from typing import Any, Iterator, Sequence
+import zipfile
+
+import jsonschema
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+
+import artifact_safety # noqa: E402
+import capability # noqa: E402
+import contracts # noqa: E402
+import identity # noqa: E402
+import sweep_matrix # noqa: E402
+
+FORMAT_BUNDLE = "collectivex.private.bundle.v1"
+FORMAT_PUBLIC = "collectivex.public.v1"
+FORMAT_CHANNEL = "collectivex.channel.v1"
+POLICY = "collectivex-decision-grade-v1"
+PUBLISHER_POLICY = "collectivex-publisher-v1"
+OUTCOMES = ("success", "unsupported", "failed", "invalid", "diagnostic")
+REQUIRED_ALLOCATIONS = 3  # minimum allocation count gated by _eligibility for decision grade
+REQUIRED_COHORT_KINDS = ("library", "chip", "system", "routing")
+REQUIRED_PROMOTION_COHORT_COUNTS = {"library": 48, "system": 12, "routing": 76}
+CANONICAL_FULL_V1_MATRIX_SHA256 = (
+    "292e05f8faccaa4971eda527a327190a9943e99d4f71611987f7b95f57f253e8"
+)
+CANONICAL_FULL_V1_CASE_CATALOG_SHA256 = (
+    "29a9e2d65777e0bf388d49bfe31f91e0ec6537dafdaa71ac91c6ed75f9e44b00"
+)
+P50_STABILITY_LIMIT = 1.10  # inclusive upper bound on p50_max_min_ratio in _eligibility
+P99_STABILITY_LIMIT = 1.25  # inclusive upper bound on p99_max_min_ratio in _eligibility
+MAX_ARCHIVE_MEMBERS = 20_000
+MAX_ARCHIVE_MEMBER_BYTES = 2 * 1024**3  # 2 GiB
+MAX_ARCHIVE_TOTAL_BYTES = 16 * 1024**3  # 16 GiB
+MAX_PUBLIC_DATASET_BYTES = 32 * 1024**2  # 32 MiB; canonical-size cap in validate_public_dataset
+HEX64 = re.compile(r"[0-9a-f]{64}")  # 64 lowercase hex digits
+SAFE_ID = re.compile(r"[a-z0-9][a-z0-9_.-]{0,127}")  # 1-128 chars, alphanumeric first char
+REASON = re.compile(r"[a-z0-9][a-z0-9.-]{0,95}")  # 1-96 chars, no underscore allowed
+ARTIFACT_NAME = re.compile(
+    r"cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*"
+)
+CHANNEL_PATH = re.compile(r"datasets/([0-9a-f]{64})/dataset\.json")  # group 1: dataset digest
+SCHEMA_DIR = HERE / "schemas"
+_SCHEMAS: dict[str, jsonschema.protocols.Validator] = {}  # validator cache filled by _schema
+
+
+class PublisherError(ValueError):
+    """Raised when input or stored state violates the publication contract."""
+
+
+strict_load = contracts.strict_load  # local alias; _schema loads schema files through it
+_canonical = contracts.canonical_json_bytes  # local alias used for hashing and duplicate checks
+
+
+def _sha_bytes(data: bytes) -> str:
+    return hashlib.new("sha256", data).hexdigest()  # lowercase hex SHA-256 of ``data``
+
+
+def _sha_file(path: Path) -> str:  # hex SHA-256 of the file's contents
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        while block := stream.read(1 << 20):  # 1 MiB per read; empty bytes means EOF
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _latest_timestamp(values: Sequence[str]) -> str:
+    """Return the latest timestamp in ``values`` by UTC instant, never reading the clock."""
+    if not values:
+        raise PublisherError("cannot derive a timestamp without evidence")
+
+    def ordering(text: str) -> tuple[dt.datetime, str]:
+        try:
+            moment = dt.datetime.fromisoformat(text.replace("Z", "+00:00"))
+        except ValueError as exc:
+            raise PublisherError("evidence timestamp is not ISO-8601") from exc
+        if moment.tzinfo is None:
+            raise PublisherError("evidence timestamp must include a timezone")
+        return moment.astimezone(dt.timezone.utc), text  # raw text breaks equal-instant ties
+
+    return max(values, key=ordering)
+
+
+def _schema(name: str, value: Any) -> None:
+    """Validate ``value`` against cached schema ``name``; raise on the lowest-path error."""
+    if _SCHEMAS.get(name) is None:
+        document = strict_load(SCHEMA_DIR / name)
+        draft = jsonschema.Draft202012Validator
+        draft.check_schema(document)
+        _SCHEMAS[name] = draft(document, format_checker=jsonschema.FormatChecker())
+    failures = sorted(
+        _SCHEMAS[name].iter_errors(value), key=lambda failure: list(failure.absolute_path)
+    )
+    if not failures:
+        return
+    where = ".".join(str(part) for part in failures[0].absolute_path) or "$"
+    raise PublisherError(f"{name}:{where}: {failures[0].message}")
+def _exact(obj: Any, fields: set[str], path: str) -> dict[str, Any]:
+    """Return ``obj`` if it is a dict whose keys are exactly ``fields``; else raise."""
+    if not isinstance(obj, dict):
+        raise PublisherError(f"{path} must be an object")
+    present = set(obj)
+    if present == fields:
+        return obj
+    # Report both directions of the mismatch, each as a sorted list.
+    missing, extra = sorted(fields - present), sorted(present - fields)
+    raise PublisherError(f"{path} fields differ: missing={missing}, extra={extra}")
+def _array(value: Any, path: str, *, nonempty: bool = False) -> list[Any]:
+    """Return ``value`` if it is a list (non-empty when requested); else raise."""
+    if isinstance(value, list) and (value or not nonempty):
+        return value
+    raise PublisherError(f"{path} must be {'a nonempty' if nonempty else 'an'} array")
+
+
+def _integer(value: Any, path: str, *, minimum: int = 0) -> int:
+    if not (type(value) is int and minimum <= value):  # exact int only, so bool is rejected
+        raise PublisherError(f"{path} must be an integer >= {minimum}")
+    return value
+
+
+def _unique(values: Sequence[Any], path: str) -> None:
+    encoded = list(map(_canonical, values))  # compare canonical encodings, not raw objects
+    if len(set(encoded)) < len(encoded):
+        raise PublisherError(f"{path} contains duplicates")
+
+def _eligibility(value: dict[str, Any], path: str) -> dict[str, Any]:
+    """Return ``value`` after checking decision_grade and reasons against the promotion gates."""
+    p50_ratio, p99_ratio = value["p50_max_min_ratio"], value["p99_max_min_ratio"]
+    gates = [
+        len(value["allocation_ids"]) >= REQUIRED_ALLOCATIONS,
+        *(value[flag] for flag in ("complete", "correct", "measured_roundtrip_p99")),
+        *(value[flag] for flag in ("stable_p50", "stable_p99", "stable_ordering")),
+        p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT,
+        p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT,
+    ]
+    graded, reasons = value["decision_grade"], value["reasons"]
+    if graded != (all(gates) and not reasons):
+        raise PublisherError(f"{path}.decision_grade does not match promotion gates")
+    if graded == bool(reasons):
+        raise PublisherError(f"{path}.reasons does not match decision status")
+    return value
+
+
+def validate_channel(doc: Any, *, expected_channel: str | None = None) -> dict[str, Any]:
+    _schema("channel-v1.schema.json", doc)  # schema check first; raises PublisherError
+    if expected_channel is not None and doc["channel"] != expected_channel:  # only None skips
+        raise PublisherError("channel name does not match its file")
+    target = doc["dataset"]
+    match = CHANNEL_PATH.fullmatch(target["path"]) if isinstance(target["path"], str) else None
+    if not match or match.group(1) != target["sha256"]:  # path must embed the declared digest
+        raise PublisherError("channel dataset path and sha256 do not agree")
+    return doc  # the validated document itself, not a copy
+
+
+def _metric_value(series: dict[str, Any], metric: dict[str, Any]) -> tuple[str, float, str]:
+    """Return ``(point_id, value, unit)`` of the roundtrip figure ``metric`` selects."""
+    # The first point whose tokens_per_rank matches the metric is the one measured.
+    for candidate in series["points"]:
+        if candidate["tokens_per_rank"] == metric["tokens_per_rank"]:
+            break
+    else:
+        candidate = None
+    if candidate is None or series["phase"] != metric["phase"]:
+        raise PublisherError("decision metric references an unavailable point")
+    roundtrip = candidate["components"]["roundtrip"]
+    if metric["measure"] == "latency_us":
+        return candidate["point_id"], roundtrip["latency_us"][metric["statistic"]], "us"
+    # Every other measure is read from the payload-rate table, which may be null.
+    rates = roundtrip["logical_payload_rate_gbps_at_latency_percentile"]
+    if rates is None:
+        raise PublisherError("logical bandwidth decision has no logical byte contract")
+    return candidate["point_id"], rates[metric["statistic"]], "GB/s"
+
+
+def _validate_metric(metric: dict[str, Any]) -> None:
+    required = "max" if metric["measure"] != "latency_us" else "min"  # latency is minimized
+    if required != metric["objective"]:
+        raise PublisherError(f"{metric['measure']} objective must be {required}")
+
+
+def _metric_label(measure: str, statistic: str) -> str:
+    """Label a metric: plain latency for latency_us, else payload rate at that latency."""
+    latency = f"{statistic} latency"
+    if measure == "latency_us":
+        return latency
+    return f"payload rate at {latency}"
+
+
+def _routing_build_control(build: dict[str, Any]) -> dict[str, Any]:
+    """Project ``build`` onto the four identity fields a routing cohort holds constant."""
+    fields = ("routing_control_sha256", "image_digest", "source_sha", "squash_sha256")
+    controlled = {}
+    for field in fields:
+        controlled[field] = build[field]
+    return controlled
+
+
+def _routing_implementation_mismatch(members: Sequence[dict[str, Any]]) -> bool:
+    """True when members with EPLB off disagree on implementation_contract_sha256."""
+    # Members with EPLB enabled are left out of the comparison.
+    baseline = [entry for entry in members if not entry["workload"]["eplb"]]
+    hashes = {entry["build"]["implementation_contract_sha256"] for entry in baseline}
+    return len(hashes) >= 2
+
+
+def _public_case_factors(series: dict[str, Any]) -> dict[str, Any]:
+    """Rebuild the case-identity factors (case, profile, sku) from a public series.
+
+    Platform facts are read from ``capability.PLATFORMS[sku]``. ``nodes`` is the EP
+    size floor-divided by the platform's GPUs per node, and ``ladder`` is the points'
+    tokens_per_rank values joined by single spaces in series order.
+    """
+    spec, host, timing = series["workload"], series["system"], series["measurement"]
+    hardware = capability.PLATFORMS[host["sku"]]
+    per_node = hardware["gpus_per_node"]
+    ladder = " ".join(str(point["tokens_per_rank"]) for point in series["points"])
+    # validate_public_dataset digests this mapping and compares it with the series' case ID.
+    case = {
+        "backend": series["backend"]["id"],
+        "canonical": True,
+        "eplb": spec["eplb"],
+        "ep": host["ep_size"],
+        "experts": spec["experts"],
+        "gpus_per_node": per_node,
+        "hidden": spec["hidden"],
+        "ladder": ladder,
+        "nodes": host["ep_size"] // per_node,
+        "phase": series["phase"],
+        "required_publication": series["publication_tier"],
+        "routing": spec["routing"],
+        "samples_per_point": timing["samples_per_component"],
+        "scale_up_domain": hardware["scale_up_domain"],
+        "suite": series["suite"],
+        "timing": f"{timing['iters']}:{timing['trials']}:{timing['warmups']}",
+        "topk": spec["top_k"],
+        "warmup_semantics": sweep_matrix.ep_harness.WARMUP_SEMANTICS,
+        "workload": series["model"],
+    }
+    return {"case": case, "profile": identity.V1_CASE_PROFILE, "sku": host["sku"]}
+
+
+def _public_series_config(series: dict[str, Any]) -> dict[str, Any]:
+    """Extract the backend generation/version, resource, and system label of a series."""
+    # validate_public_dataset hashes this projection against build.public_config_sha256.
+    backend = series["backend"]
+    return {
+        "backend": {key: backend[key] for key in ("generation", "version")},
+        "resource": series["resource"],
+        "system": {"label": series["system"]["label"]},
+    }
+
+
+def _public_cohort_factors(kind: str, item: dict[str, Any]) -> tuple[Any, Any]:
+    """Split a member series into ``(controlled, varying)`` factors for cohort ``kind``.
+
+    validate_public_dataset requires cohort members to agree on the first element and
+    to differ on the second. Raises PublisherError for an unknown ``kind``.
+    """
+    workload, build = item["workload"], item["build"]
+    shape_fields = (
+        "hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype", "activation_profile",
+    )
+    common = {
+        "model": item["model"],
+        "phase": item["phase"],
+        "shape": {field: workload[field] for field in shape_fields},
+        "measurement": item["measurement"],
+        "ep_size": item["system"]["ep_size"],
+    }
+    if kind in ("library", "chip"):
+        # A library cohort pins the system and varies the backend ID; a chip cohort
+        # pins the backend and varies the system.
+        pinned = "system" if kind == "library" else "backend"
+        controlled = {
+            **common, pinned: item[pinned], "workload": workload,
+            "resource_mode": item["resource"]["mode"], "source": build["source_sha"],
+        }
+        return controlled, (item["backend"]["id"] if kind == "library" else item["system"])
+    if kind == "system":
+        controlled = {**common, "workload": workload, "source": build["source_sha"]}
+        return controlled, [
+            item["system"]["sku"], item["backend"]["id"], item["resource"]["profile"],
+        ]
+    if kind == "routing":
+        controlled = {
+            **common, "backend": item["backend"], "system": item["system"],
+            "resource": item["resource"], "build": _routing_build_control(build),
+        }
+        return controlled, [workload["routing"], workload["eplb"], build["implementation_contract_sha256"]]
+    raise PublisherError(f"unknown cohort kind {kind}")
+
+
+def _case_disposition_catalog_sha256(coverage: Sequence[dict[str, Any]]) -> str:
+    """SHA-256 of the canonical (case_id, disposition) catalog, ordered by case_id."""
+    ordered = sorted(coverage, key=lambda entry: entry["case_id"])
+    # Only the ID and disposition are committed; other coverage fields are left out.
+    catalog = [{field: entry[field] for field in ("case_id", "disposition")} for entry in ordered]
+    return _sha_bytes(_canonical(catalog))
+
+
+def validate_public_dataset(doc: Any) -> dict[str, Any]:
+ _schema("public-dataset-v1.schema.json", doc)
+ if len(_canonical(doc)) + 1 > MAX_PUBLIC_DATASET_BYTES:
+ raise PublisherError("public dataset exceeds the serving size limit")
+ try:
+ artifact_safety.assert_publication_safe([doc])
+ except artifact_safety.ArtifactSafetyError as exc:
+ raise PublisherError(str(exc)) from exc
+ if doc["source_bundle_ids"] != sorted(doc["source_bundle_ids"]):
+ raise PublisherError("source bundle IDs are not canonical")
+ for field, key in (
+ ("coverage", "case_id"), ("attempts", "attempt_id"),
+ ("series", "series_id"), ("cohorts", "cohort_id"),
+ ("rankings", "ranking_id"), ("recommendations", "recommendation_id"),
+ ("sensitivities", "sensitivity_id"),
+ ):
+ if doc[field] != sorted(doc[field], key=lambda item: item[key]):
+ raise PublisherError(f"{field} are not in canonical identity order")
+ promotion = doc["promotion"]
+ quarantined = promotion["status"] == "quarantined"
+ if quarantined != (promotion["reason"] is not None) or quarantined != (
+ promotion["matrix_id"] is None
+ ):
+ raise PublisherError("promotion reason/matrix identity differs from status")
+ attempts = {item["attempt_id"]: item for item in doc["attempts"]}
+ if len(attempts) != len(doc["attempts"]):
+ raise PublisherError("dataset has duplicate attempt IDs")
+ evidence = [
+ value["evidence_id"] for item in doc["attempts"] for value in item["evidence"]
+ ]
+ _unique(evidence, "dataset attempt evidence")
+ series = {item["series_id"]: item for item in doc["series"]}
+ if len(series) != len(doc["series"]):
+ raise PublisherError("dataset has duplicate series IDs")
+ allocation_ids = set(promotion["allocation_ids"])
+ case_ids = {item["case_id"] for item in doc["coverage"]}
+ if len(case_ids) != len(doc["coverage"]):
+ raise PublisherError("dataset has duplicate case coverage")
+ coverage_by_case = {item["case_id"]: item for item in doc["coverage"]}
+ for item in doc["attempts"]:
+ if item["case_id"] not in case_ids or item["allocation_id"] not in allocation_ids:
+ raise PublisherError("attempt references undeclared coverage or allocation")
+ if item["series_id"] is not None and item["series_id"] not in series:
+ raise PublisherError("attempt references unknown series")
+ if (item["outcome"] == "success") != (item["reason"] is None):
+ raise PublisherError("attempt reason must be null exactly for success")
+ if item["outcome"] == "success" and item["failure_mode"] is not None:
+ raise PublisherError("successful attempt cannot have a failure mode")
+ if (item["outcome"] == "success" and item["selected"]) != (
+ item["series_id"] is not None
+ ):
+ raise PublisherError("attempt series must be present exactly for selected success")
+ if {item["allocation_id"] for item in doc["attempts"]} != allocation_ids:
+ raise PublisherError("promotion allocation catalog differs from attempts")
+ attempt_groups: dict[tuple[str, str], list[dict[str, Any]]] = {}
+ for item in doc["attempts"]:
+ attempt_groups.setdefault((item["case_id"], item["allocation_id"]), []).append(item)
+ for (case_id, allocation_id), group in attempt_groups.items():
+ ordinals = sorted(item["attempt_index"] for item in group)
+ if ordinals != list(range(1, len(group) + 1)):
+ raise PublisherError("public retries must retain contiguous attempt indexes")
+ if any(
+ item["attempt_id"] != identity.attempt_id(
+ allocation=allocation_id, case=case_id, ordinal=item["attempt_index"]
+ )
+ for item in group
+ ):
+ raise PublisherError("public retry identity differs from its case/allocation/index")
+ selected = [item for item in group if item["selected"]]
+ if len(selected) != 1 or selected[0]["attempt_index"] != ordinals[-1]:
+ raise PublisherError("publisher must select the latest retry per case/allocation")
+ selected_by_series: dict[str, list[dict[str, Any]]] = {}
+ for item in doc["attempts"]:
+ if item["selected"] and item["outcome"] == "success":
+ selected_by_series.setdefault(item["series_id"], []).append(item)
+ terminal = 0
+ for item in doc["coverage"]:
+ listed = set(item["attempt_ids"])
+ selected = item["selected_attempt_id"]
+ expected_attempts = {
+ attempt_id for attempt_id, attempt in attempts.items()
+ if attempt["case_id"] == item["case_id"]
+ }
+ if listed != expected_attempts:
+ raise PublisherError("coverage references attempts from another case")
+ if selected is not None:
+ terminal += 1
+ if (selected not in listed or not attempts[selected]["selected"]
+ or any(attempts[selected][field] != item[field]
+ for field in ("outcome", "failure_mode", "reason"))):
+ raise PublisherError("coverage selected outcome differs")
+ selected_candidates = [attempts[value] for value in listed if attempts[value]["selected"]]
+ latest = max(
+ selected_candidates,
+ key=lambda attempt: (
+ int(attempt["run_id"]), attempt["run_attempt"],
+ attempt["attempt_index"], attempt["attempt_id"]
+ ),
+ )
+ if selected != latest["attempt_id"]:
+ raise PublisherError("coverage does not select the latest canonical allocation")
+ if promotion["requested_cases"] != len(doc["coverage"]) or promotion["terminal_cases"] != terminal:
+ raise PublisherError("promotion coverage counts differ")
+ selected_evidence: dict[tuple[str, str], set[str]] = {}
+ for attempt in doc["attempts"]:
+ if attempt["selected"] and attempt["series_id"] is not None:
+ for value in attempt["evidence"]:
+ selected_evidence.setdefault(
+ (attempt["series_id"], value["point_id"]), set()
+ ).add(value["evidence_id"])
+ for item in doc["series"]:
+ eligibility = _eligibility(item["eligibility"], f"series {item['series_id']}")
+ workload = item["workload"]
+ model, hidden, top_k, experts = sweep_matrix.V1_WORKLOAD
+ suite_contract = sweep_matrix.V1_SUITE_CONTRACTS.get(item["suite"])
+ coordinate = (item["phase"], workload["routing"], workload["eplb"])
+ if (
+ item["model"] != model
+ or (workload["hidden"], workload["top_k"], workload["experts"])
+ != (hidden, top_k, experts)
+ or suite_contract is None
+ or coordinate not in suite_contract["coordinates"]
+ or item["publication_tier"] != suite_contract["publication"]
+ ):
+ raise PublisherError("series differs from the frozen v1 workload/suite profile")
+ backend_id = item["backend"]["id"]
+ expected_role = "reference" if backend_id == "nccl-ep" else "library"
+ if (
+ backend_id not in capability.BACKENDS
+ or item["backend"]["label"] != BACKEND_LABELS[backend_id]
+ or item["backend"]["role"] != expected_role
+ or item["backend"]["version"] is None
+ ):
+ raise PublisherError("series backend projection differs from v1")
+ sku = item["system"]["sku"]
+ platform = capability.PLATFORMS.get(sku)
+ ep_size = item["system"]["ep_size"]
+ if platform is None or ep_size % platform["gpus_per_node"]:
+ raise PublisherError("series system projection differs from v1")
+ nodes = ep_size // platform["gpus_per_node"]
+ supported, _ = capability.resolve(
+ sku, backend_id, nodes=nodes,
+ routing=workload["routing"], eplb=workload["eplb"],
+ )
+ if (
+ not supported
+ or item["system"]["vendor"] != platform["vendor"]
+ or item["system"]["transport"] != platform["transport"]
+ or item["system"]["topology_class"] != platform["topology_class"]
+ or item["system"]["world_size"] != ep_size
+ or platform["product"] not in set(
+ re.findall(r"[a-z]+\d+[a-z]*", item["system"]["label"].lower())
+ )
+ ):
+ raise PublisherError("series system projection differs from v1")
+ if contracts.public_series_config_sha256(_public_series_config(item)) != item[
+ "build"
+ ]["public_config_sha256"]:
+ raise PublisherError("public series configuration differs from its commitment")
+ covered = [coverage_by_case.get(case_id) for case_id in item["case_ids"]]
+ if not covered or any(
+ case is None
+ or (case["sku"], case["backend"], case["phase"])
+ != (sku, backend_id, item["phase"])
+ for case in covered
+ ):
+ raise PublisherError("series projection differs from its case coverage")
+ if (
+ item["eplb"]["enabled"] != item["workload"]["eplb"]
+ or item["eplb"]["logical_experts"] != item["workload"]["experts"]
+ ):
+ raise PublisherError("series EPLB descriptor differs from its workload")
+ eplb = item["eplb"]
+ expected_physical = eplb["logical_experts"] + eplb["redundant_experts"]
+ nullable_eplb = (
+ "planner", "mapping_sha256", "reference_tokens_per_rank", "max_replicas",
+ "imbalance_before", "imbalance_after",
+ )
+ if eplb["enabled"]:
+ if (
+ item["workload"]["routing"] != "zipf"
+ or any(eplb[field] is None for field in nullable_eplb)
+ or eplb["planner"] != "greedy-rank-major-v1"
+ or eplb["reference_tokens_per_rank"] != 2048
+ or eplb["redundant_experts"] != 32
+ or eplb["redundant_experts"] % ep_size != 0
+ or eplb["physical_experts"] != expected_physical
+ or eplb["logical_experts"] % ep_size != 0
+ or eplb["physical_experts"] % ep_size != 0
+ or not 1 <= eplb["replicated_experts"] <= min(
+ eplb["logical_experts"], eplb["redundant_experts"]
+ )
+ or not 2 <= eplb["max_replicas"] <= 1 + eplb["redundant_experts"]
+ or not 1 <= eplb["imbalance_after"] <= eplb["imbalance_before"] <= ep_size
+ ):
+ raise PublisherError("enabled EPLB descriptor is incomplete")
+ expected_plan = contracts._expected_eplb_plan(
+ workload["routing"], workload["top_k"],
+ eplb["logical_experts"], eplb["physical_experts"], ep_size,
+ identity.V1_CASE_PROFILE["seed"],
+ identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"],
+ )
+ expected_eplb = {
+ "enabled": True,
+ "planner": identity.V1_CASE_PROFILE["eplb_planner"],
+ "mapping_sha256": contracts.eplb_contract.mapping_hash(expected_plan),
+ "logical_experts": eplb["logical_experts"],
+ "physical_experts": eplb["physical_experts"],
+ "redundant_experts": identity.V1_CASE_PROFILE["eplb_redundant_experts"],
+ "reference_tokens_per_rank": identity.V1_CASE_PROFILE[
+ "eplb_reference_tokens_per_rank"
+ ],
+ "replicated_experts": expected_plan["replicated_experts"],
+ "max_replicas": expected_plan["max_replicas"],
+ "imbalance_before": expected_plan["imbalance_before"],
+ "imbalance_after": expected_plan["imbalance_after"],
+ }
+ if eplb != expected_eplb:
+ raise PublisherError("enabled EPLB descriptor differs from deterministic plan")
+ elif (
+ any(eplb[field] is not None for field in nullable_eplb)
+ or eplb["physical_experts"] != expected_physical
+ or eplb["redundant_experts"] != 0
+ or eplb["replicated_experts"] != 0
+ ):
+ raise PublisherError("disabled EPLB descriptor claims a plan")
+ if item["backend"]["id"] == "nccl-ep":
+ expected_generation = (
+ "nccl" if item["system"]["vendor"] == "nvidia" else "rccl"
+ )
+ if item["backend"]["generation"] != expected_generation:
+ raise PublisherError("NCCL/RCCL reference generation differs from system vendor")
+ if (item["status"] == "decision-grade") != eligibility["decision_grade"]:
+ raise PublisherError("series status differs from eligibility")
+ if (
+ set(eligibility["allocation_ids"]) != set(item["allocation_ids"])
+ or eligibility["correct"] != all(point["correct"] for point in item["points"])
+ ):
+ raise PublisherError("series eligibility differs from its evidence")
+ selected_attempts = selected_by_series.get(item["series_id"], [])
+ if (
+ set(item["case_ids"]) != {attempt["case_id"] for attempt in selected_attempts}
+ or set(item["allocation_ids"])
+ != {attempt["allocation_id"] for attempt in selected_attempts}
+ ):
+ raise PublisherError("series case/allocation catalog differs from selected attempts")
+ if item["eligibility"]["decision_grade"] and len(
+ {attempt["run_id"] for attempt in selected_attempts}
+ ) < REQUIRED_ALLOCATIONS:
+ raise PublisherError("decision-grade series lacks independent workflow runs")
+ tokens = [point["tokens_per_rank"] for point in item["points"]]
+ if tokens != sorted(set(tokens)):
+ raise PublisherError("series points are not in unique ascending token order")
+ if len(item["case_ids"]) != 1:
+ raise PublisherError("public series must represent exactly one v1 case")
+ case_id = item["case_ids"][0]
+ if identity.digest("case", _public_case_factors(item)) != case_id:
+ raise PublisherError("public series projection differs from its case identity")
+ build = item["build"]
+ expected_series_id = identity.series_id({
+ "backend": backend_id,
+ "case_id": case_id,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build[
+ "implementation_contract_sha256"
+ ],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": workload["workload_id"],
+ })
+ if item["series_id"] != expected_series_id:
+ raise PublisherError("public series identity differs from its committed factors")
+ for point in item["points"]:
+ if point["point_id"] != identity.point_id(series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]):
+ raise PublisherError("point identity differs")
+ if point["global_tokens"] != point["tokens_per_rank"] * item["system"]["ep_size"]:
+ raise PublisherError("global_tokens must use EP size")
+ routing = point["routing"]
+ max_fanout = min(item["workload"]["top_k"], item["system"]["ep_size"])
+ if (
+ routing["routed_copies"] < point["global_tokens"]
+ or routing["routed_copies"] > point["global_tokens"] * max_fanout
+ or routing["recv_tokens_max"] > routing["routed_copies"]
+ or routing["recv_tokens_max"] * item["system"]["ep_size"]
+ < routing["routed_copies"]
+ or not math.isclose(
+ routing["fanout_mean"],
+ routing["routed_copies"] / point["global_tokens"],
+ rel_tol=1e-12,
+ )
+ or routing["hotspot_ratio"] < 1
+ or routing["empty_expert_count"] >= eplb["physical_experts"]
+ or routing["empty_rank_count"] >= item["system"]["ep_size"]
+ ):
+ raise PublisherError("point routing/load facts are internally inconsistent")
+ expected_evidence = selected_evidence.get(
+ (item["series_id"], point["point_id"]), set()
+ )
+ if set(point["evidence_ids"]) != expected_evidence:
+ raise PublisherError("point evidence differs from selected series attempts")
+ components = point["components"]
+ if (components["dispatch"] is None) != (components["combine"] is None):
+ raise PublisherError("dispatch/combine availability differs")
+ for name, component in components.items():
+ if component is None:
+ continue
+ expected_origin = "derived" if name == "isolated_sum" else "measured"
+ expected_samples = None if name == "isolated_sum" else 512
+ if component["origin"] != expected_origin or component["sample_count"] != expected_samples:
+ raise PublisherError(f"{name} origin or sample count differs")
+ if name == "isolated_sum" and (
+ component["logical_bytes"] is not None
+ or component["logical_payload_rate_gbps_at_latency_percentile"] is not None
+ ):
+ raise PublisherError("isolated_sum cannot publish logical bandwidth")
+ if name != "isolated_sum" and (
+ component["logical_bytes"] is None
+ or component["logical_payload_rate_gbps_at_latency_percentile"] is None
+ ):
+ raise PublisherError(f"{name} measured logical bandwidth is missing")
+ latency = component["latency_us"]
+ if list(latency.values()) != sorted(latency.values()):
+ raise PublisherError("latency percentiles are not ordered")
+ if component["logical_payload_rate_gbps_at_latency_percentile"] is not None:
+ for statistic, rate in component["logical_payload_rate_gbps_at_latency_percentile"].items():
+ expected = component["logical_bytes"] / (latency[statistic] * 1000.0)
+ if not math.isclose(rate, expected, rel_tol=1e-9, abs_tol=1e-12):
+ raise PublisherError("logical GB/s formula differs")
+ if components["roundtrip"] is None or components["roundtrip"]["origin"] != "measured":
+ raise PublisherError("roundtrip must be measured")
+ for statistic, throughput in point["roundtrip_token_rate_at_latency_percentile"].items():
+ expected = point["global_tokens"] / (
+ components["roundtrip"]["latency_us"][statistic] * 1e-6
+ )
+ if not math.isclose(throughput, expected, rel_tol=1e-9):
+ raise PublisherError("roundtrip token throughput formula differs")
+ if components["dispatch"] is not None:
+ derived = components["isolated_sum"]
+ if derived is None or any(not math.isclose(
+ derived["latency_us"][statistic],
+ components["dispatch"]["latency_us"][statistic]
+ + components["combine"]["latency_us"][statistic], rel_tol=1e-12
+ ) for statistic in ("p50", "p90", "p95", "p99")):
+ raise PublisherError("isolated_sum is not the component percentile sum")
+ elif components["isolated_sum"] is not None:
+ raise PublisherError("isolated_sum requires measured dispatch/combine components")
+ cohorts = {item["cohort_id"]: item for item in doc["cohorts"]}
+ if len(cohorts) != len(doc["cohorts"]):
+ raise PublisherError("dataset has duplicate cohort IDs")
+ for item in doc["cohorts"]:
+ if not set(item["series_ids"]).issubset(series):
+ raise PublisherError("cohort references unknown series")
+ members = [series[series_id] for series_id in item["series_ids"]]
+ expected_tier = (
+ "comparable-experimental"
+ if any(member["publication_tier"] == "comparable-experimental" for member in members)
+ else "official"
+ )
+ if item["publication_tier"] != expected_tier:
+ raise PublisherError("cohort publication tier differs from its members")
+ roles = {member["backend"]["role"] for member in members}
+ if item["kind"] == "library" and roles != {"library"}:
+ raise PublisherError("library cohort contains non-library evidence")
+ if item["kind"] == "system" and roles != {"reference"}:
+ raise PublisherError("system cohort is not a portable reference comparison")
+ if item["kind"] in {"chip", "routing"} and len(
+ {_canonical(member["backend"]) for member in members}
+ ) != 1:
+ raise PublisherError(f"{item['kind']} cohort mixes backend implementations")
+ public_factors = [_public_cohort_factors(item["kind"], member) for member in members]
+ if len({_canonical(value[0]) for value in public_factors}) != 1:
+ raise PublisherError(f"{item['kind']} cohort does not control its public factors")
+ if len({_canonical(value[1]) for value in public_factors}) < 2:
+ raise PublisherError(f"{item['kind']} cohort does not vary its declared contrast")
+ if item["kind"] == "routing":
+ if item["publication_tier"] != "comparable-experimental":
+ raise PublisherError("routing cohort must be experimental")
+ has_baseline = sum(
+ member["workload"]["routing"] == "uniform"
+ and not member["workload"]["eplb"]
+ for member in members
+ ) == 1
+ missing_reason = "missing-uniform-baseline" in item["eligibility"]["reasons"]
+ if has_baseline == missing_reason:
+ raise PublisherError("routing baseline and eligibility reason disagree")
+ mismatch = _routing_implementation_mismatch(members)
+ mismatch_reason = "implementation-config-mismatch" in item["eligibility"]["reasons"]
+ if mismatch != mismatch_reason:
+ raise PublisherError("routing implementation control and eligibility disagree")
+ expected_id = _derived_id("cxcohort-v1-", {
+ "kind": item["kind"], "series_ids": item["series_ids"],
+ "controlled_factors": item["controlled_factors"],
+ "varying_factors": item["varying_factors"],
+ })
+ if item["cohort_id"] != expected_id:
+ raise PublisherError("cohort ID differs from its public factors")
+ expected_factors = {
+ "library": (
+ ["system", "workload", "phase", "measurement", "resource.mode", "source"],
+ ["backend", "resource"],
+ ),
+ "chip": (
+ ["backend", "source", "workload", "phase", "measurement", "resource.mode"],
+ ["system", "resource"],
+ ),
+ "system": (
+ ["workload", "phase", "measurement", "source"],
+ ["system", "backend", "resource"],
+ ),
+ "routing": (
+ ["backend", "implementation-static-build", "system", "model-shape", "phase", "measurement", "resource"],
+ ["workload.routing", "workload.eplb", "implementation-config"],
+ ),
+ }[item["kind"]]
+ member_allocations = {
+ allocation for series_id in item["series_ids"]
+ for allocation in series[series_id]["allocation_ids"]
+ }
+ if (
+ (item["controlled_factors"], item["varying_factors"]) != expected_factors
+ or set(item["eligibility"]["allocation_ids"]) != member_allocations
+ ):
+ raise PublisherError("cohort factors or allocations differ from its members")
+ _eligibility(item["eligibility"], f"cohort {item['cohort_id']}")
+ expected_ranking_keys: set[tuple[str, str, str, int]] = set()
+ for cohort in doc["cohorts"]:
+ if not cohort["eligibility"]["decision_grade"]:
+ continue
+ members = [series[series_id] for series_id in cohort["series_ids"]]
+ tokens = set.intersection(*(
+ {point["tokens_per_rank"] for point in member["points"]}
+ for member in members
+ ))
+ expected_ranking_keys.update(
+ (cohort["cohort_id"], measure, statistic, token)
+ for token in tokens
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile")
+ for statistic in ("p50", "p99")
+ )
+ ranking_top: dict[tuple[str, str, str, int], dict[str, Any]] = {}
+ ranking_ids: set[str] = set()
+ for ranking in doc["rankings"]:
+ cohort = cohorts.get(ranking["cohort_id"])
+ if (
+ cohort is None
+ or not cohort["eligibility"]["decision_grade"]
+ or ranking["eligibility"] != cohort["eligibility"]
+ or ranking["publication_tier"] != cohort["publication_tier"]
+ ):
+ raise PublisherError("ranking references an ineligible cohort")
+ entries = ranking["entries"]
+ _validate_metric(ranking["metric"])
+ if cohort["kind"] == "library" and any(
+ series[series_id]["backend"]["role"] == "reference"
+ for series_id in cohort["series_ids"]
+ ):
+ raise PublisherError("reference evidence cannot drive a library ranking")
+ if {entry["series_id"] for entry in entries} != set(cohort["series_ids"]):
+ raise PublisherError("ranking does not cover its cohort")
+ for entry in entries:
+ point_id, value, unit = _metric_value(series[entry["series_id"]], ranking["metric"])
+ if entry["point_id"] != point_id or entry["unit"] != unit or not math.isclose(entry["value"], value, rel_tol=1e-12):
+ raise PublisherError("ranking entry differs from series data")
+ reverse = ranking["metric"]["objective"] == "max"
+ expected = sorted(entries, key=lambda entry: (entry["value"], entry["series_id"]), reverse=reverse)
+ if entries != expected or [entry["rank"] for entry in entries] != list(range(1, len(entries) + 1)):
+ raise PublisherError("ranking order differs")
+ metric = ranking["metric"]
+ expected_id = _derived_id("cxranking-v1-", {
+ "cohort_id": ranking["cohort_id"], "metric": metric,
+ })
+ if ranking["ranking_id"] != expected_id or expected_id in ranking_ids:
+ raise PublisherError("ranking ID is duplicate or differs")
+ ranking_ids.add(expected_id)
+ ranking_top[(ranking["cohort_id"], metric["measure"], metric["statistic"], metric["tokens_per_rank"])] = entries[0]
+ if set(ranking_top) != expected_ranking_keys:
+ raise PublisherError("rankings do not cover every eligible cohort metric")
+ objective = {
+ "min-p50-latency": ("latency_us", "p50"), "min-p99-latency": ("latency_us", "p99"),
+ "max-payload-rate-at-p50-latency": (
+ "logical_payload_rate_gbps_at_latency_percentile", "p50"
+ ),
+ "max-payload-rate-at-p99-latency": (
+ "logical_payload_rate_gbps_at_latency_percentile", "p99"
+ ),
+ }
+ recommendation_ids: set[str] = set()
+ for item in doc["recommendations"]:
+ measure, statistic = objective[item["objective"]]
+ candidates = [top for key, top in ranking_top.items()
+ if key[:3] == (item["cohort_id"], measure, statistic) and top["point_id"] == item["point_id"]]
+ if len(candidates) != 1 or any(item[field] != candidates[0][field] for field in ("series_id", "point_id", "value", "unit")):
+ raise PublisherError("recommendation is not a ranking winner")
+ matching_ranking = next(
+ ranking for ranking in doc["rankings"]
+ if ranking["cohort_id"] == item["cohort_id"]
+ and ranking["metric"]["measure"] == measure
+ and ranking["metric"]["statistic"] == statistic
+ and ranking["entries"][0]["point_id"] == item["point_id"]
+ )
+ expected_id = _derived_id("cxrecommendation-v1-", {
+ "objective": item["objective"], "ranking_id": matching_ranking["ranking_id"],
+ })
+ cohort = cohorts[item["cohort_id"]]
+ if (item["recommendation_id"] != expected_id or expected_id in recommendation_ids
+ or cohort["publication_tier"] != "official"
+ or item["publication_tier"] != "official"
+ or item["eligibility"] != cohort["eligibility"]):
+ raise PublisherError("recommendation ID/eligibility differs")
+ recommendation_ids.add(expected_id)
+ expected_recommendations = sum(
+ cohorts[ranking["cohort_id"]]["publication_tier"] == "official"
+ for ranking in doc["rankings"]
+ )
+ if len(doc["recommendations"]) != expected_recommendations:
+ raise PublisherError("recommendations do not cover every actionable ranking")
+ sensitivity_ids: set[str] = set()
+ sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set()
+ for item in doc["sensitivities"]:
+ cohort = cohorts.get(item["cohort_id"])
+ if (
+ cohort is None
+ or cohort["kind"] != "routing"
+ or not cohort["eligibility"]["decision_grade"]
+ or item["publication_tier"] != cohort["publication_tier"]
+ or item["eligibility"] != cohort["eligibility"]
+ ):
+ raise PublisherError("sensitivity references a non-routing cohort")
+ if (
+ item["baseline_series_id"] == item["candidate_series_id"]
+ or not {item["baseline_series_id"], item["candidate_series_id"]}.issubset(cohort["series_ids"])
+ ):
+ raise PublisherError("sensitivity series differ from its routing cohort")
+ _validate_metric(item["metric"])
+ baseline_series = series[item["baseline_series_id"]]
+ if (
+ baseline_series["workload"]["routing"] != "uniform"
+ or baseline_series["workload"]["eplb"]
+ ):
+ raise PublisherError("sensitivity baseline is not uniform without EPLB")
+ _, baseline, _ = _metric_value(series[item["baseline_series_id"]], item["metric"])
+ _, candidate, _ = _metric_value(series[item["candidate_series_id"]], item["metric"])
+ if not math.isclose(item["signed_change_ratio"], (candidate - baseline) / baseline, rel_tol=1e-12):
+ raise PublisherError("sensitivity ratio differs")
+ expected_id = _derived_id("cxsensitivity-v1-", {
+ "baseline": item["baseline_series_id"],
+ "candidate": item["candidate_series_id"],
+ "cohort": item["cohort_id"], "metric": item["metric"],
+ })
+ if item["sensitivity_id"] != expected_id or expected_id in sensitivity_ids:
+ raise PublisherError("sensitivity ID is duplicate or differs")
+ sensitivity_ids.add(expected_id)
+ sensitivity_keys.add((
+ item["cohort_id"], item["baseline_series_id"], item["candidate_series_id"],
+ item["metric"]["measure"], item["metric"]["statistic"],
+ item["metric"]["tokens_per_rank"],
+ ))
+ expected_sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set()
+ for cohort in doc["cohorts"]:
+ if cohort["kind"] != "routing" or not cohort["eligibility"]["decision_grade"]:
+ continue
+ members = [series[series_id] for series_id in cohort["series_ids"]]
+ baseline = next((
+ member for member in members
+ if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]
+ ), None)
+ if baseline is None:
+ continue
+ tokens = set.intersection(*(
+ {point["tokens_per_rank"] for point in member["points"]}
+ for member in members
+ ))
+ expected_sensitivity_keys.update(
+ (cohort["cohort_id"], baseline["series_id"], candidate["series_id"],
+ measure, statistic, token)
+ for candidate in members if candidate is not baseline
+ for token in tokens
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile")
+ for statistic in ("p50", "p99")
+ )
+ if sensitivity_keys != expected_sensitivity_keys:
+ raise PublisherError("sensitivities do not cover every routing contrast metric")
+ if promotion["status"] == "promoted":
+ run_ids = {item["run_id"] for item in doc["attempts"] if item["selected"]}
+ repeated_cases = all(
+ len({
+ attempts[attempt_id]["run_id"]
+ for attempt_id in coverage["attempt_ids"]
+ if attempts[attempt_id]["selected"]
+ }) == REQUIRED_ALLOCATIONS
+ for coverage in doc["coverage"]
+ )
+ if promotion["matrix_id"] != CANONICAL_FULL_V1_MATRIX_SHA256:
+ raise PublisherError("promotion requires the canonical full-v1 matrix")
+ if (
+ _case_disposition_catalog_sha256(doc["coverage"])
+ != CANONICAL_FULL_V1_CASE_CATALOG_SHA256
+ ):
+ raise PublisherError("promotion requires the canonical case/disposition catalog")
+ if (
+ terminal != len(doc["coverage"])
+ or len(doc["source_bundle_ids"]) != REQUIRED_ALLOCATIONS
+ or len(run_ids) != REQUIRED_ALLOCATIONS
+ or not repeated_cases
+ ):
+ raise PublisherError("promoted dataset lacks complete coverage")
+ expected_outcomes = {
+ item["case_id"]: (
+ "success" if item["disposition"] == "runnable" else "unsupported"
+ )
+ for item in doc["coverage"]
+ }
+ if any(
+ item["selected"]
+ and item["outcome"] != expected_outcomes[item["case_id"]]
+ for item in doc["attempts"]
+ ):
+ raise PublisherError("promoted outcomes differ from requested dispositions")
+ runnable_cases = {
+ item["case_id"] for item in doc["coverage"]
+ if item["disposition"] == "runnable"
+ }
+ if any(
+ item["case_id"] in runnable_cases and item["outcome"] != "success"
+ for item in doc["attempts"]
+ ):
+ raise PublisherError(
+ "promotion rejects runnable cases with failed, invalid, or diagnostic retries"
+ )
+ _require_promotion_series(doc["series"])
+ _require_promotion_cohorts(doc["cohorts"], doc["series"])
+ if not doc["rankings"] or not doc["recommendations"]:
+ raise PublisherError("promoted dataset lacks eligible decisions")
+ if promotion["status"] == "quarantined" and any((
+ doc["source_bundle_ids"], promotion["allocation_ids"], doc["coverage"],
+ doc["attempts"], doc["series"], doc["cohorts"], doc["rankings"],
+ doc["recommendations"], doc["sensitivities"],
+ )):
+ raise PublisherError("quarantined dataset exposes unvalidated evidence")
+ return doc
+
+
+def _file_record(value: Any, path: str) -> dict[str, Any]:
+ item = _exact(value, {"path", "sha256", "bytes"}, path)
+ if not isinstance(item["path"], str) or PurePosixPath(item["path"]).is_absolute() or ".." in PurePosixPath(item["path"]).parts:
+ raise PublisherError(f"{path}.path is unsafe")
+ if not isinstance(item["sha256"], str) or HEX64.fullmatch(item["sha256"]) is None:
+ raise PublisherError(f"{path}.sha256 is invalid")
+ _integer(item["bytes"], f"{path}.bytes", minimum=1)
+ return item
+
+def validate_bundle_manifest(doc: Any) -> dict[str, Any]:
+ _schema("private-bundle-v1.schema.json", doc)
+ attempts = {item["attempt_id"]: item for item in doc["attempts"]}
+ if len(attempts) != len(doc["attempts"]):
+ raise PublisherError("bundle has duplicate attempt IDs")
+ selections = doc["coverage"]["selections"]
+ if len({item["case_id"] for item in selections}) != len(selections):
+ raise PublisherError("bundle has duplicate selected cases")
+ counts = {name: 0 for name in OUTCOMES}
+ for selection in selections:
+ attempt = attempts.get(selection["selected_attempt_id"])
+ if attempt is None or not attempt["selected"] or attempt["case_id"] != selection["case_id"] or attempt["outcome"] != selection["outcome"]:
+ raise PublisherError("bundle selection differs from retained attempt")
+ counts[selection["outcome"]] += 1
+ coverage = doc["coverage"]
+ if coverage["terminal_cases"] != len(selections) or coverage["outcome_counts"] != counts:
+ raise PublisherError("bundle terminal counts differ")
+ if coverage["complete"] != (coverage["expected_cases"] == len(selections)):
+ raise PublisherError("bundle completeness differs from coverage")
+ fingerprints: dict[str, set[str]] = {}
+ for attempt in doc["attempts"]:
+ value = attempt["runtime_fingerprint_sha256"]
+ if value:
+ fingerprints.setdefault(attempt["allocation_id"], set()).add(value)
+ if any(len(values) != 1 for values in fingerprints.values()):
+ raise PublisherError("bundle runtime is heterogeneous within an allocation")
+ return doc
+
+
+def _fsync_dir(path: Path) -> None:
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_DIRECTORY", 0))
+ try:
+ os.fsync(descriptor)
+ finally:
+ os.close(descriptor)
+
+
+def _write_bytes(path: Path, data: bytes, *, mode: int) -> None:
+ descriptor = os.open(
+ path,
+ os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0),
+ mode,
+ )
+ try:
+ os.fchmod(descriptor, mode)
+ with os.fdopen(descriptor, "wb", closefd=False) as handle:
+ handle.write(data)
+ handle.flush()
+ os.fsync(handle.fileno())
+ finally:
+ os.close(descriptor)
+
+
+def _write_all(descriptor: int, data: bytes) -> None:
+ view = memoryview(data)
+ while view:
+ view = view[os.write(descriptor, view):]
+
+
+def _write_json(path: Path, value: Any, *, mode: int) -> bytes:
+ data = _canonical(value) + b"\n"
+ _write_bytes(path, data, mode=mode)
+ return data
+
+
+def _file_metadata(path: Path, relative_to: Path) -> dict[str, Any]:
+ return {
+ "path": path.relative_to(relative_to).as_posix(),
+ "sha256": _sha_file(path),
+ "bytes": path.stat().st_size,
+ }
+
+
+def _tree_files(root: Path) -> list[Path]:
+ return sorted(
+ path for path in root.rglob("*")
+ if path.is_file() and not path.is_symlink() and path.name != "COMPLETE"
+ )
+
+
+def _verify_regular_file(path: Path, expected_mode: int) -> None:
+ _reject_symlinked_path(path.parent)
+ try:
+ metadata = os.lstat(path)
+ except FileNotFoundError as exc:
+ raise PublisherError(f"required file is missing: {path.name}") from exc
+ if (
+ not stat.S_ISREG(metadata.st_mode)
+ or metadata.st_uid != os.getuid()
+ or stat.S_IMODE(metadata.st_mode) != expected_mode
+ ):
+ raise PublisherError(
+ f"file is not an owned regular {expected_mode:o} object: {path.name}"
+ )
+
+
+def _verify_frozen_tree(root: Path, *, private: bool) -> None:
+ _reject_symlinked_path(root)
+ directory_mode = 0o500 if private else 0o555
+ file_mode = 0o400 if private else 0o444
+ try:
+ root_metadata = os.lstat(root)
+ except OSError as exc:
+ raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc
+ if not stat.S_ISDIR(root_metadata.st_mode):
+ raise PublisherError(f"immutable object is not a real directory: {root.name}")
+ try:
+ entries = [root, *root.rglob("*")]
+ except OSError as exc:
+ raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc
+ for path in entries:
+ metadata = os.lstat(path)
+ if metadata.st_uid != os.getuid():
+ raise PublisherError(f"immutable object has the wrong owner: {path.name}")
+ if stat.S_ISDIR(metadata.st_mode):
+ expected = directory_mode
+ elif stat.S_ISREG(metadata.st_mode):
+ expected = file_mode
+ else:
+ raise PublisherError(f"immutable object contains a linked or special entry: {path.name}")
+ if stat.S_IMODE(metadata.st_mode) != expected:
+ raise PublisherError(
+ f"immutable object mode differs for {path.name}: expected {expected:o}"
+ )
+
+
+def _freeze_tree(root: Path, *, private: bool) -> None:
+ """Make every file and directory under ``root`` read-only, then verify the result.
+
+ Regular files become 0o400 (private) or 0o444 (public); directories,
+ including ``root`` itself, become 0o500 or 0o555. Entries are classified
+ with ``os.lstat``; anything that is neither a directory nor a regular file
+ raises ``PublisherError``.
+ """
+ files: list[Path] = []
+ directories = [root]
+ for path in root.rglob("*"):
+ metadata = os.lstat(path)
+ if stat.S_ISDIR(metadata.st_mode):
+ directories.append(path)
+ elif stat.S_ISREG(metadata.st_mode):
+ files.append(path)
+ else:
+ raise PublisherError(f"immutable object contains a linked or special entry: {path.name}")
+ for path in files:
+ os.chmod(path, 0o400 if private else 0o444)
+ # Paths with the most components come first, so a directory is handled after
+ # the directories beneath it.
+ for path in sorted(directories, key=lambda item: len(item.parts), reverse=True):
+ os.chmod(path, 0o500 if private else 0o555)
+ _fsync_dir(path)
+ # Re-read the tree to confirm owner, entry types, and modes.
+ _verify_frozen_tree(root, private=private)
+
+
+def _reject_symlinked_path(path: Path) -> None:
+ current = Path(path.anchor)
+ for part in path.parts[1:]:
+ current /= part
+ try:
+ metadata = os.lstat(current)
+ except FileNotFoundError:
+ break
+ if stat.S_ISLNK(metadata.st_mode):
+ raise PublisherError("COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent")
+ if not stat.S_ISDIR(metadata.st_mode):
+ raise PublisherError(f"store path component is not a directory: {current}")
+
+
+class Store:
+ """Atomic private/public directory operations on one operator filesystem."""
+
+ def __init__(self, root: str | os.PathLike[str]):
+ """Create or adopt the store layout under ``root`` and set its permissions.
+
+ Raises ``PublisherError`` when the root is reached through a symlink, is
+ not owned by the current user, is group/world writable, does not end up
+ with mode 750, or when a layout path is not a real directory.
+ """
+ candidate = Path(os.path.abspath(os.path.expanduser(root)))
+ _reject_symlinked_path(candidate)
+ candidate.mkdir(parents=True, exist_ok=True, mode=0o750)
+ # A resolved path that differs from the absolute one is rejected as a
+ # symlinked route to the root.
+ resolved = candidate.resolve()
+ if candidate != resolved:
+ raise PublisherError(
+ "COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent"
+ )
+ root_metadata = candidate.stat()
+ if root_metadata.st_uid != os.getuid() or stat.S_IMODE(root_metadata.st_mode) & 0o022:
+ raise PublisherError(
+ "COLLECTIVEX_STORE_ROOT must be owned by this user and not group/world writable"
+ )
+ os.chmod(candidate, 0o750)
+ if stat.S_IMODE(candidate.stat().st_mode) != 0o750:
+ raise PublisherError("COLLECTIVEX_STORE_ROOT mode must be 750")
+ self.root = resolved
+ raw = self.root
+ self.private = raw / "private"
+ self.incoming = self.private / "incoming"
+ self.bundles = self.private / "bundles"
+ self.quarantine = self.private / "quarantine"
+ self.public = raw / "public"
+ self.datasets = self.public / "datasets"
+ self.channels = self.public / "channels"
+ self.locks = raw / "locks"
+ # Private areas and the lock directory are owner-only (700); the public
+ # areas are 755.
+ for path, mode in (
+ (self.private, 0o700), (self.incoming, 0o700), (self.bundles, 0o700),
+ (self.quarantine, 0o700), (self.public, 0o755), (self.datasets, 0o755),
+ (self.channels, 0o755), (self.locks, 0o700),
+ ):
+ path.mkdir(parents=True, exist_ok=True, mode=mode)
+ if path.is_symlink() or not path.is_dir():
+ raise PublisherError(f"store path is not a real directory: {path}")
+ os.chmod(path, mode)
+
+ @contextlib.contextmanager
+ def locked(self) -> Iterator[None]:
+ """Hold an exclusive ``flock`` on ``locks/publisher.lock`` for the with-block.
+
+ The lock file is created if needed and must be an owned regular file
+ with mode 600, otherwise ``PublisherError`` is raised.
+ """
+ lock_path = self.locks / "publisher.lock"
+ descriptor = os.open(
+ lock_path,
+ os.O_RDWR | os.O_CREAT | getattr(os, "O_NOFOLLOW", 0),
+ 0o600,
+ )
+ try:
+ os.fchmod(descriptor, 0o600)
+ # Inspect the opened descriptor itself rather than the path.
+ metadata = os.fstat(descriptor)
+ if (
+ not stat.S_ISREG(metadata.st_mode)
+ or metadata.st_uid != os.getuid()
+ or stat.S_IMODE(metadata.st_mode) != 0o600
+ ):
+ raise PublisherError("publisher lock is not an owned regular 600 file")
+ fcntl.flock(descriptor, fcntl.LOCK_EX)
+ yield
+ finally:
+ fcntl.flock(descriptor, fcntl.LOCK_UN)
+ os.close(descriptor)
+
+ @contextlib.contextmanager
+ def staging(self, parent: Path, *, private: bool) -> Iterator[Path]:
+ """Yield a fresh ``.staging-`` directory under ``parent``.
+
+ On exit the directory is removed if it still exists at its original
+ path.
+ """
+ stage = Path(tempfile.mkdtemp(prefix=".staging-", dir=parent))
+ os.chmod(stage, 0o700 if private else 0o755)
+ try:
+ yield stage
+ finally:
+ if stage.exists():
+ # Restore owner write permission first so read-only entries can
+ # be deleted.
+ for path in stage.rglob("*"):
+ metadata = os.lstat(path)
+ if stat.S_ISDIR(metadata.st_mode):
+ os.chmod(path, 0o700)
+ elif stat.S_ISREG(metadata.st_mode):
+ os.chmod(path, 0o600)
+ os.chmod(stage, 0o700)
+ shutil.rmtree(stage, ignore_errors=True)
+
+ @staticmethod
+ def complete(stage: Path, value: str, *, private: bool) -> None:
+ """Write the ``COMPLETE`` marker containing ``value`` and fsync ``stage``."""
+ _write_bytes(stage / "COMPLETE", (value + "\n").encode(), mode=0o600 if private else 0o644)
+ _fsync_dir(stage)
+
+ @staticmethod
+ def install(stage: Path, destination: Path, *, private: bool) -> None:
+ """Freeze ``stage`` and rename it to ``destination``.
+
+ When ``destination`` already exists it is verified instead: it must be
+ a frozen tree whose ``COMPLETE`` marker holds the destination's own
+ name. In that case ``stage`` is left untouched.
+ """
+ if destination.is_symlink():
+ raise PublisherError(f"immutable destination is a symlink: {destination.name}")
+ if destination.exists():
+ _verify_frozen_tree(destination, private=private)
+ marker = destination / "COMPLETE"
+ if not marker.is_file() or marker.read_text().strip() != destination.name:
+ raise PublisherError(f"immutable destination is incomplete: {destination.name}")
+ return
+ _freeze_tree(stage, private=private)
+ os.rename(stage, destination)
+ _fsync_dir(destination.parent)
+ _verify_frozen_tree(destination, private=private)
+
+ def install_dataset(self, dataset: dict[str, Any]) -> tuple[str, int]:
+ """Validate, serialize, and install ``dataset``; return ``(digest, size)``.
+
+ The dataset is stored as ``datasets/<digest>/dataset.json``, where the
+ digest is ``_sha_bytes`` of the canonical payload. The stored file is
+ re-hashed after installation.
+ """
+ validate_public_dataset(dataset)
+ payload = _canonical(dataset) + b"\n"
+ if len(payload) > MAX_PUBLIC_DATASET_BYTES:
+ raise PublisherError("public dataset exceeds the serving size limit")
+ digest = _sha_bytes(payload)
+ destination = self.datasets / digest
+ with self.staging(self.datasets, private=False) as stage:
+ _write_bytes(stage / "dataset.json", payload, mode=0o644)
+ self.complete(stage, digest, private=False)
+ self.install(stage, destination, private=False)
+ stored = destination / "dataset.json"
+ marker = destination / "COMPLETE"
+ if (not marker.is_file() or marker.read_text().strip() != digest
+ or _sha_file(stored) != digest or stored.stat().st_size != len(payload)):
+ raise PublisherError("stored dataset checksum differs after installation")
+ return digest, len(payload)
+
+ def update_channel(self, channel: str, digest: str, size: int, generated_at: str) -> None:
+ """Point ``channels/<channel>.json`` at the installed dataset ``digest``.
+
+ The stored dataset is re-verified and re-validated first, and
+ ``dev-latest`` only accepts a dataset whose promotion status is
+ ``promoted``.
+ """
+ if size > MAX_PUBLIC_DATASET_BYTES:
+ raise PublisherError("channel dataset exceeds the serving size limit")
+ _verify_frozen_tree(self.datasets / digest, private=False)
+ marker = self.datasets / digest / "COMPLETE"
+ if not marker.is_file() or marker.read_text().strip() != digest:
+ raise PublisherError("cannot advance a channel to an incomplete dataset")
+ dataset_path = self.datasets / digest / "dataset.json"
+ dataset = validate_public_dataset(strict_load(dataset_path))
+ if (
+ _sha_file(dataset_path) != digest
+ or dataset_path.stat().st_size != size
+ or dataset["generated_at"] != generated_at
+ ):
+ raise PublisherError("channel metadata differs from its stored dataset")
+ if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted":
+ raise PublisherError("dev-latest may only reference a promoted dataset")
+ pointer = {
+ "format": FORMAT_CHANNEL,
+ "channel": channel,
+ "dataset": {
+ "path": f"datasets/{digest}/dataset.json",
+ "sha256": digest,
+ "bytes": size,
+ },
+ "generated_at": generated_at,
+ }
+ validate_channel(pointer, expected_channel=channel)
+ destination = self.channels / f"{channel}.json"
+ temporary = self.channels / f".{channel}.tmp-{os.getpid()}"
+ # Write the pointer to a per-process temporary file, then swap it over
+ # the channel file with os.replace.
+ try:
+ data = _canonical(pointer) + b"\n"
+ _write_bytes(temporary, data, mode=0o644)
+ os.replace(temporary, destination)
+ _fsync_dir(self.channels)
+ finally:
+ temporary.unlink(missing_ok=True)
+
+ def verify_channel(self, channel: str) -> dict[str, Any]:
+ """Re-validate a channel pointer and the dataset it references; return the pointer."""
+ channel_path = self.channels / f"{channel}.json"
+ _verify_regular_file(channel_path, 0o644)
+ pointer = validate_channel(strict_load(channel_path), expected_channel=channel)
+ target = self.public / pointer["dataset"]["path"]
+ _verify_frozen_tree(target.parent, private=False)
+ if target.stat().st_size != pointer["dataset"]["bytes"] or _sha_file(target) != pointer["dataset"]["sha256"]:
+ raise PublisherError(f"channel {channel} dataset checksum differs")
+ marker = target.parent / "COMPLETE"
+ if not marker.is_file() or marker.read_text().strip() != pointer["dataset"]["sha256"]:
+ raise PublisherError(f"channel {channel} dataset is incomplete")
+ dataset = validate_public_dataset(strict_load(target))
+ if pointer["generated_at"] != dataset["generated_at"]:
+ raise PublisherError(f"channel {channel} metadata differs from its dataset")
+ if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted":
+ raise PublisherError("dev-latest points to a non-promoted dataset")
+ return pointer
+
+
+def _copy_source(source: Path, destination: Path) -> None:
+ if source.is_symlink() or not source.is_file() or not stat.S_ISREG(source.stat().st_mode):
+ raise PublisherError(f"source must be a regular non-symlink file: {source}")
+ descriptor = os.open(source, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ output = os.open(destination, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+ try:
+ while True:
+ chunk = os.read(descriptor, 1024 * 1024)
+ if not chunk:
+ break
+ _write_all(output, chunk)
+ os.fsync(output)
+ finally:
+ os.close(output)
+ finally:
+ os.close(descriptor)
+
+
+def _archive_download_directory(source: Path, destination: Path) -> None:
+ """Pack the regular files under ``source`` into a new uncompressed ZIP.
+
+ ``destination`` is opened in exclusive-create mode. Members are added in
+ sorted path order under their POSIX path relative to ``source``.
+
+ Raises ``PublisherError`` when ``source`` is not a real directory, contains
+ a symlink or non-regular entry, has no files or too many, exceeds the
+ per-member or total size limits, or when a member changes type or size
+ while it is being read.
+ """
+ if source.is_symlink() or not source.is_dir():
+ raise PublisherError(f"artifact directory is invalid: {source}")
+ files: list[Path] = []
+ for path in source.rglob("*"):
+ if path.is_symlink():
+ raise PublisherError("artifact directory contains a symlink")
+ if path.is_dir():
+ continue
+ if not path.is_file():
+ raise PublisherError("artifact directory contains a non-regular entry")
+ files.append(path)
+ files.sort()
+ if not files or len(files) > MAX_ARCHIVE_MEMBERS:
+ raise PublisherError("artifact directory has an invalid file count")
+ total = 0
+ with zipfile.ZipFile(destination, "x", compression=zipfile.ZIP_STORED) as archive:
+ for path in files:
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ with os.fdopen(descriptor, "rb") as handle:
+ # Check type and size on the open descriptor, not the path.
+ metadata = os.fstat(handle.fileno())
+ if not stat.S_ISREG(metadata.st_mode):
+ raise PublisherError("artifact directory member changed type")
+ size = metadata.st_size
+ total += size
+ if size > MAX_ARCHIVE_MEMBER_BYTES or total > MAX_ARCHIVE_TOTAL_BYTES:
+ raise PublisherError("artifact directory exceeds size limits")
+ relative = path.relative_to(source).as_posix()
+ _safe_member(relative)
+ # Every member gets the same fixed timestamp and a 0o600 regular-file mode.
+ info = zipfile.ZipInfo(relative, date_time=(1980, 1, 1, 0, 0, 0))
+ info.compress_type = zipfile.ZIP_STORED
+ info.external_attr = (stat.S_IFREG | 0o600) << 16
+ with archive.open(info, "w") as output:
+ written = 0
+ for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+ output.write(chunk)
+ written += len(chunk)
+ # The bytes copied must match the size read from fstat above.
+ if written != size:
+ raise PublisherError("artifact directory member changed size")
+ # Flush the finished archive to disk.
+ descriptor = os.open(destination, os.O_RDONLY)
+ try:
+ os.fsync(descriptor)
+ finally:
+ os.close(descriptor)
+
+
+def _artifact_name(source: Path) -> str:
+ name = source.name if source.is_dir() else source.name.removesuffix(".zip")
+ if (
+ not source.is_dir() and source.suffix != ".zip"
+ or ARTIFACT_NAME.fullmatch(name) is None
+ ):
+ raise PublisherError(f"artifact source has an invalid GHA name: {source.name}")
+ return name
+
+
+def archive_incoming(
+ store: Store,
+ matrix: Path,
+ artifacts: Sequence[Path],
+ run: dict[str, Any],
+) -> tuple[str, Path, list[dict[str, Any]]]:
+ """Copy exact delivery bytes into immutable incoming before any JSON/ZIP parse.
+
+ Returns ``(ingest_id, installed_path, source_records)``. The ingest ID is
+ ``_sha_bytes`` of the canonical ``{"run", "sources"}`` document, so it
+ depends on the run and on the copied files' metadata records.
+
+ Raises ``PublisherError`` when no artifacts are given, artifact names are
+ invalid or duplicated, an artifact file exceeds the size limit, or an
+ already-installed object with the same ID holds a different manifest.
+ """
+ if not artifacts:
+ raise PublisherError("at least one GitHub artifact archive is required")
+ with store.staging(store.incoming, private=True) as stage:
+ sources = stage / "sources"
+ sources.mkdir(mode=0o700)
+ copied: list[dict[str, Any]] = []
+ # Sort by artifact name so the numbered destination files do not depend on
+ # the order of the ``artifacts`` argument.
+ named_artifacts = sorted(
+ ((_artifact_name(path), path) for path in artifacts), key=lambda item: item[0]
+ )
+ artifact_names = [name for name, _ in named_artifacts]
+ if len(artifact_names) != len(set(artifact_names)):
+ raise PublisherError("artifact delivery contains duplicate GHA names")
+ # Each entry is (destination file name, source path, kind, artifact name).
+ inputs = [("matrix.json", matrix, "matrix", None)] + [
+ (f"artifact-{index:04d}.zip", path, "artifact", artifact_name)
+ for index, (artifact_name, path) in enumerate(named_artifacts)
+ ]
+ for name, source, kind, artifact_name in inputs:
+ destination = sources / name
+ if source.is_dir():
+ # A directory source is packed into a ZIP instead of copied.
+ _archive_download_directory(source, destination)
+ else:
+ if source != matrix and source.stat().st_size > MAX_ARCHIVE_TOTAL_BYTES:
+ raise PublisherError("artifact archive exceeds the size limit")
+ _copy_source(source, destination)
+ copied.append({
+ **_file_metadata(destination, stage),
+ "kind": kind,
+ "artifact_name": artifact_name,
+ })
+ ingest_id = _sha_bytes(_canonical({"run": run, "sources": copied}))
+ incoming_manifest = {
+ "format": "collectivex.incoming.v1",
+ "schema_version": 1,
+ "ingest_id": ingest_id,
+ "run": run,
+ "sources": copied,
+ }
+ _write_json(stage / "incoming.json", incoming_manifest, mode=0o600)
+ store.complete(stage, ingest_id, private=True)
+ destination = store.incoming / ingest_id
+ store.install(stage, destination, private=True)
+ installed = store.incoming / ingest_id
+ # Compare the installed manifest with the one just built; this also covers
+ # the case where the object already existed before this call.
+ if strict_load(installed / "incoming.json") != incoming_manifest:
+ raise PublisherError("existing incoming object differs from archived delivery")
+ for record in copied:
+ _resolve_bundle_file(installed, record)
+ return ingest_id, installed, copied
+
+
+def _safe_member(name: str) -> PurePosixPath:
+ if "\\" in name or "\0" in name:
+ raise PublisherError("archive member has an unsafe separator")
+ path = PurePosixPath(name)
+ if path.is_absolute() or not path.parts or any(part in {"", ".", ".."} for part in path.parts):
+ raise PublisherError("archive member path escapes its artifact")
+ return path
+
+
+def extract_archive(archive: Path, destination: Path) -> list[Path]:
+ """Extract a bounded regular-file ZIP without trusting member paths or links."""
+ try:
+ handle = zipfile.ZipFile(archive)
+ except (OSError, zipfile.BadZipFile) as exc:
+ raise PublisherError("artifact is not a valid ZIP archive") from exc
+ extracted: list[Path] = []
+ seen: set[str] = set()
+ total = 0
+ with handle:
+ members = handle.infolist()
+ if not members or len(members) > MAX_ARCHIVE_MEMBERS:
+ raise PublisherError("artifact has an invalid member count")
+ for member in members:
+ path = _safe_member(member.filename.rstrip("/"))
+ key = path.as_posix()
+ if key in seen:
+ raise PublisherError("artifact contains duplicate member paths")
+ seen.add(key)
+ mode = member.external_attr >> 16
+ if stat.S_ISLNK(mode) or (mode and not (stat.S_ISREG(mode) or stat.S_ISDIR(mode))):
+ raise PublisherError("artifact contains a non-regular member")
+ if member.flag_bits & 0x1:
+ raise PublisherError("encrypted artifact members are not accepted")
+ if member.file_size > MAX_ARCHIVE_MEMBER_BYTES:
+ raise PublisherError("artifact member exceeds the size limit")
+ total += member.file_size
+ if total > MAX_ARCHIVE_TOTAL_BYTES:
+ raise PublisherError("artifact exceeds the expanded size limit")
+ target = destination.joinpath(*path.parts)
+ if member.is_dir():
+ target.mkdir(parents=True, exist_ok=True, mode=0o700)
+ continue
+ target.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
+ output = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+ try:
+ with handle.open(member, "r") as source:
+ written = 0
+ while True:
+ chunk = source.read(1024 * 1024)
+ if not chunk:
+ break
+ _write_all(output, chunk)
+ written += len(chunk)
+ if written != member.file_size:
+ raise PublisherError("artifact member size changed during extraction")
+ os.fsync(output)
+ finally:
+ os.close(output)
+ extracted.append(target)
+ return extracted
+
+
+def validate_matrix(document: Any) -> list[dict[str, Any]]:
+ try:
+ artifact_safety.assert_publication_safe([document])
+ matrix = sweep_matrix.validate_matrix_document(document)
+ except (SystemExit, ValueError, artifact_safety.ArtifactSafetyError) as exc:
+ raise PublisherError(f"requested matrix is invalid: {exc}") from exc
+ return [
+ {
+ "sku": item["sku"],
+ **item["case"],
+ "_disposition": item["disposition"],
+ "_reason": item["reason"],
+ }
+ for item in matrix["requested_cases"]
+ ]
+
+
+def _expected_deliveries(
+ matrix: dict[str, Any], cases: Sequence[dict[str, Any]], run: dict[str, Any]
+) -> dict[str, tuple[str, str, str]]:
+ shard_by_case: dict[str, str] = {}
+ for shard in matrix["include"]:
+ for case_id in shard["case_ids"]:
+ if case_id in shard_by_case:
+ raise PublisherError("requested case appears in two runnable shards")
+ shard_by_case[case_id] = shard["id"]
+ suffix = f"{run['run_id']}-{run['run_attempt']}"
+ deliveries: dict[str, tuple[str, str, str]] = {}
+ for case in cases:
+ case_id = case["case_id"]
+ if case["_disposition"] == "unsupported":
+ deliveries[case_id] = (
+ f"cxunsupported-{suffix}", "setup",
+ f"{run['run_id']}_{run['run_attempt']}_unsupported",
+ )
+ continue
+ shard_id = shard_by_case.get(case_id)
+ if shard_id is None:
+ raise PublisherError("runnable case has no matrix shard")
+ deliveries[case_id] = (
+ f"cxshard-{shard_id}-{suffix}", "sweep",
+ f"{run['run_id']}_{run['run_attempt']}_{shard_id}",
+ )
+ return deliveries
+
+
+def _document_git_run(document: dict[str, Any]) -> dict[str, Any] | None:
+ provenance = document.get("provenance")
+ if not isinstance(provenance, dict):
+ return None
+ value = provenance.get("git_run", provenance)
+ return value if isinstance(value, dict) else None
+
+
+def _run_matches(document: dict[str, Any], run: dict[str, Any]) -> bool:
+ git_run = _document_git_run(document)
+ if git_run is None:
+ return False
+ return (
+ str(git_run.get("run_id")) == run["run_id"]
+ and str(git_run.get("run_attempt")) == str(run["run_attempt"])
+ and git_run.get("source_sha") == run["source_sha"]
+ and (git_run.get("repo") or git_run.get("repository")) == run["repository"]
+ )
+
+
+def _case_matches(document: dict[str, Any], expected: dict[str, Any]) -> bool:
+ scheduled = {
+ key: value for key, value in expected.items()
+ if key not in {"sku", "case_id"} and not key.startswith("_")
+ }
+ return document.get("identity", {}).get("case_factors") == {
+ "case": scheduled,
+ "profile": identity.V1_CASE_PROFILE,
+ "sku": expected["sku"],
+ }
+
+
+def _outcome(document: dict[str, Any]) -> tuple[str, str | None]:
+ status = document["outcome"]["status"]
+ if status == "success":
+ return status, None
+ native = document["outcome"].get("reason")
+ reason = native if isinstance(native, str) and REASON.fullmatch(native) else {
+ "unsupported": "unsupported-capability", "failed": "execution-failed",
+ "invalid": "validation-failed", "diagnostic": "diagnostic-evidence",
+ }.get(status)
+ if reason is None:
+ raise PublisherError(f"unsupported native outcome {status!r}")
+ return status, reason
+
+
+def _attempt_record(
+ document: dict[str, Any], path: Path, root: Path, *, selected: bool
+) -> dict[str, Any]:
+ """Build the manifest record for one attempt document stored at ``path``.
+
+ IDs come from ``contracts.normalize_attempt``. File records are expressed
+ relative to ``root``. Raises ``PublisherError`` when the digest of the
+ runtime fingerprint differs from the one declared in the series identity.
+ """
+ normalized = contracts.normalize_attempt(document)
+ runtime = normalized["runtime_fingerprint"]
+ # Digest of the canonical fingerprint; None when no fingerprint is present.
+ runtime_sha = _sha_bytes(_canonical(runtime)) if runtime is not None else None
+ sample_record = None
+ evidence_ids: list[str] = []
+ series_ids: list[str] = []
+ if document["format"] == contracts.RAW_FORMAT:
+ # The sample artifact path is resolved next to the document itself.
+ sample_path = path.with_name(document["sample_artifact"]["path"])
+ sample_record = _file_metadata(sample_path, root)
+ evidence_ids = [row["evidence_id"] for row in document["measurement"]["rows"]]
+ series_ids = [document["identity"]["series_id"]]
+ # Cross-check the computed digest against the series identity's declared one.
+ declared = document["identity"]["series_factors"]["runtime_fingerprint_sha256"]
+ if runtime_sha != declared:
+ raise PublisherError("runtime fingerprint checksum differs from series identity")
+ status, reason = _outcome(document)
+ return {
+ "attempt_id": normalized["attempt_id"],
+ "allocation_id": normalized["allocation_id"],
+ "case_id": normalized["case_id"],
+ "outcome": status,
+ "reason": reason,
+ "selected": selected,
+ "document": _file_metadata(path, root),
+ "samples": sample_record,
+ "runtime_fingerprint_sha256": runtime_sha,
+ "series_ids": series_ids,
+ "evidence_ids": evidence_ids,
+ }
+
+
+def _validate_delivery_binding(
+ document: dict[str, Any], path: Path, raw_root: Path,
+ artifact_by_root: dict[str, str], expected_by_id: dict[str, dict[str, Any]],
+ expected_deliveries: dict[str, tuple[str, str, str]], run: dict[str, Any],
+) -> str:
+ case_id = document["identity"]["case_id"]
+ if case_id not in expected_by_id:
+ raise PublisherError("artifact contains an extra case outcome")
+ expected = expected_by_id[case_id]
+ if not _case_matches(document, expected):
+ raise PublisherError("attempt case coordinates differ from the requested matrix")
+ unsupported = document["outcome"]["status"] == "unsupported"
+ if (expected["_disposition"] == "unsupported") != unsupported:
+ raise PublisherError("terminal outcome differs from requested capability disposition")
+ if unsupported and document["outcome"]["reason"] != expected["_reason"]:
+ raise PublisherError("unsupported outcome reason differs from requested matrix")
+ if not _run_matches(document, run):
+ raise PublisherError("attempt provenance differs from publisher run metadata")
+ relative = path.relative_to(raw_root)
+ if len(relative.parts) < 2:
+ raise PublisherError("attempt document is outside a delivered artifact")
+ delivered_name = artifact_by_root.get(relative.parts[0])
+ expected_name, expected_job, expected_execution = expected_deliveries[case_id]
+ git_run = _document_git_run(document)
+ allocation = document["identity"]["allocation_factors"]
+ if (
+ git_run is None
+ or delivered_name != expected_name
+ or git_run["artifact"] != delivered_name
+ or git_run["job"] != expected_job
+ or allocation["execution_id"] != expected_execution
+ ):
+ raise PublisherError("attempt provenance differs from its delivered GHA shard")
+ return case_id
+
+
+def _parse_extracted(root: Path) -> tuple[list[tuple[Path, dict[str, Any]]], set[Path]]:
+ attempts: list[tuple[Path, dict[str, Any]]] = []
+ consumed_samples: set[Path] = set()
+ json_paths = sorted(path for path in root.rglob("*.json") if path.is_file())
+ for path in json_paths:
+ if path in consumed_samples:
+ continue
+ try:
+ document = contracts.strict_load(path)
+ artifact_safety.assert_publication_safe([document])
+ format_name = document.get("format") if isinstance(document, dict) else None
+ if format_name == contracts.SAMPLES_FORMAT:
+ _schema("samples-v1.schema.json", document)
+ # It must be claimed by a raw document; orphan checking happens after the scan.
+ continue
+ if format_name == contracts.RAW_FORMAT:
+ _schema("raw-case-v1.schema.json", document)
+ sample_path = path.with_name(document["sample_artifact"]["path"])
+ sample_document = contracts.strict_load(sample_path)
+ artifact_safety.assert_publication_safe([sample_document])
+ _schema("samples-v1.schema.json", sample_document)
+ validated = contracts.load_raw_attempt(path)
+ consumed_samples.add(sample_path)
+ elif format_name == contracts.TERMINAL_FORMAT:
+ _schema("terminal-outcome-v1.schema.json", document)
+ validated = contracts.validate_terminal_document(document)
+ else:
+ raise PublisherError(f"artifact contains unknown JSON document {path.name}")
+ except (
+ contracts.ContractError, artifact_safety.ArtifactSafetyError,
+ jsonschema.ValidationError, OSError,
+ ) as exc:
+ raise PublisherError(f"native contract rejected {path.name}: {exc}") from exc
+ attempts.append((path, validated))
+ orphan_samples = [
+ path for path in json_paths
+ if isinstance((doc := contracts.strict_load(path)), dict)
+ and doc.get("format") == contracts.SAMPLES_FORMAT
+ and path not in consumed_samples
+ ]
+ if orphan_samples:
+ raise PublisherError("artifact contains an orphan samples document")
+ if not attempts:
+ raise PublisherError("artifact contains zero native attempt documents")
+ return attempts, consumed_samples
+
+
+def build_bundle(
+    store: Store,
+    incoming_id: str,
+    incoming_path: Path,
+    run: dict[str, Any],
+) -> tuple[str, dict[str, Any], list[dict[str, Any]]]:
+    """Validate one exact workflow delivery and install its immutable private bundle.
+
+    The incoming manifest, its source catalog and the requested matrix are
+    checked first. Each artifact archive is then copied and extracted into a
+    staging directory, every extracted document is validated and bound to its
+    expected shard, and one attempt per case (the highest ordinal) is selected.
+
+    Returns ``(bundle_id, bundle manifest, selected documents)``, where the
+    bundle id is the SHA-256 of the canonical manifest bytes.
+
+    Raises PublisherError on any identity, coverage, binding or homogeneity
+    violation.
+    """
+    incoming_manifest = strict_load(incoming_path / "incoming.json")
+    _exact(
+        incoming_manifest,
+        {"format", "schema_version", "ingest_id", "run", "sources"},
+        "incoming",
+    )
+    artifact_safety.assert_publication_safe([incoming_manifest])
+    # The ingest id must be the digest of the canonical {run, sources} pair.
+    if (
+        incoming_manifest["format"] != "collectivex.incoming.v1"
+        or incoming_manifest["schema_version"] != 1
+        or incoming_manifest["ingest_id"] != incoming_id
+        or incoming_manifest["run"] != run
+        or _sha_bytes(_canonical({"run": run, "sources": incoming_manifest["sources"]}))
+        != incoming_id
+    ):
+        raise PublisherError("incoming manifest identity differs from archived delivery")
+    incoming_sources = _array(incoming_manifest["sources"], "incoming.sources", nonempty=True)
+    for index, record in enumerate(incoming_sources):
+        _exact(
+            record,
+            {"path", "sha256", "bytes", "kind", "artifact_name"},
+            f"incoming.sources[{index}]",
+        )
+        _resolve_bundle_file(incoming_path, record)
+    matrix_records = [record for record in incoming_sources if record["kind"] == "matrix"]
+    artifact_records = [record for record in incoming_sources if record["kind"] == "artifact"]
+    # Exactly one unnamed matrix source, plus one or more uniquely named artifacts.
+    if (
+        len(matrix_records) != 1
+        or matrix_records[0]["artifact_name"] is not None
+        or not artifact_records
+        or any(ARTIFACT_NAME.fullmatch(record["artifact_name"] or "") is None
+               for record in artifact_records)
+        or len({record["artifact_name"] for record in artifact_records}) != len(artifact_records)
+    ):
+        raise PublisherError("incoming source catalog is invalid")
+    matrix_source = _resolve_bundle_file(incoming_path, matrix_records[0])
+    matrix_document = strict_load(matrix_source)
+    expected_cases = validate_matrix(matrix_document)
+    expected_by_id = {case["case_id"]: case for case in expected_cases}
+    expected_deliveries = _expected_deliveries(matrix_document, expected_cases, run)
+    # The delivered artifact names must equal the names the matrix implies.
+    if {record["artifact_name"] for record in artifact_records} != {
+        delivery[0] for delivery in expected_deliveries.values()
+    }:
+        raise PublisherError("incoming artifact archive set differs from requested matrix shards")
+    with store.staging(store.bundles, private=True) as stage:
+        source_copy = stage / "source"
+        raw_root = stage / "raw"
+        source_copy.mkdir(mode=0o700)
+        raw_root.mkdir(mode=0o700)
+        matrix_path = stage / "matrix.json"
+        _copy_source(matrix_source, matrix_path)
+        source_records: list[dict[str, Any]] = []
+        artifact_by_root: dict[str, str] = {}
+        # Each archive is copied to source/artifact-NNNN.zip and extracted
+        # into raw/artifact-NNNN, keyed back to its artifact name.
+        for index, source_record in enumerate(artifact_records):
+            archive = _resolve_bundle_file(incoming_path, source_record)
+            copied = source_copy / f"artifact-{index:04d}.zip"
+            _copy_source(archive, copied)
+            source_records.append({
+                **_file_metadata(copied, stage),
+                "artifact_name": source_record["artifact_name"],
+            })
+            artifact_root = raw_root / f"artifact-{index:04d}"
+            artifact_root.mkdir(mode=0o700)
+            artifact_by_root[artifact_root.name] = source_record["artifact_name"]
+            extract_archive(copied, artifact_root)
+        parsed, consumed_samples = _parse_extracted(raw_root)
+        created_at = _latest_timestamp(
+            [document["generated_at"] for _, document in parsed]
+        )
+        # Every extracted regular file must be an attempt or a claimed samples file.
+        consumed_files = {path for path, _ in parsed} | consumed_samples
+        extracted_files = {
+            path for path in raw_root.rglob("*")
+            if path.is_file() and not path.is_symlink()
+        }
+        if consumed_files != extracted_files:
+            raise PublisherError("artifact contains an unconsumed non-native member")
+        by_case: dict[str, list[tuple[Path, dict[str, Any]]]] = {}
+        for path, document in parsed:
+            case_id = _validate_delivery_binding(
+                document, path, raw_root, artifact_by_root, expected_by_id,
+                expected_deliveries, run,
+            )
+            by_case.setdefault(case_id, []).append((path, document))
+        missing = set(expected_by_id) - set(by_case)
+        if missing:
+            raise PublisherError(f"artifact is missing {len(missing)} requested case outcomes")
+        attempt_records: list[dict[str, Any]] = []
+        selections: list[dict[str, Any]] = []
+        selected_documents: list[dict[str, Any]] = []
+        runtime_hashes: set[str] = set()
+        outcome_counts = {name: 0 for name in OUTCOMES}
+        for case_id in sorted(expected_by_id):
+            case_attempts = by_case[case_id]
+            ordinals = [document["identity"]["attempt_ordinal"] for _, document in case_attempts]
+            allocations_for_case = {
+                document["identity"]["allocation_id"] for _, document in case_attempts
+            }
+            # Retries must share one allocation and be numbered 1..N without gaps.
+            if len(allocations_for_case) != 1 or sorted(ordinals) != list(
+                range(1, len(ordinals) + 1)
+            ):
+                raise PublisherError(
+                    "case retries must retain contiguous ordinals in one allocation"
+                )
+            # The attempt with the highest ordinal is the selected one.
+            _, selected_document = max(
+                case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"]
+            )
+            selected_id = selected_document["identity"]["attempt_id"]
+            selected_documents.append(selected_document)
+            selected_status, _ = _outcome(selected_document)
+            selections.append({
+                "case_id": case_id,
+                "selected_attempt_id": selected_id,
+                "outcome": selected_status,
+            })
+            outcome_counts[selected_status] += 1
+            for path, document in sorted(
+                case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"]
+            ):
+                normalized = contracts.normalize_attempt(document)
+                if document["format"] == contracts.RAW_FORMAT:
+                    sample_path = path.with_name(document["sample_artifact"]["path"])
+                    if sample_path not in consumed_samples:
+                        raise PublisherError("validated raw attempt lost its samples document")
+                record = _attempt_record(
+                    document, path, stage,
+                    selected=normalized["attempt_id"] == selected_id,
+                )
+                if record["runtime_fingerprint_sha256"]:
+                    runtime_hashes.add(record["runtime_fingerprint_sha256"])
+                attempt_records.append(record)
+        # Every extracted byte is covered; the bundle manifest anchors this checksum catalog.
+        payload_records = [_file_metadata(path, stage) for path in _tree_files(stage)]
+        checksum_document = {
+            "format": "collectivex.checksums.v1",
+            "files": payload_records,
+        }
+        checksum_path = stage / "checksums.json"
+        _write_json(checksum_path, checksum_document, mode=0o600)
+        bundle = {
+            "format": FORMAT_BUNDLE,
+            "schema_version": 1,
+            "created_at": created_at,
+            "ingest_id": incoming_id,
+            "run": run,
+            "matrix": _file_metadata(matrix_path, stage),
+            "sources": source_records,
+            "attempts": attempt_records,
+            "coverage": {
+                "expected_cases": len(expected_cases),
+                "terminal_cases": len(selections),
+                "complete": len(selections) == len(expected_cases),
+                "outcome_counts": outcome_counts,
+                "selections": selections,
+            },
+            "runtime_fingerprints": sorted(runtime_hashes),
+            "checksums": _file_metadata(checksum_path, stage),
+            "validation": {
+                "policy": PUBLISHER_POLICY,
+                "passed": True,
+                "checks": [
+                    "archive-safety", "checksums", "exact-coverage", "identity",
+                    "native-schema", "privacy", "runtime-homogeneity", "terminal-outcomes",
+                ],
+            },
+        }
+        validate_bundle_manifest(bundle)
+        # Runtime homogeneity is scoped to a realized allocation, not across unlike SKUs.
+        by_allocation: dict[str, set[str]] = {}
+        for attempt in attempt_records:
+            fingerprint = attempt["runtime_fingerprint_sha256"]
+            if fingerprint:
+                by_allocation.setdefault(attempt["allocation_id"], set()).add(fingerprint)
+        if any(len(values) != 1 for values in by_allocation.values()):
+            raise PublisherError("runtime fingerprint is heterogeneous within an allocation")
+        # The bundle id is the digest of the exact bytes written to bundle.json.
+        bundle_bytes = _canonical(bundle) + b"\n"
+        bundle_id = _sha_bytes(bundle_bytes)
+        _write_bytes(stage / "bundle.json", bundle_bytes, mode=0o600)
+        store.complete(stage, bundle_id, private=True)
+        store.install(stage, store.bundles / bundle_id, private=True)
+    # Reload from the store so the installed copy is verified against this manifest.
+    installed = load_bundle(store, bundle_id)
+    if installed["manifest"] != bundle:
+        raise PublisherError("existing bundle differs from validated manifest")
+    return bundle_id, bundle, selected_documents
+
+
+def _slug(value: Any, fallback: str = "unknown") -> str:
+ text = re.sub(r"[^a-z0-9_.-]+", "-", str(value or "").lower()).strip("-.")
+ return text[:128] if text and SAFE_ID.fullmatch(text[:128]) else fallback
+
+
+def _derived_id(prefix: str, value: Any) -> str:
+ return f"{prefix}{_sha_bytes(_canonical(value))}"
+
+
+def _git_run(document: dict[str, Any]) -> dict[str, Any]:
+ return _document_git_run(document) or {}
+
+
+def _public_attempt(document: dict[str, Any], *, selected: bool = False) -> dict[str, Any]:
+ normalized = contracts.normalize_attempt(document)
+ run = _git_run(document)
+ evidence = (
+ [{"evidence_id": row["evidence_id"], "point_id": row["point_id"]}
+ for row in document["measurement"]["rows"]]
+ if document["format"] == contracts.RAW_FORMAT else []
+ )
+ status, reason = _outcome(document)
+ failure_mode = document["outcome"].get("failure_mode")
+ if not isinstance(failure_mode, str) or REASON.fullmatch(failure_mode) is None:
+ failure_mode = None if status == "success" else reason
+ series_id = normalized["series_id"] if status == "success" and selected else None
+ return {
+ "attempt_id": normalized["attempt_id"],
+ "evidence": evidence,
+ "case_id": normalized["case_id"],
+ "allocation_id": normalized["allocation_id"],
+ "run_id": str(run["run_id"]),
+ "run_attempt": int(run["run_attempt"]),
+ "attempt_index": document["identity"]["attempt_ordinal"],
+ "selected": selected,
+ "outcome": status,
+ "failure_mode": failure_mode,
+ "reason": reason,
+ "series_id": series_id,
+ "completed_at": document["generated_at"],
+ }
+
+
+def _ratio(values: Sequence[float]) -> float | None:
+ return max(values) / min(values) if len(values) >= REQUIRED_ALLOCATIONS and min(values) > 0 else None
+
+
+def _eligibility_record(
+ allocations: Sequence[str],
+ *,
+ complete: bool,
+ correct: bool,
+ measured: bool,
+ stable_ordering: bool,
+ p50_ratio: float | None,
+ p99_ratio: float | None,
+ extra_reasons: Sequence[str] = (),
+) -> dict[str, Any]:
+ ids = sorted(set(allocations))
+ stable_p50 = p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT
+ stable_p99 = p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT
+ reasons = list(extra_reasons)
+ for condition, reason in (
+ (len(ids) >= REQUIRED_ALLOCATIONS, "insufficient-allocations"),
+ (complete, "incomplete-repeat-coverage"),
+ (correct, "correctness-failed"),
+ (measured, "missing-measured-roundtrip-p99"),
+ (stable_p50, "unstable-p50"),
+ (stable_p99, "unstable-p99"),
+ (stable_ordering, "unstable-ordering"),
+ ):
+ if not condition:
+ reasons.append(reason)
+ reasons = sorted(set(reasons))
+ decision = not reasons
+ return {
+ "decision_grade": decision,
+ "allocation_ids": ids,
+ "complete": complete,
+ "correct": correct,
+ "measured_roundtrip_p99": measured,
+ "stable_p50": stable_p50,
+ "stable_p99": stable_p99,
+ "stable_ordering": stable_ordering,
+ "p50_max_min_ratio": p50_ratio,
+ "p99_max_min_ratio": p99_ratio,
+ "reasons": reasons,
+ }
+
+
+def _aggregate_percentiles(values: Sequence[dict[str, Any]]) -> dict[str, float]:
+ return {
+ name: float(statistics.median(float(value[name]) for value in values))
+ for name in ("p50", "p90", "p95", "p99")
+ }
+
+
+def _aggregate_component(
+ rows: Sequence[dict[str, Any]], name: str
+) -> dict[str, Any] | None:
+ components = [row["components"][name] for row in rows]
+ if all(component["availability"] == "unavailable" for component in components):
+ return None
+ if any(component["availability"] == "unavailable" for component in components):
+ raise PublisherError("component availability differs across repeat allocations")
+ latency = _aggregate_percentiles([component["percentiles_us"] for component in components])
+ if name == "isolated_sum":
+ return {
+ "origin": "derived",
+ "latency_us": latency,
+ "logical_bytes": None,
+ "logical_payload_rate_gbps_at_latency_percentile": None,
+ "sample_count": None,
+ }
+ byte_values = {row["logical_bytes"][name] for row in rows}
+ if len(byte_values) != 1:
+ raise PublisherError("logical byte accounting differs across repeat allocations")
+ logical_bytes = byte_values.pop()
+ rates = {statistic: logical_bytes / (latency[statistic] * 1000.0) for statistic in latency}
+ return {
+ "origin": "measured",
+ "latency_us": latency,
+ "logical_bytes": logical_bytes,
+ "logical_payload_rate_gbps_at_latency_percentile": rates,
+ "sample_count": 512,
+ }
+
+
+def _exact_repeat_value(values: Sequence[Any], label: str) -> Any:
+ if not values or len({_canonical(value) for value in values}) != 1:
+ raise PublisherError(f"{label} differs across repeat allocations")
+ return values[0]
+
+
+def _eplb_descriptor(document: dict[str, Any]) -> dict[str, Any]:
+ value = document["case"]["eplb"]
+ return {
+ "enabled": value["enabled"],
+ "planner": value["planner"],
+ "mapping_sha256": value["mapping_hash"],
+ "logical_experts": value["num_logical_experts"],
+ "physical_experts": value["num_physical_experts"],
+ "redundant_experts": value["num_redundant"],
+ "reference_tokens_per_rank": value["reference_tokens_per_rank"],
+ "replicated_experts": value["replicated_experts"],
+ "max_replicas": value["max_replicas"],
+ "imbalance_before": value["imbalance_before"],
+ "imbalance_after": value["imbalance_after"],
+ }
+
+
+def _routing_facts(row: dict[str, Any]) -> dict[str, Any]:
+ routing = row["routing"]
+ return {
+ "fanout_mean": routing["fanout_mean"],
+ "recv_tokens_max": row["receive"]["max"],
+ "expert_load_cv": routing["expert_load_cv"],
+ "payload_rank_cv": routing["payload_rank_cv"],
+ "hotspot_ratio": routing["hotspot_ratio"],
+ "empty_expert_count": routing["empty_expert_count"],
+ "empty_rank_count": routing["empty_rank_count"],
+ "routed_copies": routing["routed_copies"],
+ }
+
+
+def _series_extra_reasons(documents: Sequence[dict[str, Any]]) -> list[str]:
+ reasons: set[str] = set()
+ for document in documents:
+ validity = document["outcome"]["validity"]
+ rows = document["measurement"]["rows"]
+ if validity.get("provenance_complete") is not True:
+ reasons.add("incomplete-provenance")
+ if validity.get("workload_source") != "canonical-serialized":
+ reasons.add("noncanonical-workload")
+ if validity.get("anomaly_free") is not True or any(row["anomalies"] for row in rows):
+ reasons.add("unresolved-anomaly")
+ if validity.get("semantic_correctness") != "pass":
+ reasons.add("semantic-correctness-failed")
+ if validity.get("measurement_conformance") != "conformant" or validity.get("sampling_conformance") != "conformant":
+ reasons.add("measurement-nonconformant")
+ scopes = {row["correctness"].get("scope") for row in rows}
+ if scopes != {"dispatch-metadata-and-transformed-combine"}:
+ reasons.add("expert-oracle-incomplete")
+ return sorted(reasons)
+
+
+BACKEND_LABELS = {
+ "deepep": "DeepEP V1",
+ "deepep-v2": "DeepEP V2",
+ "deepep-hybrid": "DeepEP Hybrid",
+ "uccl": "UCCL",
+ "mori": "MoRI",
+ "nccl-ep": "NCCL/RCCL reference",
+}
+
+
+def _build_series(
+ series_id: str,
+ documents: Sequence[dict[str, Any]],
+ expected_repeats: int,
+) -> tuple[dict[str, Any], dict[str, Any]]:
+ if not documents:
+ raise PublisherError("cannot aggregate an empty series")
+ first = documents[0]
+ if any(document["identity"]["series_id"] != series_id for document in documents):
+ raise PublisherError("series aggregation mixed identities")
+ allocations = [document["identity"]["allocation_id"] for document in documents]
+ if len(allocations) != len(set(allocations)):
+ raise PublisherError("series repeats reuse an allocation identity")
+ row_maps = [
+ {row["tokens_per_rank"]: row for row in document["measurement"]["rows"]}
+ for document in documents
+ ]
+ token_sets = {tuple(sorted(rows)) for rows in row_maps}
+ if len(token_sets) != 1:
+ raise PublisherError("series token coverage differs across allocations")
+ tokens = list(next(iter(token_sets)))
+ p50_ratios = [
+ _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p50"] for rows in row_maps])
+ for token in tokens
+ ]
+ p99_ratios = [
+ _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p99"] for rows in row_maps])
+ for token in tokens
+ ]
+ p50_ratio = max((value for value in p50_ratios if value is not None), default=None)
+ p99_ratio = max((value for value in p99_ratios if value is not None), default=None)
+ correct = all(
+ row["correctness"]["passed"]
+ for document in documents for row in document["measurement"]["rows"]
+ )
+ measured = all(
+ row["components"]["roundtrip"]["availability"] == "measured"
+ and row["components"]["roundtrip"]["percentiles_us"].get("p99") is not None
+ for document in documents for row in document["measurement"]["rows"]
+ )
+ eligibility = _eligibility_record(
+ allocations,
+ complete=len(documents) == expected_repeats,
+ correct=correct,
+ measured=measured,
+ # Ordering is defined only across alternatives in a controlled cohort.
+ stable_ordering=True,
+ p50_ratio=p50_ratio,
+ p99_ratio=p99_ratio,
+ extra_reasons=_series_extra_reasons(documents),
+ )
+ case = first["case"]
+ shape = case["shape"]
+ topology = first["topology"]
+ runtime = first["runtime_fingerprint"]
+ workload_id = first["workload"]["workload_id"]
+ if not identity.is_typed_id(workload_id, "workload"):
+ raise PublisherError("raw workload is not canonical")
+ backend_id = case["backend"]
+ resource_raw = first["implementation"]["resource_profile"]
+ public_config = contracts.public_series_config(
+ kernel_generation=first["implementation"]["kernel_generation"],
+ provenance=first["implementation"]["provenance"],
+ resource_profile=resource_raw,
+ resource_mode=case["resource_mode"],
+ device_product=topology["device_product"],
+ )
+ resource_profile = public_config["resource"]["profile"]
+ configured_units = public_config["resource"]["configured_units"]
+ units_kind = public_config["resource"]["comm_units_kind"]
+ resource_label = (
+ f"{configured_units} {str(units_kind).upper()}"
+ if configured_units is not None and units_kind
+ else resource_profile
+ )
+ eplb = _exact_repeat_value(
+ [_eplb_descriptor(document) for document in documents], "EPLB descriptor"
+ )
+ points: list[dict[str, Any]] = []
+ run_metrics: dict[str, dict[int, dict[str, float]]] = {}
+ for document, rows in zip(documents, row_maps, strict=True):
+ run_id = str(_git_run(document)["run_id"])
+ if run_id in run_metrics:
+ raise PublisherError("series has two allocations from one workflow run")
+ run_metrics[run_id] = {}
+ for token in tokens:
+ latency = rows[token]["components"]["roundtrip"]["percentiles_us"]
+ logical_bytes = rows[token]["logical_bytes"]["roundtrip"]
+ run_metrics[run_id][token] = {
+ "latency_us": {statistic: latency[statistic] for statistic in ("p50", "p99")},
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ statistic: logical_bytes / (latency[statistic] * 1000.0)
+ for statistic in ("p50", "p99")
+ },
+ }
+ for token in tokens:
+ rows = [row_map[token] for row_map in row_maps]
+ routing = _exact_repeat_value(
+ [_routing_facts(row) for row in rows], "routing/load facts"
+ )
+ components = {
+ name: _aggregate_component(rows, name)
+ for name in ("dispatch", "combine", "roundtrip")
+ }
+ if components["dispatch"] is None:
+ components["isolated_sum"] = None
+ else:
+ latency = {
+ statistic: components["dispatch"]["latency_us"][statistic]
+ + components["combine"]["latency_us"][statistic]
+ for statistic in ("p50", "p90", "p95", "p99")
+ }
+ components["isolated_sum"] = {
+ "origin": "derived", "latency_us": latency, "logical_bytes": None,
+ "logical_payload_rate_gbps_at_latency_percentile": None, "sample_count": None,
+ }
+ points.append({
+ "point_id": rows[0]["point_id"],
+ "tokens_per_rank": token,
+ "global_tokens": token * case["ep_size"],
+ "correct": all(row["correctness"]["passed"] for row in rows),
+ "routing": routing,
+ "components": components,
+ "roundtrip_token_rate_at_latency_percentile": {
+ statistic: (token * case["ep_size"])
+ / (components["roundtrip"]["latency_us"][statistic] * 1e-6)
+ for statistic in ("p50", "p90", "p95", "p99")
+ },
+ "evidence_ids": [row["evidence_id"] for row in rows],
+ })
+ series = {
+ "series_id": series_id,
+ "label": (
+ f"{case['runner'].upper()} / {BACKEND_LABELS.get(backend_id, backend_id)} / "
+ f"EP{case['ep_size']} / {case['phase']} / {shape['routing']}"
+ f"{' + EPLB' if case['eplb']['enabled'] else ''} / {resource_label}"
+ ),
+ "status": "decision-grade" if eligibility["decision_grade"] else "diagnostic",
+ "case_ids": sorted({document["identity"]["case_id"] for document in documents}),
+ "allocation_ids": sorted(allocations),
+ "model": _slug(case["workload_name"]),
+ "suite": _slug(case["suite"]),
+ "phase": case["phase"],
+ "publication_tier": case["required_publication"],
+ "backend": {
+ "id": _slug(backend_id),
+ "label": BACKEND_LABELS.get(backend_id, backend_id),
+ "role": "reference" if backend_id == "nccl-ep" else "library",
+ **public_config["backend"],
+ },
+ "build": {
+ "implementation_contract_sha256": first["identity"]["series_factors"][
+ "implementation_contract_sha256"
+ ],
+ "public_config_sha256": first["identity"]["series_factors"][
+ "public_config_sha256"
+ ],
+ "routing_control_sha256": first["identity"]["series_factors"][
+ "routing_control_sha256"
+ ],
+ "runtime_fingerprint_sha256": first["identity"]["series_factors"][
+ "runtime_fingerprint_sha256"
+ ],
+ "image_digest": first["identity"]["series_factors"]["image_digest"],
+ "source_sha": first["identity"]["series_factors"]["source_sha"],
+ "squash_sha256": first["identity"]["series_factors"]["squash_sha256"],
+ },
+ "system": {
+ "sku": _slug(case["runner"]),
+ "label": public_config["system"]["label"],
+ "vendor": runtime["vendor"],
+ "topology_class": _slug(topology["topology_class"]),
+ "transport": _slug(topology["transport"]),
+ "world_size": topology["world_size"],
+ "ep_size": case["ep_size"],
+ "placement": topology["placement"],
+ },
+ "workload": {
+ "workload_id": workload_id,
+ "hidden": shape["hidden"],
+ "top_k": shape["topk"],
+ "experts": case["eplb"]["num_logical_experts"],
+ "routing": shape["routing"],
+ "eplb": case["eplb"]["enabled"],
+ "dispatch_dtype": shape["dispatch_dtype"],
+ "combine_dtype": shape["quant"]["combine_output_dtype"],
+ "activation_profile": shape["activation_profile"],
+ },
+ "eplb": eplb,
+ "resource": public_config["resource"],
+ "measurement": {
+ "contract": first["measurement"]["contract"],
+ "sampling_contract": first["measurement"]["sampling"]["contract"],
+ "iters": first["measurement"]["sampling"]["iterations_per_trial"],
+ "trials": first["measurement"]["sampling"]["trials"],
+ "warmups": first["measurement"]["sampling"]["warmup_iterations"],
+ "samples_per_component": first["measurement"]["sampling"]["samples_per_component"],
+ "headline_component": "roundtrip",
+ "headline_percentile": "p99",
+ },
+ "points": points,
+ "eligibility": eligibility,
+ }
+ internal = {
+ "documents": list(documents),
+ "run_metrics": run_metrics,
+ "series_factors": first["identity"]["series_factors"],
+ }
+ return series, internal
+
+
+def _resolve_bundle_file(root: Path, record: dict[str, Any]) -> Path:
+ path = root.joinpath(*PurePosixPath(record["path"]).parts)
+ try:
+ path.relative_to(root)
+ except ValueError as exc:
+ raise PublisherError("bundle record escapes its directory") from exc
+ if path.resolve() != path or path.is_symlink() or not path.is_file():
+ raise PublisherError("bundle record points to a missing or linked file")
+ if path.stat().st_size != record["bytes"] or _sha_file(path) != record["sha256"]:
+ raise PublisherError("bundle file checksum differs from its manifest")
+ return path
+
+
+def load_bundle(store: Store, bundle_id: str) -> dict[str, Any]:
+    """Load an installed bundle and re-verify it against its own manifest.
+
+    The COMPLETE marker, the digest of ``bundle.json``, the checksum catalog,
+    the source catalog and the requested matrix are checked, then every
+    attempt listed in the manifest is re-validated and its record recomputed.
+
+    Returns a dict with the bundle ``id``, ``root``, ``manifest``, matrix
+    ``cases``, ``documents`` keyed by attempt id, and the ``selected``
+    document per case id.
+
+    Raises PublisherError when any of those checks fails.
+    """
+    if HEX64.fullmatch(bundle_id) is None:
+        raise PublisherError("bundle ID must be a SHA-256 digest")
+    root = store.bundles / bundle_id
+    if root.is_symlink() or not (root / "COMPLETE").is_file():
+        raise PublisherError(f"bundle {bundle_id} is missing or incomplete")
+    _verify_frozen_tree(root, private=True)
+    if (root / "COMPLETE").read_text().strip() != bundle_id:
+        raise PublisherError("bundle COMPLETE marker differs")
+    manifest_path = root / "bundle.json"
+    # The bundle id is the digest of bundle.json itself.
+    if _sha_file(manifest_path) != bundle_id:
+        raise PublisherError("bundle directory digest differs from bundle.json")
+    manifest = validate_bundle_manifest(strict_load(manifest_path))
+    checksum_path = _resolve_bundle_file(root, manifest["checksums"])
+    checksum_document = strict_load(checksum_path)
+    checksum_document = _exact(checksum_document, {"format", "files"}, "checksums")
+    if checksum_document["format"] != "collectivex.checksums.v1":
+        raise PublisherError("bundle checksum format is invalid")
+    records = [_file_record(value, f"checksums.files[{index}]")
+               for index, value in enumerate(_array(checksum_document["files"], "checksums.files"))]
+    _unique([record["path"] for record in records], "checksums.files[].path")
+    for record in records:
+        _resolve_bundle_file(root, record)
+    # The catalog must list every tree file other than bundle.json and checksums.json.
+    expected_paths = {
+        path.relative_to(root).as_posix() for path in _tree_files(root)
+        if path.name not in {"bundle.json", "checksums.json"}
+    }
+    if {record["path"] for record in records} != expected_paths:
+        raise PublisherError("bundle checksum catalog does not cover its payload exactly")
+    artifact_by_root: dict[str, str] = {}
+    # Source N must be stored at source/artifact-NNNN.zip.
+    for index, source in enumerate(manifest["sources"]):
+        _resolve_bundle_file(root, source)
+        archive_key = f"artifact-{index:04d}"
+        if source["path"] != f"source/{archive_key}.zip":
+            raise PublisherError("bundle source catalog order/path differs")
+        artifact_by_root[archive_key] = source["artifact_name"]
+    if len(set(artifact_by_root.values())) != len(artifact_by_root):
+        raise PublisherError("bundle source catalog repeats an artifact name")
+    matrix_path = _resolve_bundle_file(root, manifest["matrix"])
+    matrix_document = strict_load(matrix_path)
+    cases = validate_matrix(matrix_document)
+    expected_by_id = {case["case_id"]: case for case in cases}
+    expected_deliveries = _expected_deliveries(
+        matrix_document, cases, manifest["run"]
+    )
+    if {item["case_id"] for item in manifest["coverage"]["selections"]} != set(expected_by_id):
+        raise PublisherError("bundle selected coverage differs from requested matrix")
+    documents: dict[str, dict[str, Any]] = {}
+    runtime_fingerprints: set[str] = set()
+    for attempt in manifest["attempts"]:
+        document_path = _resolve_bundle_file(root, attempt["document"])
+        document = contracts.strict_load(document_path)
+        artifact_safety.assert_publication_safe([document])
+        if document.get("format") == contracts.RAW_FORMAT:
+            _schema("raw-case-v1.schema.json", document)
+            sample_path = document_path.with_name(document["sample_artifact"]["path"])
+            if attempt["samples"] is None:
+                raise PublisherError("raw attempt is missing its sample manifest record")
+            manifest_sample_path = _resolve_bundle_file(root, attempt["samples"])
+            # The manifest's samples record must be the file the raw document names.
+            if manifest_sample_path != sample_path:
+                raise PublisherError("sample manifest record points to the wrong raw evidence")
+            sample_document = contracts.strict_load(sample_path)
+            artifact_safety.assert_publication_safe([sample_document])
+            _schema("samples-v1.schema.json", sample_document)
+            document = contracts.load_raw_attempt(document_path)
+        else:
+            if attempt["samples"] is not None:
+                raise PublisherError("terminal attempt unexpectedly names a sample artifact")
+            _schema("terminal-outcome-v1.schema.json", document)
+            document = contracts.validate_terminal_document(document)
+        _validate_delivery_binding(
+            document, document_path, root / "raw", artifact_by_root,
+            expected_by_id, expected_deliveries, manifest["run"],
+        )
+        # Recompute the record from the document; it must equal the stored one.
+        expected_record = _attempt_record(
+            document, document_path, root, selected=attempt["selected"]
+        )
+        if expected_record != attempt:
+            raise PublisherError("bundle attempt record differs from native document")
+        if attempt["runtime_fingerprint_sha256"]:
+            runtime_fingerprints.add(attempt["runtime_fingerprint_sha256"])
+        documents[attempt["attempt_id"]] = document
+    if sorted(runtime_fingerprints) != manifest["runtime_fingerprints"]:
+        raise PublisherError("bundle runtime fingerprint catalog differs from attempts")
+    selected = {
+        selection["case_id"]: documents[selection["selected_attempt_id"]]
+        for selection in manifest["coverage"]["selections"]
+    }
+    return {
+        "id": bundle_id,
+        "root": root,
+        "manifest": manifest,
+        "cases": cases,
+        "documents": documents,
+        "selected": selected,
+    }
+
+
+def _cohort_control(
+ kind: str, series: dict[str, Any], internal: dict[str, Any]
+) -> tuple[dict[str, Any], list[str], list[str], Any]:
+ binary_build = series["build"]
+ source = binary_build["source_sha"]
+ workload = series["workload"]
+ shape = {
+ key: workload[key]
+ for key in ("hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype", "activation_profile")
+ }
+ common = {
+ "model": series["model"], "phase": series["phase"], "shape": shape,
+ "measurement": series["measurement"], "ep_size": series["system"]["ep_size"],
+ }
+ if kind == "library":
+ control = {**common, "system": series["system"], "workload": workload,
+ "resource_mode": series["resource"]["mode"], "source": source}
+ return control, ["system", "workload", "phase", "measurement", "resource.mode", "source"], ["backend", "resource"], series["backend"]["id"]
+ if kind == "chip":
+ control = {**common, "backend": series["backend"], "source": source,
+ "workload": workload, "resource_mode": series["resource"]["mode"]}
+ return control, ["backend", "source", "workload", "phase", "measurement", "resource.mode"], ["system", "resource"], series["system"]
+ if kind == "system":
+ control = {**common, "workload": workload, "source": source}
+ varying = [series["system"]["sku"], series["backend"]["id"], series["resource"]["profile"]]
+ return control, ["workload", "phase", "measurement", "source"], ["system", "backend", "resource"], varying
+ if kind == "routing":
+ control = {
+ **common,
+ "backend": series["backend"],
+ "system": series["system"],
+ "resource": series["resource"],
+ "build": _routing_build_control(binary_build),
+ }
+ varying = [
+ workload["routing"], workload["eplb"],
+ binary_build["implementation_contract_sha256"],
+ ]
+ return (
+ control,
+ ["backend", "implementation-static-build", "system", "model-shape", "phase", "measurement", "resource"],
+ ["workload.routing", "workload.eplb", "implementation-config"],
+ varying,
+ )
+ raise PublisherError(f"unknown cohort kind {kind}")
+
+
+def _cohort_ordering(
+ members: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]], tokens: Sequence[int]
+) -> tuple[bool, int]:
+ run_ids = set.intersection(*(
+ set(internals[member["series_id"]]["run_metrics"]) for member in members
+ ))
+ if len(run_ids) < REQUIRED_ALLOCATIONS:
+ return False, len(run_ids)
+ orders: list[tuple[str, str, int, str, tuple[str, ...]]] = []
+ for run_id in sorted(run_ids):
+ for token in tokens:
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile"):
+ for statistic in ("p50", "p99"):
+ ordered = tuple(
+ member["series_id"]
+ for member in sorted(
+ members,
+ key=lambda item: (
+ internals[item["series_id"]]["run_metrics"][run_id][token][measure][statistic],
+ item["series_id"],
+ ),
+ reverse=measure == "logical_payload_rate_gbps_at_latency_percentile",
+ )
+ )
+ orders.append((measure, statistic, token, run_id, ordered))
+ for token in tokens:
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile"):
+ for statistic in ("p50", "p99"):
+ observed = {
+ entry[4]
+ for entry in orders
+ if entry[0] == measure and entry[1] == statistic and entry[2] == token
+ }
+ if len(observed) != 1:
+ return False, len(run_ids)
+ return True, len(run_ids)
+
+
+def build_decisions(
+    series: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]]
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
+    """Derive cohorts, rankings, recommendations, and routing sensitivities.
+
+    For each cohort kind (library, chip, system, routing) the series are grouped by
+    the canonical form of their control mapping. A group becomes a cohort when it
+    has at least two entries and at least two distinct variants; each cohort gets an
+    eligibility record. Decision-grade cohorts then yield one ranking per
+    (token, measure, statistic). Cohorts whose publication tier is "official" also
+    yield a recommendation for the top-ranked entry. Routing cohorts with a uniform,
+    non-EPLB member yield a signed change ratio for every other member.
+
+    Returns ``(cohorts, rankings, recommendations, sensitivities)``, each list
+    sorted by its derived ID.
+    """
+    cohorts: list[dict[str, Any]] = []
+    for kind in ("library", "chip", "system", "routing"):
+        # Canonical control bytes -> (series, variant, controlled, varying) entries.
+        groups: dict[bytes, list[tuple[dict[str, Any], Any, list[str], list[str]]]] = {}
+        for item in series:
+            # Library cohorts only take "library"-role backends; system cohorts only
+            # take "reference"-role backends. Chip and routing take every series.
+            if kind == "library" and item["backend"]["role"] != "library":
+                continue
+            if kind == "system" and item["backend"]["role"] != "reference":
+                continue
+            control, controlled, varying, variant = _cohort_control(kind, item, internals[item["series_id"]])
+            groups.setdefault(_canonical(control), []).append((item, variant, controlled, varying))
+        for entries in groups.values():
+            variants = {_canonical(entry[1]) for entry in entries}
+            # A contrast needs at least two series that actually differ in the variant.
+            if len(entries) < 2 or len(variants) < 2:
+                continue
+            members = sorted((entry[0] for entry in entries), key=lambda item: item["series_id"])
+            token_sets = [set(point["tokens_per_rank"] for point in member["points"]) for member in members]
+            # tokens: points every member has; same_points: all members have identical points.
+            tokens = sorted(set.intersection(*token_sets))
+            same_points = len({tuple(sorted(values)) for values in token_sets}) == 1
+            ordering, aligned_runs = _cohort_ordering(members, internals, tokens) if tokens else (False, 0)
+            allocations = sorted({value for member in members for value in member["allocation_ids"]})
+            # The cohort carries the worst (largest) non-null ratio among its members.
+            p50_ratio = max(
+                (member["eligibility"]["p50_max_min_ratio"] for member in members
+                 if member["eligibility"]["p50_max_min_ratio"] is not None), default=None
+            )
+            p99_ratio = max(
+                (member["eligibility"]["p99_max_min_ratio"] for member in members
+                 if member["eligibility"]["p99_max_min_ratio"] is not None), default=None
+            )
+            # Member reasons are inherited except "unstable-ordering"; ordering
+            # stability is passed to the eligibility record separately below.
+            extra = {
+                reason for member in members for reason in member["eligibility"]["reasons"]
+                if reason not in {"unstable-ordering"}
+            }
+            if aligned_runs < REQUIRED_ALLOCATIONS:
+                extra.add("incomplete-aligned-repeats")
+            # Routing cohorts need exactly one uniform, non-EPLB member as baseline.
+            if kind == "routing" and sum(
+                member["workload"]["routing"] == "uniform"
+                and not member["workload"]["eplb"]
+                for member in members
+            ) != 1:
+                extra.add("missing-uniform-baseline")
+            if kind == "routing" and {
+                (member["workload"]["routing"], member["workload"]["eplb"])
+                for member in members
+            } != {("uniform", False), ("zipf", False), ("zipf", True)}:
+                extra.add("incomplete-routing-anchors")
+            if kind == "routing" and _routing_implementation_mismatch(members):
+                extra.add("implementation-config-mismatch")
+            # Routing cohorts only need some shared token; other kinds need identical points.
+            if not tokens or (kind != "routing" and not same_points):
+                extra.add("unmatched-token-coverage")
+            eligibility = _eligibility_record(
+                allocations,
+                complete=all(member["eligibility"]["complete"] for member in members)
+                and bool(tokens) and (kind == "routing" or same_points),
+                correct=all(member["eligibility"]["correct"] for member in members),
+                measured=all(member["eligibility"]["measured_roundtrip_p99"] for member in members),
+                stable_ordering=ordering,
+                p50_ratio=p50_ratio,
+                p99_ratio=p99_ratio,
+                extra_reasons=sorted(extra),
+            )
+            member_ids = [member["series_id"] for member in members]
+            # One experimental member downgrades the whole cohort's tier.
+            publication_tier = (
+                "comparable-experimental"
+                if any(member["publication_tier"] == "comparable-experimental" for member in members)
+                else "official"
+            )
+            # The factor-name lists are taken from the group's first entry.
+            controlled, varying = entries[0][2], entries[0][3]
+            cohort_id = _derived_id("cxcohort-v1-", {
+                "kind": kind, "series_ids": member_ids,
+                "controlled_factors": controlled, "varying_factors": varying,
+            })
+            # "chip" cohorts are labelled "Platform"; other kinds use the title-cased kind.
+            kind_label = "Platform" if kind == "chip" else kind.title()
+            first = members[0]
+            routing_label = first["workload"]["routing"] + (
+                "+EPLB" if first["workload"]["eplb"] else ""
+            )
+            # All four context strings are built eagerly; only the one for `kind` is used.
+            context = {
+                "library": (
+                    f"{first['system']['sku'].upper()} EP{first['system']['ep_size']} / "
+                    f"{first['phase']} / {routing_label}"
+                ),
+                "chip": (
+                    f"{first['backend']['label']} EP{first['system']['ep_size']} / "
+                    f"{first['phase']} / {routing_label}"
+                ),
+                "system": (
+                    f"Reference EP{first['system']['ep_size']} / {first['phase']} / "
+                    f"{routing_label}"
+                ),
+                "routing": (
+                    f"{first['system']['sku'].upper()} / {first['backend']['label']} / "
+                    f"EP{first['system']['ep_size']} / {first['phase']}"
+                ),
+            }[kind]
+            cohorts.append({
+                "cohort_id": cohort_id,
+                "kind": kind,
+                "label": f"{context} / {kind_label} contrast ({len(members)} series)",
+                "description": (
+                    "Publisher-controlled NCCL/RCCL system comparison"
+                    if kind == "system"
+                    else f"Publisher-controlled {kind_label.lower()} comparison"
+                ),
+                "series_ids": member_ids,
+                "controlled_factors": controlled,
+                "varying_factors": varying,
+                "publication_tier": publication_tier,
+                "eligibility": eligibility,
+            })
+    cohorts.sort(key=lambda item: item["cohort_id"])
+    series_by_id = {item["series_id"]: item for item in series}
+    rankings: list[dict[str, Any]] = []
+    recommendations: list[dict[str, Any]] = []
+    sensitivities: list[dict[str, Any]] = []
+    for cohort in cohorts:
+        # Only decision-grade cohorts produce rankings, recommendations, or sensitivities.
+        if not cohort["eligibility"]["decision_grade"]:
+            continue
+        members = [series_by_id[series_id] for series_id in cohort["series_ids"]]
+        tokens = sorted(set.intersection(*(
+            {point["tokens_per_rank"] for point in member["points"]} for member in members
+        )))
+        for token in tokens:
+            for measure, objective, unit in (
+                ("latency_us", "min", "us"), ("logical_payload_rate_gbps_at_latency_percentile", "max", "GB/s")
+            ):
+                for statistic in ("p50", "p99"):
+                    metric = {
+                        "operation": "roundtrip", "statistic": statistic,
+                        "measure": measure, "objective": objective,
+                        "tokens_per_rank": token, "phase": members[0]["phase"],
+                    }
+                    entries = []
+                    for member in members:
+                        point_id, value, observed_unit = _metric_value(member, metric)
+                        if observed_unit != unit:
+                            raise PublisherError("publisher metric unit differs")
+                        # Rank 0 is a placeholder; real ranks are assigned after sorting.
+                        entries.append({
+                            "rank": 0, "series_id": member["series_id"], "point_id": point_id,
+                            "value": value, "unit": unit,
+                        })
+                    # Best first: ascending for "min" objectives, descending for "max".
+                    entries.sort(key=lambda item: (item["value"], item["series_id"]), reverse=objective == "max")
+                    for rank, entry in enumerate(entries, 1):
+                        entry["rank"] = rank
+                    ranking_id = _derived_id("cxranking-v1-", {
+                        "cohort_id": cohort["cohort_id"], "metric": metric,
+                    })
+                    metric_label = _metric_label(measure, statistic)
+                    rankings.append({
+                        "ranking_id": ranking_id, "cohort_id": cohort["cohort_id"],
+                        "label": f"{cohort['kind'].title()} {metric_label} T={token}",
+                        "metric": metric, "entries": entries,
+                        "publication_tier": cohort["publication_tier"],
+                        "eligibility": cohort["eligibility"],
+                    })
+                    # Recommendations are emitted only for official-tier cohorts.
+                    if cohort["publication_tier"] != "official":
+                        continue
+                    objective_name = (
+                        f"min-{statistic}-latency"
+                        if measure == "latency_us"
+                        else f"max-payload-rate-at-{statistic}-latency"
+                    )
+                    top = entries[0]
+                    recommendation_id = _derived_id("cxrecommendation-v1-", {
+                        "objective": objective_name, "ranking_id": ranking_id,
+                    })
+                    recommendations.append({
+                        "recommendation_id": recommendation_id,
+                        "cohort_id": cohort["cohort_id"],
+                        "label": f"Best {metric_label} at T={token}",
+                        "objective": objective_name,
+                        "series_id": top["series_id"], "point_id": top["point_id"],
+                        "value": top["value"], "unit": top["unit"],
+                        "rationale": "Top stable measured roundtrip result in a controlled cohort",
+                        "publication_tier": cohort["publication_tier"],
+                        "eligibility": cohort["eligibility"],
+                    })
+        if cohort["kind"] == "routing":
+            # The first uniform, non-EPLB member is the sensitivity baseline.
+            baseline = next(
+                (member for member in members
+                 if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]),
+                None,
+            )
+            if baseline:
+                for candidate in members:
+                    if candidate is baseline:
+                        continue
+                    for token in tokens:
+                        for measure, objective in (("latency_us", "min"), ("logical_payload_rate_gbps_at_latency_percentile", "max")):
+                            for statistic in ("p50", "p99"):
+                                metric = {
+                                    "operation": "roundtrip", "statistic": statistic,
+                                    "measure": measure, "objective": objective,
+                                    "tokens_per_rank": token, "phase": baseline["phase"],
+                                }
+                                _, base_value, _ = _metric_value(baseline, metric)
+                                _, candidate_value, _ = _metric_value(candidate, metric)
+                                sensitivity_id = _derived_id("cxsensitivity-v1-", {
+                                    "baseline": baseline["series_id"], "candidate": candidate["series_id"],
+                                    "cohort": cohort["cohort_id"], "metric": metric,
+                                })
+                                # NOTE(review): the ratio below divides by base_value with no
+                                # zero guard; presumably decision-grade metrics are strictly
+                                # positive - confirm against the series schema.
+                                sensitivities.append({
+                                    "sensitivity_id": sensitivity_id,
+                                    "cohort_id": cohort["cohort_id"],
+                                    "label": (
+                                        f"Routing sensitivity: "
+                                        f"{_metric_label(measure, statistic)} T={token}"
+                                    ),
+                                    "baseline_series_id": baseline["series_id"],
+                                    "candidate_series_id": candidate["series_id"],
+                                    "metric": metric,
+                                    "signed_change_ratio": (candidate_value - base_value) / base_value,
+                                    "publication_tier": cohort["publication_tier"],
+                                    "eligibility": cohort["eligibility"],
+                                })
+    rankings.sort(key=lambda item: item["ranking_id"])
+    recommendations.sort(key=lambda item: item["recommendation_id"])
+    sensitivities.sort(key=lambda item: item["sensitivity_id"])
+    return cohorts, rankings, recommendations, sensitivities
+
+
+def _require_runnable_promotion_success(
+ bundles: Sequence[dict[str, Any]], cases: dict[str, dict[str, Any]]
+) -> None:
+ for bundle in bundles:
+ for case_id, case in cases.items():
+ if case["_disposition"] != "runnable":
+ continue
+ status, _ = _outcome(bundle["selected"][case_id])
+ if status != "success":
+ raise PublisherError(
+ "promotion requires every runnable matrix case to succeed "
+ "in every selected bundle"
+ )
+ prior_statuses = {
+ _outcome(document)[0]
+ for document in bundle["documents"].values()
+ if document["identity"]["case_id"] == case_id
+ }
+ if prior_statuses != {"success"}:
+ raise PublisherError(
+ "promotion rejects runnable cases with failed, invalid, or diagnostic retries"
+ )
+
+
+def _expected_chip_cohort_count(series: Sequence[dict[str, Any]]) -> int:
+ groups: dict[bytes, set[bytes]] = {}
+ for item in series:
+ control, variant = _public_cohort_factors("chip", item)
+ groups.setdefault(_canonical(control), set()).add(_canonical(variant))
+ return sum(len(variants) >= 2 for variants in groups.values())
+
+
+def _require_promotion_cohorts(
+ cohorts: Sequence[dict[str, Any]], series: Sequence[dict[str, Any]]
+) -> None:
+ eligible_kinds = {
+ cohort["kind"]
+ for cohort in cohorts
+ if cohort["eligibility"]["decision_grade"]
+ }
+ missing = [kind for kind in REQUIRED_COHORT_KINDS if kind not in eligible_kinds]
+ if missing:
+ raise PublisherError(
+ "promotion lacks decision-grade cohort kinds: " + ", ".join(missing)
+ )
+ for kind, expected in REQUIRED_PROMOTION_COHORT_COUNTS.items():
+ members = [cohort for cohort in cohorts if cohort["kind"] == kind]
+ if len(members) != expected or any(
+ not cohort["eligibility"]["decision_grade"] for cohort in members
+ ):
+ raise PublisherError(
+ f"promotion requires exactly {expected} decision-grade {kind} cohorts"
+ )
+
+ chip_cohorts = [cohort for cohort in cohorts if cohort["kind"] == "chip"]
+ expected_chips = _expected_chip_cohort_count(series)
+ if len(chip_cohorts) != expected_chips or any(
+ not cohort["eligibility"]["decision_grade"] for cohort in chip_cohorts
+ ):
+ raise PublisherError(
+ f"promotion requires all {expected_chips} derived chip cohorts to be decision-grade"
+ )
+
+ by_id = {item["series_id"]: item for item in series}
+ anchors = {("uniform", False), ("zipf", False), ("zipf", True)}
+ for cohort in (
+ item for item in cohorts
+ if item["kind"] == "routing" and item["eligibility"]["decision_grade"]
+ ):
+ observed = {
+ (by_id[series_id]["workload"]["routing"], by_id[series_id]["workload"]["eplb"]):
+ by_id[series_id]
+ for series_id in cohort["series_ids"]
+ }
+ if len(cohort["series_ids"]) != len(anchors) or set(observed) != anchors:
+ raise PublisherError(
+ "promotion routing cohorts require exact uniform, zipf, and zipf+EPLB anchors"
+ )
+ if (
+ observed[("uniform", False)]["build"]["implementation_contract_sha256"]
+ != observed[("zipf", False)]["build"]["implementation_contract_sha256"]
+ ):
+ raise PublisherError(
+ "promotion routing cohorts require identical off-EPLB generated implementation"
+ )
+
+
+def _require_promotion_series(series: Sequence[dict[str, Any]]) -> None:
+ if not series or any(item["status"] != "decision-grade" for item in series):
+ raise PublisherError("promotion has unstable or incomplete required series")
+
+
+def build_dataset(
+    store: Store,
+    bundle_ids: Sequence[str],
+    *,
+    promote: bool,
+) -> dict[str, Any]:
+    """Assemble and validate the public dataset document for the given bundles.
+
+    Loads every bundle, requires them to share one matrix digest and one case-ID
+    set, then builds the coverage, attempt, series, and decision sections and
+    validates the result with ``validate_public_dataset``.
+
+    With ``promote=True`` it additionally requires ``REQUIRED_ALLOCATIONS`` bundles
+    with distinct run IDs, the canonical full-v1 matrix, clean success of every
+    runnable case, decision-grade series, and the promotion cohort requirements.
+
+    Raises ``PublisherError`` when any of these requirements is not met.
+    """
+    if not bundle_ids or len(bundle_ids) != len(set(bundle_ids)):
+        raise PublisherError("dataset requires unique explicit bundle IDs")
+    loaded = [load_bundle(store, bundle_id) for bundle_id in bundle_ids]
+    # Order bundles by numeric run ID, then run attempt, then bundle ID, so the
+    # result does not depend on the order in which the caller listed them.
+    loaded.sort(key=lambda bundle: (
+        int(bundle["manifest"]["run"]["run_id"]),
+        bundle["manifest"]["run"]["run_attempt"],
+        bundle["id"],
+    ))
+    matrix_ids = {bundle["manifest"]["matrix"]["sha256"] for bundle in loaded}
+    case_sets = [{case["case_id"] for case in bundle["cases"]} for bundle in loaded]
+    if len(matrix_ids) != 1 or len({tuple(sorted(values)) for values in case_sets}) != 1:
+        raise PublisherError("dataset bundles do not share one exact requested matrix")
+    run_ids = [bundle["manifest"]["run"]["run_id"] for bundle in loaded]
+    if promote and (
+        len(loaded) != REQUIRED_ALLOCATIONS
+        or len(run_ids) != len(set(run_ids))
+    ):
+        raise PublisherError("promotion requires three independent complete workflow runs")
+    if promote and matrix_ids != {CANONICAL_FULL_V1_MATRIX_SHA256}:
+        raise PublisherError("promotion requires the canonical full-v1 matrix")
+    # All bundles share the same case set (checked above), so the first is representative.
+    cases = {case["case_id"]: case for case in loaded[0]["cases"]}
+    if promote:
+        _require_runnable_promotion_success(loaded, cases)
+    all_documents = [
+        document for bundle in loaded for document in bundle["documents"].values()
+    ]
+    selected_ids = {
+        selection["selected_attempt_id"]
+        for bundle in loaded for selection in bundle["manifest"]["coverage"]["selections"]
+    }
+    public_attempts = [
+        _public_attempt(
+            document, selected=document["identity"]["attempt_id"] in selected_ids
+        )
+        for document in all_documents
+    ]
+    _unique([attempt["attempt_id"] for attempt in public_attempts], "dataset attempts")
+    # Per case: the selected document of each bundle, in sorted bundle order.
+    selected_by_case: dict[str, list[dict[str, Any]]] = {
+        case_id: [bundle["selected"][case_id] for bundle in loaded]
+        for case_id in sorted(cases)
+    }
+    coverage: list[dict[str, Any]] = []
+    for case_id, case in sorted(cases.items()):
+        attempts = sorted(
+            (attempt for attempt in public_attempts if attempt["case_id"] == case_id),
+            key=lambda attempt: (
+                int(attempt["run_id"]), attempt["run_attempt"],
+                attempt["attempt_index"], attempt["attempt_id"],
+            ),
+        )
+        # Coverage reports the selection from the last bundle in sorted order.
+        selected = _public_attempt(selected_by_case[case_id][-1], selected=True)
+        coverage.append({
+            "case_id": case_id,
+            "label": f"{case['sku'].upper()} / {case['backend']} / EP{case['ep']} / {case['phase']} / {case['routing']}",
+            "required": True,
+            "sku": _slug(case["sku"]),
+            "backend": _slug(case["backend"]),
+            "phase": case["phase"],
+            "disposition": case["_disposition"],
+            "selected_attempt_id": selected["attempt_id"],
+            "outcome": selected["outcome"],
+            "failure_mode": selected["failure_mode"],
+            # Unsupported cases report the matrix's own reason, not the attempt's.
+            "reason": case["_reason"] if case["_disposition"] == "unsupported" else selected["reason"],
+            "attempt_ids": [attempt["attempt_id"] for attempt in attempts],
+        })
+    # Only selected raw-format documents with a successful outcome feed a series.
+    by_series: dict[str, list[dict[str, Any]]] = {}
+    for case_documents in selected_by_case.values():
+        for document in case_documents:
+            if (
+                document["format"] == contracts.RAW_FORMAT
+                and document["outcome"]["status"] == "success"
+            ):
+                by_series.setdefault(document["identity"]["series_id"], []).append(document)
+    series: list[dict[str, Any]] = []
+    internals: dict[str, dict[str, Any]] = {}
+    for series_id, documents in sorted(by_series.items()):
+        item, internal = _build_series(series_id, documents, len(loaded))
+        series.append(item)
+        internals[series_id] = internal
+    cohorts, rankings, recommendations, sensitivities = build_decisions(series, internals)
+    allocation_ids = sorted({attempt["allocation_id"] for attempt in public_attempts})
+    status = "promoted" if promote else "diagnostic"
+    dataset = {
+        "format": FORMAT_PUBLIC,
+        "schema_version": 1,
+        "generated_at": _latest_timestamp(
+            [bundle["manifest"]["created_at"] for bundle in loaded]
+        ),
+        "source_bundle_ids": sorted(bundle_ids),
+        "promotion": {
+            "status": status,
+            "reason": None,
+            "matrix_id": next(iter(matrix_ids)),
+            "allocation_ids": allocation_ids,
+            "required_allocations": REQUIRED_ALLOCATIONS,
+            "requested_cases": len(coverage),
+            "terminal_cases": len(coverage),
+            "policy": POLICY,
+        },
+        "coverage": coverage,
+        "attempts": sorted(public_attempts, key=lambda attempt: attempt["attempt_id"]),
+        "series": series,
+        "cohorts": cohorts,
+        "rankings": rankings,
+        "recommendations": recommendations,
+        "sensitivities": sensitivities,
+    }
+    # Promotion gates on the derived series and cohorts run before final validation.
+    if promote:
+        _require_promotion_series(series)
+        _require_promotion_cohorts(cohorts, series)
+    validate_public_dataset(dataset)
+    return dataset
+
+
+def _quarantine_dataset(reason: str, generated_at: str) -> dict[str, Any]:
+ dataset = {
+ "format": FORMAT_PUBLIC,
+ "schema_version": 1,
+ "generated_at": generated_at,
+ "source_bundle_ids": [],
+ "promotion": {
+ "status": "quarantined",
+ "reason": reason,
+ "matrix_id": None,
+ "allocation_ids": [],
+ "required_allocations": REQUIRED_ALLOCATIONS,
+ "requested_cases": 0,
+ "terminal_cases": 0,
+ "policy": POLICY,
+ },
+ "coverage": [],
+ "attempts": [],
+ "series": [],
+ "cohorts": [],
+ "rankings": [],
+ "recommendations": [],
+ "sensitivities": [],
+ }
+ validate_public_dataset(dataset)
+ return dataset
+
+
+def quarantine_incoming(
+    store: Store, ingest_id: str, reason: str, generated_at: str
+) -> str:
+    """Record a rejected incoming delivery and publish a quarantined dataset.
+
+    Writes a quarantine manifest under ``store.quarantine`` in a directory named
+    after the SHA of the manifest's canonical form, re-reads the installed manifest
+    to confirm it hashes to the same digest, then installs a quarantined public
+    dataset and points the ``latest-attempt`` channel at it.
+
+    Returns the quarantine manifest digest. Raises ``PublisherError`` when
+    ``reason`` or ``f"{reason}-{ingest_id}"`` does not fully match ``REASON``, or
+    when the installed quarantine object differs from the expected manifest.
+    """
+    if REASON.fullmatch(reason) is None:
+        raise PublisherError("quarantine reason must be a machine code")
+    # The public reason embeds the incoming ID and must itself satisfy REASON.
+    public_reason = f"{reason}-{ingest_id}"
+    if REASON.fullmatch(public_reason) is None:
+        raise PublisherError("quarantine reason and incoming ID exceed the public reason contract")
+    manifest = {
+        "format": "collectivex.quarantine.v1",
+        "schema_version": 1,
+        "created_at": generated_at,
+        "incoming_id": ingest_id,
+        "reason": reason,
+    }
+    digest = _sha_bytes(_canonical(manifest))
+    with store.staging(store.quarantine, private=True) as stage:
+        _write_json(stage / "quarantine.json", manifest, mode=0o600)
+        store.complete(stage, digest, private=True)
+        store.install(stage, store.quarantine / digest, private=True)
+    # Re-read what is on disk: an object already present at this digest must
+    # carry the same manifest.
+    if _sha_bytes(_canonical(strict_load(store.quarantine / digest / "quarantine.json"))) != digest:
+        raise PublisherError("existing quarantine object differs")
+    # The incoming digest distinguishes separate rejected deliveries while preserving
+    # byte-identical output when the operator retries the same immutable input.
+    dataset = _quarantine_dataset(public_reason, generated_at)
+    dataset_digest, size = store.install_dataset(dataset)
+    store.update_channel("latest-attempt", dataset_digest, size, generated_at)
+    return digest
+
+
+def _store_from_args(args: argparse.Namespace) -> Store:
+ root = args.store_root or os.environ.get("COLLECTIVEX_STORE_ROOT")
+ if not root:
+ raise PublisherError("COLLECTIVEX_STORE_ROOT or --store-root is required")
+ if not Path(root).is_absolute():
+ raise PublisherError("COLLECTIVEX_STORE_ROOT must be an absolute path")
+ return Store(root)
+
+
+def _run_metadata(args: argparse.Namespace) -> dict[str, Any]:
+ """Validate offline operator assertions about a completed successful GHA run.
+
+ The publisher deliberately performs no network access. The caller must preflight workflow
+ identity and conclusion against GitHub before supplying these values; artifact-internal
+ provenance is then required to match them exactly.
+ """
+ run = {
+ "repository": args.repository,
+ "run_id": args.run_id,
+ "run_attempt": args.run_attempt,
+ "source_sha": args.source_sha,
+ }
+ # Reuse the authoritative private schema constraints before any filesystem mutation.
+ if not re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", run["repository"] or ""):
+ raise PublisherError("--repository must be owner/name")
+ if not re.fullmatch(r"[1-9][0-9]*", run["run_id"] or ""):
+ raise PublisherError("--run-id must be a positive decimal string")
+ if type(run["run_attempt"]) is not int or run["run_attempt"] < 1:
+ raise PublisherError("--run-attempt must be positive")
+ if not re.fullmatch(r"[0-9a-f]{40}", run["source_sha"] or ""):
+ raise PublisherError("--source-sha must be a 40-character lowercase Git SHA")
+ return run
+
+
+def _ingest_inputs(
+ args: argparse.Namespace,
+) -> tuple[dict[str, Any], Path, list[Path]]:
+ run = _run_metadata(args)
+ matrix = Path(args.matrix).absolute()
+ if matrix.is_symlink() or not matrix.is_file():
+ raise PublisherError("--matrix must be a regular non-symlink file")
+ artifacts = [Path(value).absolute() for value in args.artifact]
+ if not artifacts:
+ raise PublisherError("at least one --artifact is required")
+ names = [_artifact_name(path) for path in artifacts]
+ if len(names) != len(set(names)):
+ raise PublisherError("--artifact contains duplicate GHA names")
+ for path in artifacts:
+ if path.is_symlink() or not (path.is_dir() or path.is_file()):
+ raise PublisherError("--artifact must be a regular ZIP or real directory")
+ return run, matrix, artifacts
+
+
+def _bundle_ids(values: Sequence[str], *, promote: bool) -> list[str]:
+ bundle_ids = list(values)
+ if (
+ not bundle_ids
+ or len(bundle_ids) != len(set(bundle_ids))
+ or any(HEX64.fullmatch(value) is None for value in bundle_ids)
+ ):
+ raise PublisherError("bundle IDs must be unique SHA-256 digests")
+ if promote and len(bundle_ids) != REQUIRED_ALLOCATIONS:
+ raise PublisherError("promotion requires exactly three explicit bundle IDs")
+ return bundle_ids
+
+
+def ingest_command(args: argparse.Namespace) -> dict[str, Any]:
+ run, matrix, artifacts = _ingest_inputs(args)
+ store = _store_from_args(args)
+ with store.locked():
+ ingest_id, incoming, _ = archive_incoming(
+ store, matrix, artifacts, run
+ )
+ try:
+ bundle_id, _, _ = build_bundle(store, ingest_id, incoming, run)
+ dataset = build_dataset(store, [bundle_id], promote=False)
+ dataset_id, size = store.install_dataset(dataset)
+ store.update_channel(
+ "latest-attempt", dataset_id, size, dataset["generated_at"]
+ )
+ store.verify_channel("latest-attempt")
+ return {
+ "status": "accepted", "incoming_id": ingest_id,
+ "bundle_id": bundle_id, "dataset_sha256": dataset_id,
+ "channel": "latest-attempt",
+ }
+ except (
+ PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError,
+ jsonschema.ValidationError,
+ ) as exc:
+ # Invalid delivery bytes provide no trusted timestamp. A fixed sentinel keeps
+ # repeated quarantine of the same immutable incoming object content-idempotent.
+ generated_at = "1970-01-01T00:00:00Z"
+ quarantine_id = quarantine_incoming(
+ store, ingest_id, "artifact-validation-failed", generated_at
+ )
+ raise PublisherError(
+ f"incoming {ingest_id} quarantined as {quarantine_id}: {exc}"
+ ) from exc
+
+
+def promote_command(args: argparse.Namespace) -> dict[str, Any]:
+ bundle_ids = _bundle_ids(args.bundle, promote=True)
+ store = _store_from_args(args)
+ with store.locked():
+ dataset = build_dataset(store, bundle_ids, promote=True)
+ digest, size = store.install_dataset(dataset)
+ store.update_channel("dev-latest", digest, size, dataset["generated_at"])
+ store.verify_channel("dev-latest")
+ return {
+ "status": "promoted", "bundle_ids": bundle_ids,
+ "dataset_sha256": digest, "channel": "dev-latest",
+ }
+
+
+def verify_command(args: argparse.Namespace) -> dict[str, Any]:
+ bundle_ids = _bundle_ids(args.bundle, promote=False) if args.bundle else []
+ channels = args.channel or ["latest-attempt"]
+ if any(channel not in {"latest-attempt", "dev-latest"} for channel in channels):
+ raise PublisherError("unknown channel")
+ store = _store_from_args(args)
+ if args.channel is None and (store.channels / "dev-latest.json").is_file():
+ channels.append("dev-latest")
+ with store.locked():
+ pointers = {channel: store.verify_channel(channel) for channel in channels}
+ bundles = [load_bundle(store, bundle_id)["id"] for bundle_id in bundle_ids]
+ return {"status": "verified", "channels": pointers, "bundle_ids": bundles}
+
+
+def _parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="CollectiveX isolated filesystem publisher")
+ parser.add_argument("--store-root", help="defaults to COLLECTIVEX_STORE_ROOT")
+ subparsers = parser.add_subparsers(dest="command", required=True)
+ ingest = subparsers.add_parser("ingest", help="archive and validate one complete GHA run")
+ ingest.add_argument("--matrix", required=True)
+ ingest.add_argument("--artifact", action="append", required=True)
+ ingest.add_argument("--repository", required=True)
+ ingest.add_argument("--run-id", required=True)
+ ingest.add_argument("--run-attempt", required=True, type=int)
+ ingest.add_argument("--source-sha", required=True)
+ promote = subparsers.add_parser("promote", help="publish explicit independent bundles")
+ promote.add_argument("--bundle", action="append", required=True)
+ verify = subparsers.add_parser("verify", help="verify immutable targets and pointers")
+ verify.add_argument("--channel", action="append", choices=["latest-attempt", "dev-latest"])
+ verify.add_argument("--bundle", action="append", default=[])
+ return parser
+
+
+def main() -> int:
+ args = _parser().parse_args()
+ try:
+ if args.command == "ingest":
+ result = ingest_command(args)
+ elif args.command == "promote":
+ result = promote_command(args)
+ elif args.command == "verify":
+ result = verify_command(args)
+ else:
+ raise PublisherError(f"unknown command {args.command!r}")
+ except (
+ PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError,
+ jsonschema.ValidationError, OSError,
+ ) as exc:
+ print(json.dumps({"status": "error", "error": str(exc)}), file=sys.stderr)
+ return 2
+ print(json.dumps(result, sort_keys=True))
+ return 0
+
+
+# Script entry point: exit the process with the status returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 0000000000..f68f97d83d
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,8 @@
+# Host-side matrix generation. GPU libraries are supplied by benchmark images.
+PyYAML==6.0.2
+
+# Canonical workload serialization.
+numpy>=1.26,<3
+
+# Host-only strict artifact publisher schemas (never imported by GPU execution).
+jsonschema==4.25.1
diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh
new file mode 100644
index 0000000000..3720afcf07
--- /dev/null
+++ b/experimental/CollectiveX/runtime/common.sh
@@ -0,0 +1,1686 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+_CX_COMMON_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+CX_SQUASH_FORMAT_VERSION="repro-v1"
+CX_SQUASH_SOURCE_DATE_EPOCH=1
+CX_DEEPEP_V2_COMMIT="fa8a9b16898204afd347c663b89e65ef87dc6ce6" # pragma: allowlist secret
+CX_DEEPEP_V2_TREE="29809e75c5874e6609dac4804e7b651d5226959f" # pragma: allowlist secret
+CX_DEEPEP_V2_FMT_COMMIT="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" # pragma: allowlist secret
+CX_DEEPEP_HYBRID_COMMIT="e0a5b1d9848ab3e7b4a67842bf06f067bfac67f8" # pragma: allowlist secret
+CX_DEEPEP_HYBRID_TREE="d77aeab7f1bb52b615666fe178d26ced41fae08e" # pragma: allowlist secret
+unset COLLECTIVEX_OPERATOR_CONFIG_LOADED COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+
+cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
+cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }
+
+# Public failure telemetry is a closed vocabulary. Raw scheduler, container,
+# host, and filesystem diagnostics stay in the mode-0600 private logs.
+cx_set_failure_stage() {
+ local stage="$1"
+ case "$stage" in
+ setup|repository-stage|registry-verification|scheduler-allocation|container-import) ;;
+ container-hash|container-launch|backend-setup|execution|artifact-collection) ;;
+ *) cx_die "invalid launcher failure stage" ;;
+ esac
+ export CX_FAILSAFE_MODE="$stage"
+}
+
+cx_fail_stage() {
+ local stage="$1" log_path="${2:-}" diagnostic="unknown"
+ cx_set_failure_stage "$stage"
+ if [ -n "$log_path" ] && [ -f "$log_path" ]; then
+ if grep -aEqi 'no space left|disk quota|quota exceeded' "$log_path"; then
+ diagnostic="storage-capacity"
+ elif grep -aEqi 'permission denied|operation not permitted|read-only file system|source mount (creation|ownership validation|permission inspection|permission normalization|permission validation) failed' "$log_path"; then
+ diagnostic="storage-permission"
+ elif grep -aEqi 'outside one realized LSA domain|lsa(Size| team| domain).*(mismatch|invalid|expected)|ranks.*not in (one|the same) nvlink.domain' "$log_path"; then
+ diagnostic="accelerator-topology"
+ elif grep -aEqi 'cuda driver version is insufficient|call requires newer driver|cudaErrorCallRequiresNewerDriver|CUDA_ERROR_SYSTEM_DRIVER_MISMATCH|unsupported toolchain' "$log_path"; then
+ diagnostic="accelerator-driver"
+ elif grep -aEqi 'ncclDevCommCreate|ncclCommWindowRegister|ncclGetLsa(Device)?Pointer' "$log_path"; then
+ diagnostic="nccl-device-api"
+ elif grep -aEqi 'NVCC (PTX )?compilation failed|cuobjdump failed|invalid device (kernel )?image|no kernel image is available' "$log_path"; then
+ diagnostic="jit-toolchain"
+ elif grep -aEqi 'cuda out of memory|CUDA_ERROR_OUT_OF_MEMORY|out of memory.*cuda' "$log_path"; then
+ diagnostic="accelerator-memory"
+ elif grep -aEqi 'does not match its pinned image contract|requires the exact pinned|version mismatch' "$log_path"; then
+ diagnostic="backend-version"
+ elif grep -aEqi 'nvshmem is unavailable|build-tool installation failed' "$log_path"; then
+ diagnostic="backend-dependency"
+ elif grep -aEqi 'revision fetch failed|submodule fetch failed|package installation failed|staged source is invalid|source (pin resolution|seed validation|seed copy|checkout creation|publication validation|existing source validation) failed' "$log_path"; then
+ diagnostic="backend-source"
+ elif grep -aEqi 'failed to mount|squashfs|enroot|pyxis|mount.*invalid argument|invalid argument.*mount' "$log_path"; then
+ diagnostic="container-runtime"
+ elif grep -aEqi 'backend preparation failed|build (failed|is incomplete)|cache (mount identity )?validation failed|import failed' "$log_path"; then
+ diagnostic="backend-build"
+ elif grep -aEqi 'command not found|not found on this runner|git lookup failed' "$log_path"; then
+ diagnostic="missing-runtime"
+ elif grep -aEqi 'too many requests|rate.?limit' "$log_path"; then
+ diagnostic="registry-rate-limit"
+ elif grep -aEqi 'timed out|operation timeout|wait timeout after|watchdog.*timeout|timeout: sending signal|connection reset|could not resolve|TLS|certificate' "$log_path"; then
+ diagnostic="network-or-timeout"
+ elif grep -aEqi 'salloc:|srun:.*(unable to create step|step creation|invalid partition|invalid account)|unable to create step|job allocation' "$log_path"; then
+ diagnostic="scheduler"
+ elif grep -aEqi 'SHARD done: [0-9]+/[0-9]+ case\(s\) failed|WARN: .* run failed rc=|completed with invalid semantic evidence' "$log_path"; then
+ diagnostic="benchmark-case-failure"
+ elif [ -s "$log_path" ]; then
+ diagnostic="unclassified"
+ else
+ diagnostic="empty-log"
+ fi
+ fi
+ cx_log "ERROR: failure-class=$stage diagnostic=$diagnostic"
+ return 1
+}
+
+# Runner-local deployment settings are strict JSON kept outside the checkout.
+# Only the selected runner's allowlisted values are exported; the document is
+# never sourced or evaluated as shell.
+#
+# Inputs (environment):
+#   COLLECTIVEX_OPERATOR_CONFIG          path of the JSON document; defaults to
+#                                        ${XDG_CONFIG_HOME:-$HOME/.config}/inferencex/collectivex.json
+#   COLLECTIVEX_OPERATOR_CONFIG_CONTENT  inline document, written to a private
+#                                        file first and removed after parsing
+#   COLLECTIVEX_OPERATOR_CONFIG_REQUIRED when 1, missing inline content is fatal
+#   CX_RUNNER / CX_SHARD_SKU / CX_PUBLIC_RUNNER  runner name, first one set wins
+# Effects: unsets then exports the CX_* settings emitted by the embedded
+# validator, and records the loading shell's PID in
+# COLLECTIVEX_OPERATOR_CONFIG_LOADED so a repeat call in that shell is a no-op.
+# Any validation failure is fatal (cx_die).
+cx_load_operator_config() {
+ [ -n "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" ] \
+ && [ "$COLLECTIVEX_OPERATOR_CONFIG_LOADED" = "$$" ] && return 0
+ local config_path generated=0 parsed_path config_log key value
+ # Drop inherited runner settings so only values from the document survive.
+ unset CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR CX_ENROOT_CACHE_PATH
+ unset ENROOT_CACHE_PATH
+ unset CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR CX_MASTER_PORT
+ config_path="${COLLECTIVEX_OPERATOR_CONFIG:-${XDG_CONFIG_HOME:-${HOME}/.config}/inferencex/collectivex.json}"
+ # Inline content is materialised either inside a validated private job root
+ # (owned by this user, mode 700, not a symlink) or in a fresh mktemp file.
+ if [ -n "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT:-}" ]; then
+ umask 077
+ if [[ "${CX_JOB_ROOT:-}" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ && [ -d "$CX_JOB_ROOT" ] && [ ! -L "$CX_JOB_ROOT" ] \
+ && [ "$(stat -c '%u:%a' "$CX_JOB_ROOT" 2>/dev/null)" = "$(id -u):700" ]; then
+ config_path="$CX_JOB_ROOT/operator-config.json"
+ (set -C; : > "$config_path") 2>/dev/null \
+ || cx_die "cannot create ephemeral runner configuration"
+ else
+ config_path="$(mktemp /tmp/inferencex-collectivex-config.XXXXXX)" \
+ || cx_die "cannot create ephemeral runner configuration"
+ fi
+ COLLECTIVEX_EPHEMERAL_CONFIG_PATH="$config_path"
+ generated=1
+ if ! printf '%s' "$COLLECTIVEX_OPERATOR_CONFIG_CONTENT" > "$config_path"; then
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT
+ rm -f -- "$config_path"
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ cx_die "cannot materialize runner configuration"
+ fi
+ elif [ "${COLLECTIVEX_OPERATOR_CONFIG_REQUIRED:-0}" = 1 ]; then
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT
+ cx_die "runner configuration is unavailable"
+ fi
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT COLLECTIVEX_OPERATOR_CONFIG_REQUIRED
+ # A missing document is not an error: mark the config as loaded and return.
+ if [ ! -e "$config_path" ]; then
+ COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$"
+ return 0
+ fi
+ umask 077
+ parsed_path="$(mktemp /tmp/inferencex-collectivex-parsed.XXXXXX)" || {
+ [ "$generated" = 0 ] || rm -f -- "$config_path"
+ cx_die "cannot parse runner configuration"
+ }
+ config_log="$(cx_private_log_path operator-config)"
+ if ! python3 - "$config_path" "${CX_RUNNER:-${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}}" \
+ > "$parsed_path" 2> "$config_log" <<'PY'
+import json
+import os
+import posixpath
+import re
+import stat
+import sys
+
+RUNNERS = {
+ "h100-dgxc", "h200-dgxc", "b200-dgxc", "b300",
+ "gb200", "gb300", "mi325x", "mi355x",
+}
+FIELDS = {
+ "partition": "CX_PARTITION",
+ "account": "CX_ACCOUNT",
+ "squash_dir": "CX_SQUASH_DIR",
+ "stage_dir": "CX_STAGE_DIR",
+ "enroot_cache_path": "CX_ENROOT_CACHE_PATH",
+ "exclude_nodes": "CX_EXCLUDE_NODES",
+ "nodelist": "CX_NODELIST",
+ "lock_dir": "CX_LOCK_DIR",
+}
+REQUIRED = {
+ "h100-dgxc": {"partition", "account", "squash_dir"},
+ "h200-dgxc": {"partition", "squash_dir"},
+ "b200-dgxc": {"partition", "account", "squash_dir"},
+ "b300": {"partition", "account", "squash_dir", "stage_dir"},
+ "gb200": {"partition", "account", "storage_roots"},
+ "gb300": {"partition", "account", "squash_dir", "stage_dir", "enroot_cache_path"},
+ "mi325x": {"partition", "squash_dir"},
+ "mi355x": {"partition", "squash_dir"},
+}
+ALLOWED = {
+ "h100-dgxc": REQUIRED["h100-dgxc"] | {"exclude_nodes", "stage_dir"},
+ "h200-dgxc": REQUIRED["h200-dgxc"] | {"account", "exclude_nodes", "stage_dir"},
+ "b200-dgxc": REQUIRED["b200-dgxc"] | {"exclude_nodes", "stage_dir"},
+ "b300": REQUIRED["b300"] | {"exclude_nodes"},
+ "gb200": REQUIRED["gb200"],
+ "gb300": REQUIRED["gb300"],
+ "mi325x": REQUIRED["mi325x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"},
+ "mi355x": REQUIRED["mi355x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"},
+}
+TOKEN = re.compile(r"^[A-Za-z0-9_.\[\],-]+$")
+PATH = re.compile(r"^/[A-Za-z0-9._/+\-]+$")
+IPV4 = re.compile(r"(? 65536
+ ):
+ raise ValueError
+ flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
+ descriptor = os.open(path, flags)
+ try:
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (metadata.st_dev, metadata.st_ino):
+ raise ValueError
+ payload = b""
+ while len(payload) <= 65536:
+ chunk = os.read(descriptor, 65537 - len(payload))
+ if not chunk:
+ break
+ payload += chunk
+ document = json.loads(
+ payload.decode("utf-8"),
+ object_pairs_hook=pairs,
+ parse_constant=lambda _: (_ for _ in ()).throw(ValueError()),
+ )
+ finally:
+ os.close(descriptor)
+ if (
+ set(document) != {"schema_version", "runners"}
+ or type(document["schema_version"]) is not int
+ or document["schema_version"] != 1
+ ):
+ raise ValueError
+ runners = document["runners"]
+ if (
+ not isinstance(runners, dict) or not runners or set(runners) - RUNNERS
+ or runner not in runners
+ ):
+ raise ValueError
+ selected = None
+ for name, config in runners.items():
+ if not isinstance(config, dict) or not REQUIRED[name].issubset(config):
+ raise ValueError
+ if set(config) - ALLOWED[name]:
+ raise ValueError
+ for field, value in config.items():
+ if field == "storage_roots":
+ if (
+ not isinstance(value, list) or not 1 <= len(value) <= 16
+ or len(value) != len(set(value)) or not all(valid_path(item) for item in value)
+ ):
+ raise ValueError
+ elif field.endswith(("_dir", "_path")):
+ if not valid_path(value):
+ raise ValueError
+ elif (
+ not isinstance(value, str) or not value or len(value) > 512
+ or not TOKEN.fullmatch(value) or IPV4.search(value)
+ ):
+ raise ValueError
+ if name == runner:
+ selected = dict(config)
+ if selected is None:
+ raise ValueError
+ roots = selected.pop("storage_roots", None)
+ if roots is not None:
+ for root in roots:
+ squash = posixpath.join(root, "collectivex", "containers")
+ stage = posixpath.join(root, "collectivex", "stage")
+ probes = []
+ try:
+ for directory in (squash, stage):
+ os.makedirs(directory, mode=0o700, exist_ok=True)
+ probe = posixpath.join(directory, f".write-probe-{os.getpid()}")
+ fd = os.open(probe, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+ os.close(fd)
+ probes.append(probe)
+ selected.update(squash_dir=squash, stage_dir=stage)
+ break
+ except OSError:
+ pass
+ finally:
+ for probe in probes:
+ try:
+ os.unlink(probe)
+ except OSError:
+ pass
+ else:
+ raise ValueError
+ for field, value in selected.items():
+ key = FIELDS[field]
+ sys.stdout.buffer.write(key.encode() + b"\0" + value.encode() + b"\0")
+except (KeyError, OSError, TypeError, UnicodeError, ValueError):
+ raise SystemExit(1)
+PY
+ then
+ # Validation failed: remove every temporary artefact before aborting.
+ rm -f -- "$parsed_path"
+ [ "$generated" = 0 ] || rm -f -- "$config_path"
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL
+ cx_die "runner-local configuration failed"
+ fi
+ # The validator emits NUL-terminated KEY, VALUE pairs; export each pair.
+ while IFS= read -r -d '' key && IFS= read -r -d '' value; do
+ printf -v "$key" '%s' "$value"
+ export "${key?}"
+ done < "$parsed_path"
+ rm -f -- "$parsed_path"
+ # Generated or explicitly ephemeral documents are deleted once consumed.
+ if [ "$generated" = 1 ] || [ "${COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL:-0}" = 1 ]; then
+ rm -f -- "$config_path" || cx_die "cannot remove ephemeral runner configuration"
+ fi
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL
+ COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$"
+}
+
+# Create an empty private log file and print its path on stdout.
+#
+# $1: log label. The directory tag is COLLECTIVEX_EXECUTION_ID, or manual_<pid>
+# when that is unset. Both must match [A-Za-z0-9][A-Za-z0-9._-]*.
+#
+# The file is /tmp/inferencex-collectivex-<uid>/<tag>/<label>.log. Root and tag
+# directories must be owned by the caller with mode 0700; the log is created
+# exclusively with mode 0600, so asking for the same label twice under one tag
+# fails. While here, sibling tag directories whose mtime is older than 24 hours
+# are removed on a best-effort basis. Any failure is fatal (cx_die).
+cx_private_log_path() {
+ local label="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}" path
+ path="$(python3 - "$tag" "$label" <<'PY' 2>/dev/null
+import os
+import re
+import shutil
+import stat
+import sys
+import time
+
+tag, label = sys.argv[1:]
+if not all(re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", value) for value in (tag, label)):
+ raise SystemExit(1)
+root = f"/tmp/inferencex-collectivex-{os.getuid()}"
+old_umask = os.umask(0o077)
+flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+try:
+ try:
+ os.mkdir(root, 0o700)
+ except FileExistsError:
+ pass
+ root_fd = os.open(root, flags)
+ try:
+ metadata = os.fstat(root_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise OSError("unsafe root")
+ cutoff = time.time() - 86400
+ for entry in os.scandir(root):
+ try:
+ if (
+ entry.name != tag and entry.is_dir(follow_symlinks=False)
+ and entry.stat(follow_symlinks=False).st_mtime < cutoff
+ ):
+ shutil.rmtree(entry.path)
+ except OSError:
+ pass
+ try:
+ os.mkdir(tag, 0o700, dir_fd=root_fd)
+ except FileExistsError:
+ pass
+ directory_fd = os.open(tag, flags, dir_fd=root_fd)
+ try:
+ metadata = os.fstat(directory_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise OSError("unsafe directory")
+ log_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
+ log_fd = os.open(f"{label}.log", log_flags, 0o600, dir_fd=directory_fd)
+ os.close(log_fd)
+ finally:
+ os.close(directory_fd)
+ finally:
+ os.close(root_fd)
+finally:
+ os.umask(old_umask)
+print(f"{root}/{tag}/{label}.log", end="")
+PY
+)" || cx_die "cannot create private runtime log"
+ # Print without a trailing newline so callers can capture the path as-is.
+ printf '%s' "$path"
+}
+
+# Manual successes delete diagnostics immediately. Canonical workflow logs survive
+# until artifact upload succeeds; failed logs remain private for debugging, and a
+# later run prunes abandoned directories older than 24 hours.
+#
+# $1: exit status of the run. Anything other than 0 leaves the logs in place.
+# On 0, the directory /tmp/inferencex-collectivex-<uid>/<tag> is removed, where
+# <tag> is COLLECTIVEX_EXECUTION_ID or manual_<pid>. The root must be owned by
+# the caller with mode 0700. Cleanup is best effort: output is discarded and
+# failures are ignored.
+cx_cleanup_private_logs() {
+ local rc="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}"
+ [ "$rc" = 0 ] || return 0
+ python3 - "$tag" <<'PY' >/dev/null 2>&1 || true
+import os
+import re
+import shutil
+import stat
+import sys
+
+tag = sys.argv[1]
+if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", tag):
+ raise SystemExit(1)
+root = f"/tmp/inferencex-collectivex-{os.getuid()}"
+flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+root_fd = os.open(root, flags)
+try:
+ metadata = os.fstat(root_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise SystemExit(1)
+finally:
+ os.close(root_fd)
+path = os.path.join(root, tag)
+if os.path.isdir(path) and not os.path.islink(path):
+ shutil.rmtree(path)
+PY
+}
+
+# Explicit Slurm export boundary. Operator config, runner credentials, HOME,
+# workspace paths, and unrelated service secrets never enter the container.
+cx_container_exports() {
+ printf '%s' 'COLLECTIVEX_SOURCE_SHA,COLLECTIVEX_ARTIFACT_NAME,COLLECTIVEX_EXECUTION_ID,COLLECTIVEX_CONTROL_SHA256,COLLECTIVEX_IMAGE,COLLECTIVEX_IMAGE_DIGEST,COLLECTIVEX_IMAGE_DIGEST_VERIFIED,COLLECTIVEX_SQUASH_SHA256,GITHUB_REF_NAME,GITHUB_REF,GITHUB_REPOSITORY,GITHUB_JOB,GITHUB_RUN_ID,GITHUB_RUN_ATTEMPT,GITHUB_SHA,CX_RUNNER,CX_BENCH,CX_NODES,CX_GPUS_PER_NODE,CX_SCALE_UP_DOMAIN,CX_SHARD_FILE,CX_SHARD_SKU,CX_NGPUS,CX_TS,CX_TOPO,CX_TRANSPORT,CX_PHASE,CX_ROUTING,CX_EPLB,CX_CASE_ID,CX_SUITE,CX_WORKLOAD_NAME,CX_REQUIRED_PUBLICATION,CX_HIDDEN,CX_TOPK,CX_EXPERTS,CX_TOKENS_LADDER,CX_CANONICAL,CX_ITERS,CX_TRIALS,CX_WARMUP,CX_SAMPLES_PER_POINT,CX_WARMUP_SEMANTICS,CX_SEED,CX_RUN_TIMEOUT,CX_NCCL_HOME,CX_ALLOW_MNNVL,CX_ATTEMPT_ID,CX_RUNTIME_MARKER,CX_MORI_KERNEL_TYPE,CX_WORKLOAD_DIR,CX_BACKEND_CACHE_ROOT,CX_BACKEND_CACHE_SENTINEL_SHA256,CX_BACKEND_SOURCE_ROOT,NCCL_CUMEM_ENABLE,NCCL_MNNVL_ENABLE,MC_FORCE_MNNVL,MORI_DISABLE_AUTO_XGMI,MORI_ENABLE_SDMA,MORI_APP_LOG_LEVEL,MORI_SHMEM_LOG_LEVEL,MORI_IO_LOG_LEVEL'
+ printf '%s' ',MORI_COMMIT'
+}
+
+# Host-side utility steps need only the basic login paths. They never receive
+# the complete Actions or runner environment.
+cx_host_exports() {
+ printf '%s' 'HOME,PATH,USER,XDG_CACHE_HOME,ENROOT_CACHE_PATH'
+}
+
+cx_prepare_runtime_marker() {
+ local mount_src="$1" tag="${COLLECTIVEX_EXECUTION_ID:-${CX_TS:-}}" marker
+ [[ "$tag" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]] \
+ || cx_die "cannot create runtime stage marker"
+ marker=".shards/runtime-stage-${tag}.txt"
+ mkdir -p "$mount_src/experimental/CollectiveX/.shards" >/dev/null 2>&1 \
+ || cx_die "cannot create runtime stage marker"
+ rm -f -- "$mount_src/experimental/CollectiveX/$marker" >/dev/null 2>&1 \
+ || cx_die "cannot reset runtime stage marker"
+ export CX_RUNTIME_MARKER="$marker"
+}
+
+cx_write_runtime_stage() {
+ local stage="$1" marker="${CX_RUNTIME_MARKER:-}"
+ [ -n "$marker" ] || return 0
+ [[ "$marker" =~ ^\.shards/runtime-stage-[A-Za-z0-9][A-Za-z0-9._-]*\.txt$ ]] \
+ || return 1
+ case "$stage" in backend-setup|execution) ;; *) return 1 ;; esac
+ printf '%s\n' "$stage" > "$marker"
+}
+
+# Read the runtime stage marker back on the host and adopt it as the failure
+# stage.
+#
+# $1: root of the staged checkout. When CX_RUNTIME_MARKER names a well-formed
+# marker that exists under experimental/CollectiveX, its first line is read,
+# the marker is deleted, and a recorded backend-setup or execution stage is
+# passed to cx_set_failure_stage. Everything else is a silent no-op.
+cx_adopt_runtime_stage() {
+  local mount_src="$1" marker="${CX_RUNTIME_MARKER:-}" stage="" marker_file
+  local marker_shape='^\.shards/runtime-stage-[A-Za-z0-9][A-Za-z0-9._-]*\.txt$'
+  [ -n "$marker" ] || return 0
+  [[ "$marker" =~ $marker_shape ]] || return 0
+  marker_file="$mount_src/experimental/CollectiveX/$marker"
+  [ -f "$marker_file" ] || return 0
+  # A marker without a trailing newline still yields its content via read.
+  IFS= read -r stage < "$marker_file" || true
+  rm -f -- "$marker_file" >/dev/null 2>&1 || true
+  if [ "$stage" = backend-setup ] || [ "$stage" = execution ]; then
+    cx_set_failure_stage "$stage"
+  fi
+}
+
+# Abort unless every named variable is set to a non-empty value.
+#
+# $@: variable names. All missing names are reported in a single fatal
+# message, so the operator can fix the configuration in one pass.
+cx_require_vars() {
+  local -a missing=()
+  local name
+  for name; do
+    if [ -z "${!name:-}" ]; then
+      missing+=("$name")
+    fi
+  done
+  if [ "${#missing[@]}" -ne 0 ]; then
+    cx_die \
+      "missing runner-local configuration: ${missing[*]} (set them in COLLECTIVEX_OPERATOR_CONFIG)"
+  fi
+}
+
+cx_require_single_node() {
+ [ "${CX_NODES:-1}" = "1" ] || cx_die "$1 supports one-node EP only"
+}
+
+# A set shard path is an execution contract, never a hint. Validate it before
+# staging/allocation and again in-container so a missing or stale control file
+# cannot silently fall back to a manual single-case run.
+cx_validate_shard_control() {
+ local cx_root="$1" shard="${CX_SHARD_FILE:-}" path expected_sku control_sha256
+ [ -n "$shard" ] || return 0
+ expected_sku="${CX_SHARD_SKU:-}"
+ [ -n "$expected_sku" ] || cx_die "CX_SHARD_SKU is required with CX_SHARD_FILE"
+ [ -n "${CX_BENCH:-}" ] || cx_die "CX_BENCH is required with CX_SHARD_FILE"
+ [[ "${CX_NODES:-}" =~ ^[1-9][0-9]*$ ]] \
+ || cx_die "positive CX_NODES is required with CX_SHARD_FILE"
+ path="$shard"
+ [ -f "$path" ] || path="${cx_root%/}/$shard"
+ [ -f "$path" ] || cx_die "shard control does not exist"
+ [ -s "$path" ] || cx_die "shard control is empty"
+ python3 "${cx_root%/}/sweep_matrix.py" \
+ --validate-control "$path" --expect-sku "$expected_sku" \
+ --expect-backend "$CX_BENCH" --expect-nodes "$CX_NODES" >/dev/null 2>&1 \
+ || cx_die "invalid shard control"
+ control_sha256="$(sha256sum "$path" | awk '{print $1}')"
+ [[ "$control_sha256" =~ ^[0-9a-f]{64}$ ]] \
+ || cx_die "cannot hash shard control"
+ export COLLECTIVEX_CONTROL_SHA256="$control_sha256"
+}
+
+cx_apply_timing_profile() {
+ [ -n "${CX_TIMING:-}" ] || return 0
+ local iters trials warmup extra
+ IFS=: read -r iters trials warmup extra <<< "$CX_TIMING"
+ [[ "$iters" =~ ^[1-9][0-9]*$ && "$trials" =~ ^[1-9][0-9]*$ \
+ && "$warmup" =~ ^[1-9][0-9]*$ && -z "$extra" ]] \
+ || cx_die "CX_TIMING must be positive iters:trials:warmup"
+ export CX_ITERS="$iters" CX_TRIALS="$trials" CX_WARMUP="$warmup"
+}
+
+# Allocate via salloc's stable grant message and assign JOB_ID in this shell.
+# Raw scheduler output remains in the bounded private execution log.
+#
+# $@: salloc arguments, forwarded ahead of --no-shell.
+# Side effects: sets CX_ALLOCATION_REQUESTED=1. When a job ID is parsed, sets
+# JOB_ID even if salloc exited non-zero, so the caller can still cancel a
+# granted allocation.
+# Returns 0 only when salloc succeeded and this call parsed a job ID.
+#
+# The final check uses the ID parsed by this call rather than the global
+# JOB_ID: a stale JOB_ID inherited from the caller must not turn "no ID in the
+# output" into success, and an unset JOB_ID must not trip `set -u`.
+cx_salloc_jobid() {
+  local log job_id salloc_rc=0
+  log="$(cx_private_log_path scheduler-allocation)"
+  CX_ALLOCATION_REQUESTED=1
+  # salloc has no portable --parsable option. Parse the stable grant message
+  # used by the production launchers, while also accepting a bare ID from
+  # site wrappers.
+  salloc "$@" --no-shell > "$log" 2>&1 || salloc_rc=$?
+  job_id="$(sed -nE \
+    -e 's/^([0-9]+)(;[^[:space:]]+)?$/\1/p' \
+    -e 's/.*Granted job allocation ([0-9]+).*/\1/p' \
+    "$log" | head -n1)"
+  if [ -n "$job_id" ]; then
+    [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
+    JOB_ID="$job_id"
+  fi
+  if [ "$salloc_rc" != 0 ] || [ -z "$job_id" ]; then
+    cx_fail_stage scheduler-allocation "$log"
+    return 1
+  fi
+}
+
+# Cancel a Slurm job and wait until squeue no longer lists it.
+#
+# $1: numeric job ID (anything else returns 1 immediately).
+# Polls up to 60 times, two seconds apart. A failing squeue call counts as
+# "not yet confirmed" rather than "gone". Returns 0 once squeue succeeds with
+# empty output, or 1 after logging an error when the job never disappears.
+cx_cancel_job() {
+  local job_id="$1" active attempt=0
+  [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
+  scancel "$job_id" >/dev/null 2>&1 || true
+  while [ "$attempt" -lt 60 ]; do
+    attempt=$((attempt + 1))
+    if active="$(squeue -h -j "$job_id" -o %A 2>/dev/null)" && [ -z "$active" ]; then
+      return 0
+    fi
+    sleep 2
+  done
+  cx_log "ERROR: scheduled allocation did not terminate during cleanup"
+  return 1
+}
+
+cx_write_cleanup_guard() {
+ local state="$1" root="${CX_JOB_ROOT:-}" safe unsafe
+ [[ "$root" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ && [ -d "$root" ] && [ ! -L "$root" ] \
+ && [ "$(stat -c '%u:%a' "$root" 2>/dev/null)" = "$(id -u):700" ] || return 0
+ safe="$root/cleanup-safe"
+ unsafe="$root/cleanup-unsafe"
+ umask 077
+ case "$state" in
+ safe) : > "$safe" && rm -f -- "$unsafe" ;;
+ unsafe) rm -f -- "$safe" && : > "$unsafe" ;;
+ *) return 1 ;;
+ esac
+}
+
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# Import remains tag-based because Enroot cannot reliably import a digest-qualified
+# Docker Hub reference non-interactively. The registry digest is resolved and checked
+# immediately before import, then recorded as verified provenance.
+# CX_IMAGE_MULTIARCH_DIGEST is the digest pinned for CX_IMAGE_MULTIARCH below;
+# cx_default_image_digest returns it for that image.
+CX_IMAGE_MULTIARCH_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly.)
+# Runtime setup verifies the image-bundled DeepEP build for the detected GPU target.
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
+
+# AMD (ROCm/CDNA): separate single-arch images bundle MoRI.
+# cx_default_image selects CX_IMAGE_AMD_MORI for mi355x* runners and
+# CX_IMAGE_AMD_MORI_MI325 for mi325x* runners; each has its own pinned digest.
+# NOTE(review): the CX_MORI_COMMIT_* values are presumably the MoRI revisions
+# expected inside each image; their consumers are not in this part of the file.
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+CX_IMAGE_AMD_MORI_DIGEST="sha256:24c3b30d64475937abbb6498e3b29528649adcb836dde7a468979f767809b0e8"
+CX_MORI_COMMIT_MI355="99bc0a3a6e7a70aacc6372cd9a4275ccfb4de567" # pragma: allowlist secret
+CX_IMAGE_AMD_MORI_MI325="rocm/sgl-dev:sglang-0.5.14-rocm720-mi35x-mori-0701"
+CX_IMAGE_AMD_MORI_MI325_DIGEST="sha256:ea42375343c2ef8f73b3bdb9e1b7b435556e3ca92aba5e3f74ada29ba217fabc"
+CX_MORI_COMMIT_MI325="bf99bdf18fc69887a346913ca01c315c2aa9bd4c" # pragma: allowlist secret
+cx_default_image() {
+ case "$1" in
+ mi325x*) echo "$CX_IMAGE_AMD_MORI_MI325" ;;
+ mi355x*) echo "$CX_IMAGE_AMD_MORI" ;;
+ b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
+ *) cx_die "no default image for runner prefix: $1" ;;
+ esac
+}
+
+# Resolve the current registry digest of a Docker Hub tag and print it on
+# stdout (no trailing newline).
+#
+# $1: image reference "[docker.io/|registry-1.docker.io/]repo[:tag]". The tag
+# defaults to "latest" and single-component repositories are placed under
+# "library/". Digest-qualified references (containing "@") and other
+# registries are fatal, as are authentication, lookup, and format failures.
+# Note that cx_die only ends the subshell when this runs inside $(...).
+cx_resolve_registry_digest() {
+ local image="$1" repository reference token digest registry
+ if [[ "$image" == *@* ]]; then
+ cx_die "digest-qualified image overrides are unsupported; configure a tag and pinned digest"
+ fi
+ # A first path component containing "." or ":" (or "localhost") is a registry
+ # host; only the Docker Hub hosts are accepted and stripped.
+ registry="${image%%/*}"
+ if [[ "$image" == */* && ( "$registry" == *.* || "$registry" == *:* || "$registry" = localhost ) ]]; then
+ case "$registry" in
+ docker.io|registry-1.docker.io) image="${image#*/}" ;;
+ *) cx_die "only Docker Hub images are supported by the registry verifier" ;;
+ esac
+ fi
+ repository="${image%:*}"
+ reference="${image##*:}"
+ # No ":" in the reference means no tag was given.
+ [ "$repository" != "$image" ] || { repository="$image"; reference=latest; }
+ [ -n "$repository" ] && [ -n "$reference" ] \
+ || cx_die "configured image reference is malformed"
+ [[ "$repository" == */* ]] || repository="library/$repository"
+ # Anonymous pull token for the repository.
+ token="$(curl -fsSLG --connect-timeout 10 --max-time 30 --retry 2 \
+ --retry-delay 1 --retry-all-errors 'https://auth.docker.io/token' \
+ --data-urlencode 'service=registry.docker.io' \
+ --data-urlencode "scope=repository:${repository}:pull" \
+ | python3 -c 'import json,sys; print(json.load(sys.stdin)["token"])')" \
+ || cx_die "cannot authenticate to the image registry"
+ # HEAD request on the manifest; the digest comes from Docker-Content-Digest.
+ digest="$(curl -fsSI --connect-timeout 10 --max-time 30 --retry 2 \
+ --retry-delay 1 --retry-all-errors \
+ -H "Authorization: Bearer $token" \
+ -H 'Accept: application/vnd.oci.image.index.v1+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.docker.distribution.manifest.v2+json' \
+ "https://registry-1.docker.io/v2/${repository}/manifests/${reference}" \
+ | tr -d '\r' | awk 'tolower($1)=="docker-content-digest:" {print $2; exit}')" \
+ || cx_die "cannot resolve the configured image digest"
+ # The pipeline status above is awk's, so this format check is what actually
+ # catches a failed or empty lookup.
+ [[ "$digest" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || cx_die "registry returned an invalid image digest"
+ printf '%s' "$digest"
+}
+
+cx_verify_registry_image() {
+ local image="$1" expected actual
+ expected="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$image")}"
+ [[ "$expected" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || cx_die "a pinned digest is required for the configured image"
+ actual="$(cx_resolve_registry_digest "$image")"
+ [ "$actual" = "$expected" ] \
+ || cx_die "configured image tag no longer matches its pinned digest"
+ export COLLECTIVEX_IMAGE="$image" COLLECTIVEX_IMAGE_DIGEST="$actual"
+ export COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1
+}
+
+cx_default_image_digest() {
+ case "$1" in
+ "$CX_IMAGE_MULTIARCH") printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST" ;;
+ "$CX_IMAGE_AMD_MORI") printf '%s' "$CX_IMAGE_AMD_MORI_DIGEST" ;;
+ "$CX_IMAGE_AMD_MORI_MI325") printf '%s' "$CX_IMAGE_AMD_MORI_MI325_DIGEST" ;;
+ esac
+}
+
+# Canonical workflow runs must not inherit benchmark controls from a persistent
+# self-hosted runner service. Manual/SSH diagnostics retain their explicit
+# overrides by leaving COLLECTIVEX_CANONICAL_GHA unset.
+#
+# cx_gha_workspace_stage_root prints GITHUB_WORKSPACE (no trailing newline)
+# once it is confirmed to be an absolute, already-canonical path to an existing
+# directory that is owned by the current user and not world-writable. Returns
+# non-zero and prints nothing otherwise, including when the variable is unset.
+cx_gha_workspace_stage_root() {
+ local workspace="${GITHUB_WORKSPACE:-}"
+ python3 - "$workspace" <<'PY'
+import os
+import stat
+import sys
+
+workspace = sys.argv[1]
+try:
+ if (
+ not os.path.isabs(workspace)
+ or os.path.realpath(workspace) != workspace
+ or not os.path.isdir(workspace)
+ ):
+ raise OSError
+ metadata = os.stat(workspace, follow_symlinks=False)
+ # GitHub runner workspaces are runner-owned but commonly writable by the
+ # trusted runner-service group. Keep the child mode 0700 and reject world write.
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) & stat.S_IWOTH:
+ raise OSError
+except OSError:
+ raise SystemExit(1)
+print(workspace, end="")
+PY
+}
+
+# Create a per-UID cache under validated cluster-local storage. Only the fixed
+# /cx-cache mount enters the container; the operator host path does not.
+#
+# $1: absolute parent directory for the cache.
+# On success exports CX_PREPARED_BACKEND_CACHE (host path of the cache
+# directory) and CX_BACKEND_CACHE_SENTINEL_SHA256 (SHA-256 of the 32-byte
+# sentinel stored inside it). Returns 1 on any failure.
+#
+# The embedded script first creates and removes a probe directory to learn the
+# owner uid the filesystem actually assigns. It then tries cache generation v3
+# and falls back to v4: the directory must have that owner and mode 0700, and
+# the sentinel must be a regular 0600 file of exactly 32 bytes. A missing
+# sentinel is published by hard-linking a freshly written temporary file, so an
+# existing sentinel is never overwritten.
+cx_prepare_backend_cache() {
+ local stage_parent="$1" cache info sentinel_sha256
+ unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_CACHE_SENTINEL_SHA256
+ info="$(python3 - "$stage_parent" <<'PY'
+import hashlib
+import os
+import secrets
+import stat
+import sys
+
+configured_parent = sys.argv[1]
+try:
+ if (
+ not os.path.isabs(configured_parent)
+ or "\n" in configured_parent
+ or "\r" in configured_parent
+ ):
+ raise OSError
+ parent = os.path.realpath(configured_parent)
+ if not os.path.isdir(parent):
+ raise OSError
+ flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+ parent_fd = os.open(parent, flags)
+ try:
+ probe_name = f".collectivex-owner-probe-{os.getpid()}-{secrets.token_hex(8)}"
+ os.mkdir(probe_name, 0o700, dir_fd=parent_fd)
+ try:
+ probe_fd = os.open(probe_name, flags, dir_fd=parent_fd)
+ try:
+ probe = os.fstat(probe_fd)
+ if stat.S_IMODE(probe.st_mode) & 0o777 != 0o700:
+ raise OSError
+ realized_owner = probe.st_uid
+ finally:
+ os.close(probe_fd)
+ finally:
+ os.rmdir(probe_name, dir_fd=parent_fd)
+ for generation in (3, 4):
+ name = f".collectivex-backend-cache-v{generation}-{os.getuid()}"
+ try:
+ os.mkdir(name, 0o700, dir_fd=parent_fd)
+ except FileExistsError:
+ pass
+ try:
+ cache_fd = os.open(name, flags, dir_fd=parent_fd)
+ try:
+ metadata = os.fstat(cache_fd)
+ if (
+ metadata.st_uid != realized_owner
+ or stat.S_IMODE(metadata.st_mode) & 0o777 != 0o700
+ ):
+ raise OSError
+ sentinel_name = ".collectivex-mount-sentinel-v1"
+ temporary_name = (
+ f"{sentinel_name}.tmp.{os.getpid()}.{secrets.token_hex(8)}"
+ )
+ create_flags = (
+ os.O_WRONLY | os.O_CREAT | os.O_EXCL
+ | getattr(os, "O_NOFOLLOW", 0)
+ )
+ payload = secrets.token_bytes(32)
+ temporary_fd = os.open(
+ temporary_name, create_flags, 0o600, dir_fd=cache_fd
+ )
+ try:
+ try:
+ view = memoryview(payload)
+ try:
+ while view:
+ written = os.write(temporary_fd, view)
+ if written <= 0:
+ raise OSError
+ view = view[written:]
+ os.fsync(temporary_fd)
+ finally:
+ view.release()
+ finally:
+ os.close(temporary_fd)
+ try:
+ os.link(
+ temporary_name,
+ sentinel_name,
+ src_dir_fd=cache_fd,
+ dst_dir_fd=cache_fd,
+ follow_symlinks=False,
+ )
+ except FileExistsError:
+ pass
+ finally:
+ try:
+ os.unlink(temporary_name, dir_fd=cache_fd)
+ except FileNotFoundError:
+ pass
+ sentinel_fd = os.open(
+ sentinel_name,
+ os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
+ dir_fd=cache_fd,
+ )
+ try:
+ sentinel = os.fstat(sentinel_fd)
+ payload = os.read(sentinel_fd, 33)
+ if (
+ not stat.S_ISREG(sentinel.st_mode)
+ or sentinel.st_uid != realized_owner
+ or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
+ or sentinel.st_size != 32
+ or len(payload) != 32
+ ):
+ raise OSError
+ sentinel_sha256 = hashlib.sha256(payload).hexdigest()
+ finally:
+ os.close(sentinel_fd)
+ finally:
+ os.close(cache_fd)
+ except OSError:
+ if generation == 3:
+ continue
+ raise
+ break
+ finally:
+ os.close(parent_fd)
+except OSError:
+ raise SystemExit(1)
+print(sentinel_sha256, os.path.join(parent, name), end="")
+PY
+)" || return 1
+ # The script prints "<sha256> <path>"; split on the first space only so a
+ # path that itself contains spaces stays intact.
+ sentinel_sha256="${info%% *}"
+ cache="${info#* }"
+ [ "$cache" != "$info" ] && [[ "$sentinel_sha256" =~ ^[0-9a-f]{64}$ ]] \
+ && [[ "$cache" = /* ]] || return 1
+ export CX_PREPARED_BACKEND_CACHE="$cache"
+ export CX_BACKEND_CACHE_SENTINEL_SHA256="$sentinel_sha256"
+}
+
+# Check that CX_BACKEND_CACHE_ROOT is the cache prepared on the host.
+#
+# Reads CX_BACKEND_CACHE_ROOT (absolute, already-canonical directory path) and
+# CX_BACKEND_CACHE_SENTINEL_SHA256 (64 lowercase hex digits). Succeeds only
+# when the root is a mode-0700 directory whose sentinel file is a regular 0600
+# file with the same owner as the root, exactly 32 bytes long, and hashing to
+# the expected value. Returns non-zero otherwise.
+cx_verify_backend_cache_mount() {
+ python3 - "${CX_BACKEND_CACHE_ROOT:-}" \
+ "${CX_BACKEND_CACHE_SENTINEL_SHA256:-}" <<'PY'
+import hashlib
+import os
+import re
+import stat
+import sys
+
+root, expected = sys.argv[1:]
+try:
+ if (
+ not os.path.isabs(root)
+ or os.path.realpath(root) != root
+ or re.fullmatch(r"[0-9a-f]{64}", expected) is None
+ ):
+ raise OSError
+ flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+ root_fd = os.open(root, flags)
+ try:
+ root_item = os.fstat(root_fd)
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ ):
+ raise OSError
+ sentinel_fd = os.open(
+ ".collectivex-mount-sentinel-v1",
+ os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
+ dir_fd=root_fd,
+ )
+ try:
+ sentinel = os.fstat(sentinel_fd)
+ payload = os.read(sentinel_fd, 33)
+ if (
+ not stat.S_ISREG(sentinel.st_mode)
+ or sentinel.st_uid != root_item.st_uid
+ or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
+ or sentinel.st_size != 32
+ or len(payload) != 32
+ or hashlib.sha256(payload).hexdigest() != expected
+ ):
+ raise OSError
+ finally:
+ os.close(sentinel_fd)
+ finally:
+ os.close(root_fd)
+except OSError:
+ raise SystemExit(1)
+PY
+}
+
+cx_git() {
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null GIT_TERMINAL_PROMPT=0 \
+ git -c credential.helper= "$@"
+}
+
+cx_git_in_tree() {
+ local directory="$1" canonical
+ shift
+ [[ "$directory" = /* ]] && [ -d "$directory" ] && [ ! -L "$directory" ] \
+ || return 1
+ [[ "$directory" != *'*'* && "$directory" != *$'\n'* && "$directory" != *$'\r'* ]] \
+ || return 1
+ canonical="$(cd -P -- "$directory" && pwd -P)" || return 1
+ cx_git -c "safe.directory=$canonical" -C "$canonical" "$@"
+}
+
+cx_fetch_revision() {
+ local repository="$1" revision="$2" destination="$3" attempt
+ for attempt in 1 2 3; do
+ rm -rf -- "$destination"
+ if cx_git init -q "$destination" \
+ && cx_git_in_tree "$destination" remote add origin "$repository" \
+ && cx_git_in_tree "$destination" fetch -q --no-tags --depth 1 origin "$revision" \
+ && cx_git_in_tree "$destination" -c advice.detachedHead=false \
+ checkout -q --detach FETCH_HEAD \
+ && [ "$(cx_git_in_tree "$destination" rev-parse HEAD)" = "$revision" ]; then
+ return 0
+ fi
+ [ "$attempt" = 3 ] || sleep $((attempt * 5))
+ done
+ return 1
+}
+
+cx_backend_source_pin() {
+ case "$1" in
+ deepep-v2)
+ printf '%s|%s|%s' \
+ "$CX_DEEPEP_V2_COMMIT" "$CX_DEEPEP_V2_TREE" "$CX_DEEPEP_V2_FMT_COMMIT"
+ ;;
+ deepep-hybrid)
+ printf '%s|%s|' "$CX_DEEPEP_HYBRID_COMMIT" "$CX_DEEPEP_HYBRID_TREE"
+ ;;
+ *) return 1 ;;
+ esac
+}
+
+cx_backend_source_path() {
+ local root="$1" backend="$2" revision tree fmt pin
+ pin="$(cx_backend_source_pin "$backend")" || return 1
+ IFS='|' read -r revision tree fmt <<< "$pin"
+ printf '%s/%s-%s' "$root" "$backend" "$revision"
+}
+
+cx_backend_source_is_valid() {
+ local backend="$1" source="$2" revision tree fmt pin status ignored
+ pin="$(cx_backend_source_pin "$backend")" || return 1
+ IFS='|' read -r revision tree fmt <<< "$pin"
+ [ -d "$source" ] && [ ! -L "$source" ] \
+ && [ "$(cx_git_in_tree "$source" rev-parse HEAD 2>/dev/null)" = "$revision" ] \
+ && [ "$(cx_git_in_tree "$source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ || return 1
+ status="$(cx_git_in_tree "$source" status --porcelain --untracked-files=all \
+ --ignore-submodules=none 2>/dev/null)" || return 1
+ [ -z "$status" ] || return 1
+ ignored="$(cx_git_in_tree "$source" ls-files --others --ignored --exclude-standard \
+ 2>/dev/null)" || return 1
+ [ -z "$ignored" ] || return 1
+ [ -z "$fmt" ] \
+ || [ "$(cx_git_in_tree "$source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt" ]
+}
+
+# cx_extension_pair_sha256 <root> <glob pattern> <glob pattern>
+# Print (without a trailing newline) one SHA-256 hex digest covering the two
+# files matched by the patterns under <root>. Each pattern must match exactly
+# one non-symlink regular file. Returns 1 on any violation or OS error.
+cx_extension_pair_sha256() {
+ python3 - "$1" "$2" "$3" <<'PY'
+import hashlib
+import os
+from pathlib import Path
+import stat
+import sys
+
+root = Path(sys.argv[1])
+digest = hashlib.sha256()
+try:
+ # The root itself must be a real directory, not a symlink to one.
+ if root.is_symlink() or not root.is_dir():
+ raise OSError
+ for pattern in sys.argv[2:]:
+ # Ambiguous (zero or several) matches and symlinked matches are rejected.
+ matches = list(root.glob(pattern))
+ if len(matches) != 1 or matches[0].is_symlink():
+ raise OSError
+ path = matches[0]
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ # Type and size are taken from the opened descriptor, not from the path.
+ metadata = os.fstat(descriptor)
+ if not stat.S_ISREG(metadata.st_mode):
+ raise OSError
+ file_digest = hashlib.sha256()
+ # closefd=False keeps the descriptor open for the explicit close below.
+ with os.fdopen(descriptor, "rb", closefd=False) as stream:
+ for chunk in iter(lambda: stream.read(1024 * 1024), b""):
+ file_digest.update(chunk)
+ # Fold in file name, byte size and content digest; name and size are
+ # each terminated by a NUL byte.
+ digest.update(path.name.encode("utf-8") + b"\0")
+ digest.update(str(metadata.st_size).encode("ascii") + b"\0")
+ digest.update(file_digest.digest())
+ finally:
+ os.close(descriptor)
+except (OSError, UnicodeError):
+ raise SystemExit(1)
+print(digest.hexdigest(), end="")
+PY
+}
+
+# Acquire source before compute allocation, preferring the verified same-run GHA seed.
+#   $1 mount source root   $2 backend name
+# The pinned checkout is published at
+# "$1/experimental/CollectiveX/.cx_sources/<backend>-<commit>". An existing
+# entry is only validated, never replaced. Otherwise the checkout is copied
+# from CX_BACKEND_SOURCE_SEED_ROOT when that is set, or fetched from GitHub
+# when it is not (the fetch path is refused when COLLECTIVEX_CANONICAL_GHA=1).
+# CX_BACKEND_SOURCE_STEP is updated before each stage so the caller can report
+# which stage failed. Returns non-zero on the first failed stage.
+_cx_prepare_backend_source() {
+ local mount_src="$1" backend="$2" root source temporary revision tree fmt pin
+ local root_mode stage_mode root_owner stage_owner
+ local seed_root="${CX_BACKEND_SOURCE_SEED_ROOT:-}" seed seed_mode
+ root="$mount_src/experimental/CollectiveX/.cx_sources"
+ CX_BACKEND_SOURCE_STEP="source mount creation"
+ # Create the source root only when nothing (not even a dangling symlink) is there.
+ if [ ! -e "$root" ] && [ ! -L "$root" ]; then
+ mkdir -m 700 -- "$root" || return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source mount ownership validation"
+ [ -d "$mount_src" ] && [ ! -L "$mount_src" ] \
+ && [ -d "$root" ] && [ ! -L "$root" ] || return 1
+ # The source root must be owned by the same uid as the stage directory.
+ stage_owner="$(stat -c '%u' "$mount_src" 2>/dev/null)" || return 1
+ root_owner="$(stat -c '%u' "$root" 2>/dev/null)" || return 1
+ [ "$root_owner" = "$stage_owner" ] || return 1
+ # Accept 0700 with or without a leading special-bits digit.
+ stage_mode="$(stat -c '%a' "$mount_src" 2>/dev/null)" || return 1
+ case "$stage_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ # Shared stage parents may retain harmless special bits despite mkdir -m.
+ CX_BACKEND_SOURCE_STEP="source mount permission inspection"
+ root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
+ case "$root_mode" in
+ 700|[1-7]700) ;;
+ *)
+ # Any other mode is normalised once and then re-checked.
+ CX_BACKEND_SOURCE_STEP="source mount permission normalization"
+ chmod 700 "$root" || return 1
+ CX_BACKEND_SOURCE_STEP="source mount permission validation"
+ root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
+ case "$root_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ ;;
+ esac
+ CX_BACKEND_SOURCE_STEP="git lookup"
+ command -v git >/dev/null || return 1
+ CX_BACKEND_SOURCE_STEP="source pin resolution"
+ source="$(cx_backend_source_path "$root" "$backend")" || return 1
+ # An entry that already exists is validated as-is; its status is the result.
+ if [ -e "$source" ] || [ -L "$source" ]; then
+ CX_BACKEND_SOURCE_STEP="existing source validation"
+ cx_backend_source_is_valid "$backend" "$source"
+ return
+ fi
+ if [ -n "$seed_root" ]; then
+ CX_BACKEND_SOURCE_STEP="source seed validation"
+ [[ "$seed_root" = /* ]] && [ -d "$seed_root" ] && [ ! -L "$seed_root" ] \
+ || return 1
+ seed_mode="$(stat -c '%a' "$seed_root" 2>/dev/null)" || return 1
+ case "$seed_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ seed="$(cx_backend_source_path "$seed_root" "$backend")" || return 1
+ cx_backend_source_is_valid "$backend" "$seed" || return 1
+ CX_BACKEND_SOURCE_STEP="source seed copy"
+ # Copy into a sibling temporary directory, re-validate the copy, then
+ # rename it into place; the temporary directory is removed on any failure.
+ temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
+ if ! cp -R -- "$seed/." "$temporary/" \
+ || ! cx_backend_source_is_valid "$backend" "$temporary" \
+ || ! mv -- "$temporary" "$source"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ return
+ fi
+ # Canonical GHA runs must use a seed; without one they fail at this step.
+ if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ]; then
+ CX_BACKEND_SOURCE_STEP="source seed validation"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source checkout creation"
+ temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
+ CX_BACKEND_SOURCE_STEP="source pin resolution"
+ pin="$(cx_backend_source_pin "$backend")" || {
+ rm -rf -- "$temporary"
+ return 1
+ }
+ IFS='|' read -r revision tree fmt <<< "$pin"
+ CX_BACKEND_SOURCE_STEP="revision fetch"
+ if ! cx_fetch_revision \
+ https://github.com/deepseek-ai/DeepEP "$revision" "$temporary"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="submodule fetch"
+ # Only pins that carry a fmt commit initialise the third-party/fmt submodule.
+ if [ -n "$fmt" ] && ! cx_git_in_tree "$temporary" \
+ -c "safe.directory=$temporary/third-party/fmt" \
+ submodule update -q --init --depth 1 third-party/fmt; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source publication validation"
+ # Publish by rename only after the fetched tree validates against the pin.
+ if ! cx_backend_source_is_valid "$backend" "$temporary" \
+ || ! mv -- "$temporary" "$source"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+}
+
+cx_prepare_backend_source() {
+ local log backend="$2" CX_BACKEND_SOURCE_STEP="initialization"
+ log="$(cx_private_log_path "backend-source-$backend")" || return 1
+ if _cx_prepare_backend_source "$@" > "$log" 2>&1; then
+ return 0
+ fi
+ printf '%s failed\n' "$CX_BACKEND_SOURCE_STEP" >> "$log"
+ cx_log "ERROR: backend-source-step=${CX_BACKEND_SOURCE_STEP// /-}"
+ cx_fail_stage backend-setup "$log"
+}
+
+cx_materialize_backend_source() {
+ local backend="$1" destination="$2" source parent temporary
+ [ -n "${CX_BACKEND_SOURCE_ROOT:-}" ] || return 1
+ source="$(cx_backend_source_path "$CX_BACKEND_SOURCE_ROOT" "$backend")" || return 1
+ cx_backend_source_is_valid "$backend" "$source" || return 1
+ parent="${destination%/*}"
+ [ "$parent" != "$destination" ] && [ -d "$parent" ] && [ ! -L "$parent" ] \
+ || return 1
+ temporary="$(mktemp -d "$parent/.collectivex-source.XXXXXX")" || return 1
+ if ! cp -R -- "$source/." "$temporary/" \
+ || ! cx_backend_source_is_valid "$backend" "$temporary"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ if ! rm -rf -- "$destination" || ! mv -- "$temporary" "$destination"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ if ! cx_backend_source_is_valid "$backend" "$destination"; then
+ rm -rf -- "$destination"
+ return 1
+ fi
+ return 0
+}
+
+# cx_lock_canonical_gha_env <runner>
+# Pin the runtime environment for a canonical GHA execution. Does nothing
+# unless COLLECTIVEX_CANONICAL_GHA=1. Otherwise it requires GitHub Actions, a
+# shard matching the runner and a complete workflow identity, clears inherited
+# overrides, assigns the registered per-runner values, and checks that
+# CX_NODES / CX_GPUS_PER_NODE match the runner's expected placement. Any
+# violation ends in cx_die.
+cx_lock_canonical_gha_env() {
+ local runner="$1" expected_nodes expected_gpn expected_world trusted_lock_dir=""
+ [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || return 0
+ [ "${GITHUB_ACTIONS:-}" = true ] \
+ || cx_die "canonical CollectiveX execution requires GitHub Actions"
+ [ -n "${CX_SHARD_FILE:-}" ] && [ "${CX_SHARD_SKU:-}" = "$runner" ] \
+ || cx_die "canonical CollectiveX execution requires a matched shard"
+ # Run id and attempt must be positive integers; the source SHA 40-64 hex digits.
+ [[ "${GITHUB_RUN_ID:-}" =~ ^[1-9][0-9]*$ \
+ && "${GITHUB_RUN_ATTEMPT:-}" =~ ^[1-9][0-9]*$ \
+ && "${COLLECTIVEX_SOURCE_SHA:-}" =~ ^[0-9a-f]{40,64}$ ]] \
+ || cx_die "canonical CollectiveX workflow identity is incomplete"
+
+ # cx_load_operator_config clears inherited values before setting this process marker.
+ # Preserve only its validated AMD lock path; direct runner-service values stay untrusted.
+ [ "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" != "$$" ] \
+ || trusted_lock_dir="${CX_LOCK_DIR:-}"
+ # Drop every inherited override; only the values assigned below survive.
+ unset CX_NCCL_HOME CX_MASTER_PORT CX_MORI_KERNEL_TYPE CX_LOCK_DIR
+ unset MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
+ unset MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
+ unset NCCL_CUMEM_ENABLE NCCL_MNNVL_ENABLE MC_FORCE_MNNVL
+ unset CX_BACKEND_CACHE_ROOT CX_BACKEND_CACHE_SENTINEL_SHA256
+ unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_SOURCE_ROOT
+
+ [ -n "${CX_SQUASH_DIR:-}" ] \
+ || cx_die "canonical CollectiveX execution requires shared container storage"
+
+ # Per-runner placement, image and transport settings.
+ case "$runner" in
+ h100-dgxc|h200-dgxc|b200-dgxc|b300)
+ expected_nodes=1; expected_gpn=8
+ CX_IMAGE="$CX_IMAGE_MULTIARCH"
+ CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
+ CX_NCCL_HOME=/usr
+ ;;
+ gb200|gb300)
+ # GB runners take the node count from the shard, limited to 1 or 2.
+ expected_nodes="${CX_NODES:-}"; expected_gpn=4
+ [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \
+ || cx_die "canonical GB execution requires one or two trays"
+ CX_IMAGE="$CX_IMAGE_MULTIARCH"
+ CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
+ CX_NCCL_HOME=/usr
+ CX_MASTER_PORT=29551
+ ;;
+ mi325x)
+ expected_nodes=1; expected_gpn=8
+ # AMD runners stage under the path returned by cx_gha_workspace_stage_root.
+ CX_STAGE_DIR="$(cx_gha_workspace_stage_root)" \
+ || cx_die "canonical AMD staging workspace is unsafe"
+ CX_IMAGE="$CX_IMAGE_AMD_MORI_MI325"
+ CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_MI325_DIGEST"
+ CX_MORI_KERNEL_TYPE=asyncll
+ MORI_COMMIT="$CX_MORI_COMMIT_MI325"
+ MORI_DISABLE_AUTO_XGMI=0
+ MORI_ENABLE_SDMA=1
+ MORI_APP_LOG_LEVEL=info
+ MORI_SHMEM_LOG_LEVEL=info
+ MORI_IO_LOG_LEVEL=info
+ ;;
+ mi355x)
+ expected_nodes=1; expected_gpn=8
+ CX_STAGE_DIR="$(cx_gha_workspace_stage_root)" \
+ || cx_die "canonical AMD staging workspace is unsafe"
+ CX_IMAGE="$CX_IMAGE_AMD_MORI"
+ CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_DIGEST"
+ CX_MORI_KERNEL_TYPE=intranode
+ MORI_COMMIT="$CX_MORI_COMMIT_MI355"
+ ;;
+ *) cx_die "canonical CollectiveX runner is not registered" ;;
+ esac
+ # Re-export the lock directory only for AMD runners that had a trusted value.
+ case "$runner:$trusted_lock_dir" in
+ mi325x:?*|mi355x:?*) export CX_LOCK_DIR="$trusted_lock_dir" ;;
+ esac
+ # When CX_STAGE_DIR is unset or empty, stage under the squash storage.
+ CX_STAGE_DIR="${CX_STAGE_DIR:-$CX_SQUASH_DIR/.stage}"
+ export CX_STAGE_DIR
+ [ "${CX_NODES:-}" = "$expected_nodes" ] \
+ && [ "${CX_GPUS_PER_NODE:-}" = "$expected_gpn" ] \
+ || cx_die "canonical CollectiveX placement differs from the shard"
+ expected_world=$((expected_nodes * expected_gpn))
+ CX_NGPUS="$expected_world"
+ CX_SEED=67
+ # AMD runners get a 1800-second run timeout; every other runner gets 900.
+ case "$runner" in mi325x|mi355x) CX_RUN_TIMEOUT=1800 ;; *) CX_RUN_TIMEOUT=900 ;; esac
+ unset CX_PUBLIC_RUNNER CX_GB_PRODUCT CX_DRYRUN CX_TIMING CX_ALLOW_MNNVL
+ unset CX_ENROOT_LOCAL_IMPORT COLLECTIVEX_IMAGE COLLECTIVEX_IMAGE_DIGEST
+ unset COLLECTIVEX_IMAGE_DIGEST_VERIFIED COLLECTIVEX_SQUASH_SHA256
+ export CX_IMAGE CX_IMAGE_DIGEST CX_NGPUS CX_SEED CX_RUN_TIMEOUT
+ # Export only the variables that the matching branch above assigned.
+ case "$runner" in
+ h100-dgxc|h200-dgxc|b200-dgxc|b300) export CX_NCCL_HOME ;;
+ gb200|gb300) export CX_NCCL_HOME CX_MASTER_PORT ;;
+ mi325x)
+ export CX_MORI_KERNEL_TYPE MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
+ export MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
+ ;;
+ mi355x) export CX_MORI_KERNEL_TYPE MORI_COMMIT ;;
+ esac
+}
+
+cx_reverify_registry_image() {
+ local image="$1" actual
+ [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ && [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" = 1 ] || return 1
+ actual="$(cx_resolve_registry_digest "$image")" || return 1
+ [ "$actual" = "$COLLECTIVEX_IMAGE_DIGEST" ] || {
+ cx_log "ERROR: configured image tag changed during container import"
+ return 1
+ }
+}
+
+cx_export_squash_identity() {
+ local image="$1" digest log
+ log="$(cx_private_log_path container-hash)"
+ digest="$(sha256sum "$image" 2>> "$log" | awk '{print $1}')"
+ [[ "$digest" =~ ^[0-9a-f]{64}$ ]] \
+ || { cx_fail_stage container-hash "$log"; return 1; }
+ export COLLECTIVEX_SQUASH_SHA256="$digest"
+}
+
+cx_squash_path() {
+ local squash_dir="$1" image="$2" key platform
+ [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || return 1
+ case "${CX_IMAGE_PLATFORM:-}" in
+ linux/amd64) platform="" ;;
+ linux/arm64) platform="_linux_arm64" ;;
+ *) return 1 ;;
+ esac
+ key="${CX_SQUASH_FORMAT_VERSION}${platform}_${COLLECTIVEX_IMAGE_DIGEST#sha256:}_$(
+ printf '%s' "$image" | sed 's#[/:@#]#_#g'
+ )"
+ printf '%s' "$squash_dir/${key}.sqsh"
+}
+
+# cx_ensure_squash <squash directory> <image> -> echoes the squash file path.
+# Imports via Enroot only if a valid squash is not already present, under a lock.
+# Fails the container-import stage and returns 1 when CX_IMAGE_PLATFORM does
+# not match `uname -m`, when directories, the lock or the import fail, or when
+# the registry digest no longer matches after the import.
+cx_ensure_squash() {
+ local squash_dir="$1" image="$2" key sq locks lock_fd log
+ local enroot_local="" import_rc=0 machine
+ log="$(cx_private_log_path container-import)"
+ machine="$(uname -m)"
+ # Only import on a host whose architecture matches the requested platform.
+ case "${CX_IMAGE_PLATFORM:-}:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) cx_fail_stage container-import "$log"; return 1 ;;
+ esac
+ mkdir -p "$squash_dir" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ sq="$(cx_squash_path "$squash_dir" "$image")" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # The lock file is named after the squash file, minus its .sqsh suffix.
+ key="${sq##*/}"
+ key="${key%.sqsh}"
+ locks="$squash_dir/.locks"
+ mkdir -p "$locks" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ { exec {lock_fd}>"$locks/${key}.lock"; } 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # Wait up to 900 seconds for the per-image lock.
+ flock -w 900 "$lock_fd" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ if unsquashfs -l "$sq" >/dev/null 2>&1; then
+ cx_log "container squash ready"
+ else
+ cx_log "importing configured container image"
+ rm -f "$sq" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # NOTE(review): the conditional that opens the local-import branch is not
+ # present in this view. The next line is a leftover fragment and the `else`
+ # a few lines below has no visible matching `if`; restore the missing lines
+ # from the authoritative file before relying on this function.
+ # > "$log" 2>&1 || import_rc=$?
+ rm -rf -- "$enroot_local" >/dev/null 2>&1 || true
+ [ "$import_rc" = 0 ] \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ else
+ SOURCE_DATE_EPOCH="$CX_SQUASH_SOURCE_DATE_EPOCH" \
+ enroot import -o "$sq" "docker://$image" > "$log" 2>&1 \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ fi
+ # A freshly imported squash must list cleanly before it is accepted.
+ unsquashfs -l "$sq" >> "$log" 2>&1 \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ fi
+ # Re-resolve the registry digest; release the lock before reporting failure.
+ if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
+ flock -u "$lock_fd" >/dev/null 2>&1 || true
+ exec {lock_fd}>&-
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ flock -u "$lock_fd"
+ exec {lock_fd}>&-
+ echo "$sq"
+}
+
+# Import on an allocated compute node so multiarch tags resolve for the target
+# architecture. The squash directory must be shared with the submit host.
+cx_ensure_squash_on_job() {
+ local job_id="$1" squash_dir="$2" image="$3" lock_dir="${4:-}" sq key lock log
+ [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
+ sq="$(cx_squash_path "$squash_dir" "$image")" || return 1
+ key="${sq##*/}"
+ key="${key%.sqsh}"
+ [ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"
+ lock="$lock_dir/${key}.lock"
+ log="$(cx_private_log_path container-import)"
+ if ! srun --jobid="$job_id" --nodes=1 --ntasks=1 --chdir=/tmp \
+ --export="$(cx_host_exports)" \
+ bash -s -- "$sq" "$lock" "$image" "$CX_SQUASH_SOURCE_DATE_EPOCH" \
+ "$CX_IMAGE_PLATFORM" \
+ > "$log" 2>&1 <<'BASH'
+set -euo pipefail
+sq="$1"; lock="$2"; image="$3"; source_date_epoch="$4"; platform="$5"
+machine="$(uname -m)"
+case "$platform:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) exit 13 ;;
+esac
+compute_home="$(mktemp -d /tmp/inferencex-collectivex-home.XXXXXX)"
+trap 'rm -rf -- "$compute_home"' EXIT
+export HOME="$compute_home" XDG_CACHE_HOME="$compute_home/.cache"
+export ENROOT_TEMP_PATH="$compute_home/enroot-tmp"
+export ENROOT_CACHE_PATH="$compute_home/enroot-cache"
+export ENROOT_DATA_PATH="$compute_home/enroot-data"
+export ENROOT_RUNTIME_PATH="$compute_home/enroot-run"
+mkdir -p "$(dirname "$sq")" "$(dirname "$lock")" \
+ "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH"
+exec 9>"$lock"
+flock -w 900 9
+if unsquashfs -l "$sq" >/dev/null 2>&1; then
+ echo 'container squash ready'
+else
+ rm -f -- "$sq"
+ SOURCE_DATE_EPOCH="$source_date_epoch" \
+ enroot import -o "$sq" "docker://$image" /dev/null 2>&1
+fi
+BASH
+ then
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ printf '%s' "$sq"
+}
+
+# cx_preflight_allocation <job id> <nodes> <mount source> <squash> [shard]
+# Run one task per node inside the allocation to confirm that the node
+# architecture matches CX_IMAGE_PLATFORM and that the staged runtime script,
+# the optional shard file and the squash file are readable there.
+# Returns 0 when every node passes; otherwise fails a stage chosen from the
+# remote exit status and returns 1.
+cx_preflight_allocation() {
+ local job_id="$1" nodes="$2" mount_src="$3" squash="$4" shard="${5:-}"
+ local log rc=0 runtime shard_path=""
+ runtime="$mount_src/experimental/CollectiveX/runtime/run_in_container.sh"
+ # An empty shard argument leaves shard_path empty, which skips that check.
+ [ -z "$shard" ] || shard_path="$mount_src/experimental/CollectiveX/$shard"
+ log="$(cx_private_log_path allocation-preflight)"
+ srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \
+ --chdir=/tmp \
+ --export="$(cx_host_exports)" bash -s -- "$runtime" "$shard_path" "$squash" \
+ "$CX_IMAGE_PLATFORM" \
+ > "$log" 2>&1 <<'BASH' || rc=$?
+set -euo pipefail
+# Exit codes: 13 platform mismatch, 10 runtime script unreadable,
+# 11 shard unreadable, 12 squash unreadable or not a valid squashfs image.
+machine="$(uname -m)"
+case "$4:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) exit 13 ;;
+esac
+test -r "$1" || exit 10
+[ -z "$2" ] || test -r "$2" || exit 11
+test -r "$3" || exit 12
+unsquashfs -s "$3" >/dev/null 2>&1 || exit 12
+BASH
+ [ "$rc" = 0 ] && return 0
+ # Map the remote status onto the stage that produced the missing input;
+ # anything else (including 13 and srun failures) counts as a launch failure.
+ case "$rc" in
+ 10|11) cx_fail_stage repository-stage "$log" ;;
+ 12) cx_fail_stage container-hash "$log" ;;
+ *) cx_fail_stage container-launch "$log" ;;
+ esac
+ return 1
+}
+
+# cx_stage_repo -> echoes the mount-source root.
+# Stage only the public benchmark tree onto compute-visible storage. Canonical
+# GHA requires an operator-configured base; manual diagnostics use an isolated
+# directory under the already-required squash storage so ignored private notes
+# are never mounted into a compute container.
+cx_stage_repo() {
+ local repo_root="$1" stage_dir="${2:-}" log tag safe_tag
+ cx_validate_shard_control "$repo_root/experimental/CollectiveX"
+ if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] \
+ && { [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; }; then
+ cx_die "canonical CollectiveX execution requires compute-visible staging"
+ fi
+ # Concurrency isolation. Under GHA the per-config concurrency fan-out runs many
+ # same-SKU dispatches at once, all staging into the SAME shared base dir; a
+ # shared dir + `rsync --delete` lets one job unlink/replace a file a peer is
+ # mid-read of -> "error reading input file: Stale file handle" on the next
+ # `srun ... run_in_container.sh`. Give each EXECUTING job its own subdir keyed on
+ # a workflow-provided execution id. Manual runs use the launcher PID.
+ tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
+ safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+ if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
+ [ -n "${CX_SQUASH_DIR:-}" ] \
+ || cx_die "manual CollectiveX staging requires CX_SQUASH_DIR"
+ stage_dir="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag"
+ else
+ stage_dir="${stage_dir%/}/job_$safe_tag"
+ fi
+ mkdir -p "${stage_dir%/*}" 2>/dev/null \
+ || cx_die "cannot create the configured stage base"
+ if [ -e "$stage_dir" ] || [ -L "$stage_dir" ]; then
+ cx_die "refusing to reuse a pre-existing execution stage"
+ fi
+ mkdir -m 700 "$stage_dir" 2>/dev/null \
+ || cx_die "cannot create the configured stage directory"
+ mkdir -m 700 "$stage_dir/experimental" 2>/dev/null \
+ || cx_die "cannot create the configured stage directory"
+ cx_log "staging CollectiveX on compute-visible storage"
+ log="$(cx_private_log_path repository-stage)"
+ if ! rsync -a --delete --delete-excluded \
+ --exclude='__pycache__/' --exclude='results/' --exclude='.cx_workloads/' \
+ --exclude='.cx_backend/' --exclude='.cx_sources/' \
+ --exclude='configs/platforms.yaml' --exclude='private-infra.md' \
+ --exclude='goal.md' --exclude='notes.md' \
+ "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" > "$log" 2>&1; then
+ rm -rf -- "$stage_dir" >/dev/null 2>&1 \
+ || cx_log "ERROR: cannot remove the incomplete execution stage"
+ cx_fail_stage repository-stage "$log" || true
+ return 1
+ fi
+ echo "$stage_dir"
+}
+
+# cx_collect_results
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+cx_collect_results() {
+ local mount_src="$1" repo_root="$2" dst log
+ local -a files
+ [ "$mount_src" = "$repo_root" ] && return 0
+ log="$(cx_private_log_path "artifact-collection-$$-${RANDOM}")"
+ dst="$repo_root/experimental/CollectiveX/results"
+ mkdir -p "$dst" 2>> "$log" \
+ || { cx_log "ERROR: cannot create checkout result directory"; return 1; }
+ shopt -s nullglob
+ files=("$mount_src/experimental/CollectiveX/results/"*.json)
+ shopt -u nullglob
+ [ "${#files[@]}" -gt 0 ] || { cx_log "ERROR: staged run produced no result JSON"; return 1; }
+ cp -- "${files[@]}" "$dst/" >> "$log" 2>&1 \
+ || { cx_log "ERROR: staged result collection failed"; return 1; }
+ cx_log "collected staged results for artifact validation"
+}
+
+cx_cleanup_stage() {
+ local mount_src="$1" repo_root="$2" base="${CX_STAGE_DIR:-}" tag safe_tag expected
+ tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
+ safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+ [ "$mount_src" != "$repo_root" ] || return 0
+ if [ -n "$base" ] && [ "$base" != "$repo_root" ]; then
+ expected="${base%/}/job_$safe_tag"
+ else
+ [ -n "${CX_SQUASH_DIR:-}" ] \
+ || { cx_log "ERROR: cannot identify the generated stage directory"; return 1; }
+ expected="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag"
+ fi
+ if [ "$mount_src" != "$expected" ] || [ "$mount_src" = / ] \
+ || { [ -n "$base" ] && [ "$mount_src" = "$base" ]; }; then
+ cx_log "ERROR: refusing to remove an unrecognized stage directory"
+ return 1
+ fi
+ rm -rf -- "$mount_src" >/dev/null 2>&1 || {
+ cx_log "ERROR: cannot remove generated stage directory"
+ return 1
+ }
+ cx_log "removed generated per-execution stage directory"
+}
+
+# Return success only when a benchmark output is a complete JSON result object.
+# Callers use this before synthesizing a terminal outcome so an emitted invalid result
+# is not shadowed by a second record for the same attempt.
+cx_has_result_doc() {
+ local path="$1"
+ python3 "$_CX_COMMON_ROOT/contracts.py" probe "$path" >/dev/null 2>&1
+}
+
+cx_result_doc_is() {
+ local path="$1" expected="$2"
+ python3 "$_CX_COMMON_ROOT/contracts.py" probe "$path" --status "$expected" \
+ >/dev/null 2>&1
+}
+
+# A rank-zero result can be written before another rank or backend teardown fails. Preserve its
+# measurements, but make the distributed command's nonzero terminal status authoritative.
+cx_demote_result_doc() {
+ local path="$1" rc="$2"
+ python3 "$_CX_COMMON_ROOT/contracts.py" demote "$path" --return-code "$rc"
+}
+
+cx_quarantine_result_doc() {
+ python3 "$_CX_COMMON_ROOT/contracts.py" quarantine-invalid "$1"
+}
+
+# cx_emit_ep_failed_case
+# Preserve failures from rack launchers that invoke run_ep.py directly and therefore cannot use
+# run_in_container.sh's emitter. Case identity is read from the exported CX_* variables.
+cx_emit_ep_failed_case() {
+ local out="$1" backend="$2" phase="$3" rc="$4"
+ local -a args=(emit-terminal --out "$out" --backend "$backend" --phase "$phase"
+ --return-code "$rc")
+ [ -z "${CX_FAILURE_MODE:-}" ] || args+=(--failure-mode "$CX_FAILURE_MODE")
+ if ! python3 "$_CX_COMMON_ROOT/contracts.py" "${args[@]}"
+ then
+ cx_log "ERROR: could not preserve terminal outcome"
+ return 1
+ fi
+}
+
+# cx_case_attempt_exists <out_dir> <case_id>
+# Return 0 when <out_dir> holds a raw or terminal attempt document that loads
+# and validates and whose identity.case_id equals <case_id>; 1 otherwise.
+# Side effects: documents that fail to load or validate, and sample files that
+# no raw attempt references, are renamed with a ".quarantine" suffix.
+cx_case_attempt_exists() {
+ local out_dir="$1" case_id="$2"
+ python3 - "$_CX_COMMON_ROOT" "$out_dir" "$case_id" <<'PY'
+import pathlib, sys
+
+# Make the sibling contracts module importable.
+sys.path.insert(0, sys.argv[1])
+import contracts
+
+sample_paths = set()
+referenced_samples = set()
+found = False
+
+def quarantine(path, document):
+ # Rename the document and, when it names a sample file by bare file name in
+ # the same directory, that sample file too.
+ sample = document.get("sample_artifact") if isinstance(document, dict) else None
+ if (
+ isinstance(sample, dict)
+ and isinstance(sample.get("path"), str)
+ and pathlib.Path(sample["path"]).name == sample["path"]
+ ):
+ sample_path = path.with_name(sample["path"])
+ if sample_path.is_file():
+ sample_path.replace(sample_path.with_name(sample_path.name + ".quarantine"))
+ if path.is_file():
+ path.replace(path.with_name(path.name + ".quarantine"))
+
+for path in pathlib.Path(sys.argv[2]).glob("*.json"):
+ document = None
+ try:
+ document = contracts.strict_load(path)
+ # Non-object documents and unknown formats are skipped, not quarantined.
+ if not isinstance(document, dict):
+ continue
+ if document.get("format") == contracts.RAW_FORMAT:
+ document = contracts.load_raw_attempt(path)
+ referenced_samples.add(path.with_name(document["sample_artifact"]["path"]))
+ elif document.get("format") == contracts.TERMINAL_FORMAT:
+ document = contracts.validate_terminal_document(document)
+ elif document.get("format") == contracts.SAMPLES_FORMAT:
+ # Sample files carry no case identity; remember them for the orphan pass.
+ contracts.validate_samples_document(document)
+ sample_paths.add(path)
+ continue
+ else:
+ continue
+ except (contracts.ContractError, OSError, ValueError):
+ quarantine(path, document)
+ continue
+ if document["identity"]["case_id"] == sys.argv[3]:
+ found = True
+# Valid sample files that no valid raw attempt points at are quarantined too.
+for orphan in sample_paths - referenced_samples:
+ quarantine(orphan, {})
+raise SystemExit(0 if found else 1)
+PY
+}
+
+# Emit one setup-failure record per requested case. Rack launchers call this when
+# backend preparation fails before rank processes can start.
+cx_emit_setup_failures() {
+ local root="$1" out_dir="$2" backend="$3" rc="$4" shard="${CX_SHARD_FILE:-}" path
+ local phase case_id suite workload required routing eplb ep hidden topk experts nodes
+ local gpn domain ladder canonical timing
+ local cases_file expected emitted=0 covered=0
+ mkdir -p "$out_dir" || return 1
+ export CX_FAILURE_MODE="${CX_FAILSAFE_MODE:-setup}" CX_ATTEMPT_ID=1
+ if [ -z "$shard" ]; then
+ local phases="${CX_PHASE:-decode}"
+ [ "$phases" = both ] && phases="decode prefill"
+ for phase in $phases; do
+ if [ -n "${CX_CASE_ID:-}" ] && cx_case_attempt_exists "$out_dir" "$CX_CASE_ID"; then
+ continue
+ fi
+ cx_emit_ep_failed_case "$out_dir/failed_${backend}_${phase}_${CX_TS:-setup}-a01.json" \
+ "$backend" "$phase" "$rc" || return 1
+ done
+ unset CX_FAILURE_MODE
+ return 0
+ fi
+ path="$shard"
+ [ -f "$path" ] || path="${root%/}/$shard"
+ [ -f "$path" ] || {
+ unset CX_FAILURE_MODE
+ cx_log "ERROR: cannot emit setup failures without shard control"
+ return 1
+ }
+ export COLLECTIVEX_CONTROL_SHA256
+ COLLECTIVEX_CONTROL_SHA256="$(sha256sum "$path" | awk '{print $1}')"
+ [[ "$COLLECTIVEX_CONTROL_SHA256" =~ ^[0-9a-f]{64}$ ]] || {
+ unset CX_FAILURE_MODE COLLECTIVEX_CONTROL_SHA256
+ cx_log "ERROR: cannot hash shard for setup-failure records"
+ return 1
+ }
+ cases_file="$(mktemp)" || return 1
+ if ! python3 - "$path" > "$cases_file" <<'PY'
+import json, sys
+
+with open(sys.argv[1]) as handle:
+ cases = json.load(handle)["cases"]
+for case in cases:
+ fields = (
+ case["phase"], case["case_id"], case["suite"], case["workload"],
+ case["required_publication"], case["routing"], "1" if case["eplb"] else "",
+ case["ep"], case["hidden"], case["topk"], case["experts"], case["nodes"],
+ case["gpus_per_node"], case["scale_up_domain"], case["ladder"],
+ "1" if case["canonical"] else "", case["timing"],
+ )
+ print("|".join(map(str, fields)))
+PY
+ then
+ rm -f "$cases_file"
+ unset CX_FAILURE_MODE
+ return 1
+ fi
+ expected="$(wc -l < "$cases_file" | tr -d ' ')"
+ [ "$expected" -gt 0 ] || { rm -f "$cases_file"; unset CX_FAILURE_MODE; return 1; }
+ while IFS='|' read -r phase case_id suite workload required routing eplb ep hidden topk experts \
+ nodes gpn domain ladder canonical timing; do
+ export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+ export CX_REQUIRED_PUBLICATION="$required" CX_ROUTING="$routing" CX_EPLB="$eplb"
+ export CX_EP="$ep" CX_NGPUS="$ep" CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts"
+ export CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn" CX_SCALE_UP_DOMAIN="$domain"
+ export CX_TOKENS_LADDER="$ladder" CX_CANONICAL="$canonical"
+ IFS=: read -r CX_ITERS CX_TRIALS CX_WARMUP <<< "$timing"
+ export CX_ITERS CX_TRIALS CX_WARMUP CX_SAMPLES_PER_POINT="$((CX_ITERS * CX_TRIALS))"
+ if cx_case_attempt_exists "$out_dir" "$case_id"; then
+ covered=$((covered + 1))
+ continue
+ fi
+ cx_emit_ep_failed_case "$out_dir/failed_${case_id}-a01.json" "$backend" "$phase" "$rc" || return 1
+ emitted=$((emitted + 1))
+ done < "$cases_file"
+ rm -f "$cases_file"
+ unset CX_FAILURE_MODE
+ [ "$((emitted + covered))" -eq "$expected" ] || {
+ cx_log "ERROR: covered $((emitted + covered))/$expected terminal cases"
+ return 1
+ }
+}
+
+# cx_launcher_cleanup <exit status>
+# EXIT-trap handler for launchers. In order: clears the trap, deletes the
+# ephemeral config file, cancels the Slurm job, records whether the allocation
+# was stopped, emits setup-failure records when the status is non-zero, copies
+# staged results back to the checkout, removes the stage directory when the
+# allocation was stopped, and finally exits with the (possibly raised) status.
+cx_launcher_cleanup() {
+ local rc="$1" source_root="${MOUNT_SRC:-${REPO_ROOT:-}}" out_dir allocation_stopped=1
+ # Clear the trap first so the exit at the end does not re-enter this handler.
+ trap - EXIT
+ if [ -n "${COLLECTIVEX_EPHEMERAL_CONFIG_PATH:-}" ]; then
+ rm -f -- "$COLLECTIVEX_EPHEMERAL_CONFIG_PATH" >/dev/null 2>&1 || true
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ fi
+ # A failed cancel, or a requested allocation whose job id never arrived,
+ # means the allocation may still be running; a zero status becomes 1.
+ if [ -n "${JOB_ID:-}" ]; then
+ if ! cx_cancel_job "$JOB_ID"; then
+ allocation_stopped=0
+ [ "$rc" != 0 ] || rc=1
+ fi
+ elif [ "${CX_ALLOCATION_REQUESTED:-0}" = 1 ]; then
+ allocation_stopped=0
+ [ "$rc" != 0 ] || rc=1
+ fi
+ if [ "$allocation_stopped" = 1 ]; then
+ cx_write_cleanup_guard safe || true
+ else
+ cx_write_cleanup_guard unsafe || true
+ fi
+ # With the allocation possibly still running, work from the checkout rather
+ # than the stage. NOTE(review): presumably so files a live job may still be
+ # using are left alone - confirm.
+ [ "$allocation_stopped" = 1 ] || source_root="${REPO_ROOT:-$source_root}"
+ if [ "$rc" != 0 ] && [ -n "${REPO_ROOT:-}" ] && [ -n "${CX_BENCH:-}" ]; then
+ cx_log "ERROR: terminal-failure-class=${CX_FAILSAFE_MODE:-setup}"
+ # Fall back to the checkout when the stage tree does not exist.
+ [ -d "$source_root/experimental/CollectiveX" ] || source_root="$REPO_ROOT"
+ out_dir="$source_root/experimental/CollectiveX/results"
+ cx_emit_setup_failures \
+ "$source_root/experimental/CollectiveX" "$out_dir" "$CX_BENCH" "$rc" || true
+ [ "$source_root" = "$REPO_ROOT" ] \
+ || cx_collect_results "$source_root" "$REPO_ROOT" || true
+ fi
+ # The stage is removed only once the allocation is known to be stopped.
+ if [ "$allocation_stopped" = 1 ] && [ -n "${REPO_ROOT:-}" ] \
+ && [ "$source_root" != "$REPO_ROOT" ]; then
+ if ! cx_cleanup_stage "$source_root" "$REPO_ROOT"; then
+ [ "$rc" != 0 ] || rc=1
+ fi
+ fi
+ # Private-log cleanup is skipped for canonical GHA runs.
+ [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || cx_cleanup_private_logs "$rc"
+ exit "$rc"
+}
+
+# Arm the EXIT trap so cx_launcher_cleanup runs with the shell's exit status.
+cx_install_launcher_fail_safe() {
+  trap -- 'cx_launcher_cleanup "$?"' EXIT
+}
diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh
new file mode 100644
index 0000000000..119efa7ffc
--- /dev/null
+++ b/experimental/CollectiveX/runtime/run_in_container.sh
@@ -0,0 +1,1002 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun` for single-node shards. The GB EP8 launcher invokes
+# run_ep.py directly across nodes. The SKU adapter handles allocation/container/transport-env;
+# this script selects one EP backend from CX_BENCH and writes result JSON under results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector: CX_BENCH = deepep | deepep-v2 | mori | uccl | nccl-ep | deepep-hybrid
+# EP knobs passed to tests/run_ep.py:
+# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep
+# CX_TOKENS_LADDER (space/comma sep; blank = phase default)
+# CX_HIDDEN CX_TOPK CX_EXPERTS CX_ROUTING CX_SEED CX_ITERS
+set -euo pipefail
+
+cd /ix/experimental/CollectiveX
+# shellcheck source=../runtime/common.sh
+source runtime/common.sh
+mkdir -p results
+cx_write_runtime_stage backend-setup || cx_die "cannot record runtime stage"
+
+: "${CX_RUNNER:?CX_RUNNER not set}"
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+CX_BENCH="${CX_BENCH:-deepep}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"
+
+cx_apply_timing_profile
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+
+# Blank ladders use the phase default in tests/run_ep.py.
+cx_ep_ladder() {
+  # Echo the caller-supplied token ladder without a trailing newline; an unset
+  # or empty CX_TOKENS_LADDER yields empty output.
+  local requested="${CX_TOKENS_LADDER:-}"
+  printf '%s' "$requested"
+}
+
+# Canonical workload staging. Every SKU/backend generates identical canonical array bytes and
+# content IDs in-container; the NPZ container bytes themselves are not an identity boundary. When CX_CANONICAL=1
+# (and CX_WORKLOAD_DIR not already provided) we generate routing traces for the run's ladder
+# into a NON-results dir (.cx_workloads/ — so the *.manifest.json files never pollute the results glob) and
+# point run_ep at it. Raw attempts remain diagnostic until the publisher validates full coverage.
+cx_stage_canonical() {
+ [ "${CX_CANONICAL:-0}" = "1" ] || return 0
+ [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0
+ local dir="$PWD/.cx_workloads"
+ local ladder; ladder="$(cx_ep_ladder)"
+ # cover both phase ladders when none is given, so either phase finds its files.
+ [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+ cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')"
+ python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \
+ --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \
+ --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \
+ || { cx_log "ERROR: canonical workload staging failed"; return 1; }
+ export CX_WORKLOAD_DIR="$dir"
+ cx_log "canonical workloads staged at $dir"
+}
+
+# run_ep_suite
+# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and
+# combine are timed separately inside it. One JSON per (backend, phase).
+# Preserve a failed case with its full scheduled identity instead of letting it vanish.
+emit_failed_case() {
+  # Arguments: backend, phase, exit status of the failed attempt.
+  # Writes the failure record next to the regular results; a failure to write
+  # it is ignored so the caller's own status handling is unaffected.
+  local backend="$1" phase="$2" status="$3"
+  local record="results/failed_${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+  cx_emit_ep_failed_case "$record" "$backend" "$phase" "$status" || true
+}
+
+# Run tests/run_ep.py once per phase for the backend given as $1.
+# CX_PHASE selects the phases (default decode; "both" expands to decode and
+# prefill). Returns 0 only when every phase produced a successful result
+# document; otherwise returns 1 after recording each failed phase.
+run_ep_suite() {
+ local backend="$1" phase phases ladder failure_kind rc=0 rc_run
+ ladder="$(cx_ep_ladder)"
+ phases="${CX_PHASE:-decode}"
+ [ "$phases" = "both" ] && phases="decode prefill"
+ # If workload staging fails, record a failed case (rc 2) for every phase.
+ if ! cx_stage_canonical; then
+ for phase in $phases; do
+ emit_failed_case "$backend" "$phase" 2
+ done
+ return 1
+ fi
+ for phase in $phases; do
+ cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'"
+ local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+ local -a EPARGS=(--backend "$backend" --phase "$phase" --tokens-ladder "$ladder"
+ --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}"
+ --routing "${CX_ROUTING:-uniform}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-8}"
+ --trials "${CX_TRIALS:-64}" --warmup "${CX_WARMUP:-32}"
+ --gpus-per-node "${CX_GPUS_PER_NODE:-0}" --scale-up-domain "${CX_SCALE_UP_DOMAIN:-0}"
+ --case-id "${CX_CASE_ID:-}" --suite "${CX_SUITE:-}" --workload-name "${CX_WORKLOAD_NAME:-}"
+ --required-publication "${CX_REQUIRED_PUBLICATION:-}"
+ --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+ --out "$out")
+ # Optional flags are appended only when their controlling variable is non-empty.
+ [ -n "${CX_EPLB:-}" ] && EPARGS+=(--eplb)
+ [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR")
+ cx_write_runtime_stage execution || cx_die "cannot record runtime stage"
+ # Capture the status through if/else so `set -e` does not abort on failure.
+ # `timeout -k 30` sends a follow-up KILL 30s after the limit expires.
+ if timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+ torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}"; then
+ rc_run=0
+ else
+ rc_run=$?
+ fi
+ # A clean exit whose document is classified "invalid" is kept as-is but
+ # fails the suite.
+ if [ "$rc_run" = 0 ] && cx_result_doc_is "$out" invalid; then
+ cx_log "WARN: $backend $phase completed with invalid semantic evidence"
+ rc=1
+ continue
+ fi
+ # A clean exit without a "success" document is treated as a failed run.
+ if [ "$rc_run" = 0 ] && ! cx_result_doc_is "$out" success; then
+ rc_run=1
+ fi
+ if [ "$rc_run" != 0 ]; then
+ failure_kind=failed
+ # 124 is timeout's own "timed out" status; 137 is 128+SIGKILL.
+ [ "$rc_run" != 124 ] && [ "$rc_run" != 137 ] || failure_kind="timed out"
+ if [ "$failure_kind" = "timed out" ]; then
+ cx_log "WARN: $backend $phase run timed out rc=$rc_run (limit=${CX_RUN_TIMEOUT:-900}s)"
+ else
+ cx_log "WARN: $backend $phase run failed rc=$rc_run"
+ fi
+ # Prefer demoting an existing document; if that is impossible (or no
+ # document exists) quarantine the path and emit a synthetic failed case.
+ # NOTE(review): the "preserved" message below is also logged when the
+ # demotion failed and the document was quarantined — confirm intended.
+ if cx_has_result_doc "$out"; then
+ cx_demote_result_doc "$out" "$rc_run" \
+ || { cx_quarantine_result_doc "$out"; emit_failed_case "$backend" "$phase" "$rc_run"; }
+ cx_log "preserved benchmark output as a failed attempt"
+ else
+ cx_quarantine_result_doc "$out"
+ emit_failed_case "$backend" "$phase" "$rc_run"
+ fi
+ rc=1
+ fi
+ done
+ return "$rc"
+}
+
+# Resolve and verify the actual CUDA target before compiling source kernels.
+cx_cuda_arch() {
+  # Map the runner name to the compute capability it must expose, ask torch
+  # for the capability of the current device, and print it (no newline) only
+  # when the two agree. Returns 1 for an unregistered runner, a failed query,
+  # or a mismatch.
+  local want got
+  case "$CX_RUNNER" in
+    h100*|h200*)
+      want="9.0"
+      ;;
+    b200*|gb200*)
+      want="10.0"
+      ;;
+    b300*|gb300*)
+      want="10.3"
+      ;;
+    *)
+      cx_log "ERROR: no CUDA target registered for $CX_RUNNER"
+      return 1
+      ;;
+  esac
+  got="$(python3 - <<'PY'
+import torch
+
+major, minor = torch.cuda.get_device_capability()
+print(f"{major}.{minor}")
+PY
+)" || return 1
+  if [ "$got" != "$want" ]; then
+    cx_log "ERROR: $CX_RUNNER expected CUDA target $want, detected $got"
+    return 1
+  fi
+  printf '%s' "$got"
+}
+
+# Print (without a trailing newline) the resolved directory nvidia/<component>
+# belonging to an installed Python distribution.
+#   $1  distribution name looked up through importlib.metadata
+#   $2  component directory name under nvidia/
+# Exits non-zero with no output when the distribution is not installed, records
+# no file under nvidia/<component>/, or the located path is not a directory.
+cx_nvidia_package_root() {
+ local package="$1" component="$2"
+ python3 - "$package" "$component" <<'PY'
+from importlib import metadata
+from pathlib import Path, PurePosixPath
+import sys
+
+package, component = sys.argv[1:]
+try:
+ distribution = metadata.distribution(package)
+ prefix = f"nvidia/{component}/"
+ entries = [str(entry).replace("\\", "/") for entry in distribution.files or ()]
+ if not any(entry.startswith(prefix) for entry in entries):
+ raise ValueError
+ root = Path(distribution.locate_file(PurePosixPath("nvidia") / component)).resolve()
+ if not root.is_dir():
+ raise ValueError
+except (metadata.PackageNotFoundError, OSError, TypeError, ValueError):
+ raise SystemExit(1)
+print(root, end="")
+PY
+}
+
+cx_prepare_cuda_cccl() {
+ local cccl="" candidate cuda_home nvcc
+ nvcc="$(command -v nvcc)" \
+ || { cx_log "ERROR: CUDA nvcc is unavailable"; return 1; }
+ nvcc="$(readlink -f -- "$nvcc")" \
+ || { cx_log "ERROR: CUDA nvcc cannot be resolved"; return 1; }
+ case "$nvcc" in
+ */bin/nvcc) cuda_home="${nvcc%/bin/nvcc}" ;;
+ *) cx_log "ERROR: CUDA nvcc has an unexpected path"; return 1 ;;
+ esac
+ [ -x "$cuda_home/bin/nvcc" ] && [ -d "$cuda_home/include" ] \
+ && [ -d "$cuda_home/lib64" ] \
+ || { cx_log "ERROR: CUDA toolkit root is incomplete"; return 1; }
+ for candidate in "$cuda_home"/targets/*/include/cccl; do
+ if [ -d "$candidate" ]; then
+ cccl="$candidate"
+ break
+ fi
+ done
+ [ -n "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; }
+ export CUDA_HOME="$cuda_home" CX_CUDA_CCCL="$cccl"
+ export CPATH="$cccl:${CPATH:-}"
+ export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}"
+}
+
+# Prepare the NVSHMEM overlay and CUDA CCCL headers used to build DeepEP V2.
+#
+# The overlay is a directory of symlinks into the packaged nvidia-nvshmem-cu12
+# tree (include/ plus every entry of lib/), with an extra unversioned
+# libnvshmem_host.so link when libnvshmem_host.so.3 exists. It is created once
+# under the DeepEP V2 cache root while holding an flock, then validated on
+# every call. On success exports NVSHMEM_DIR, the variables set by
+# cx_prepare_cuda_cccl, and LD_LIBRARY_PATH with the overlay's lib/ first.
+# Logs and returns 1 on failure.
+cx_prepare_deepep_toolchain() {
+  local packaged overlay path root temporary
+  packaged="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \
+    || { cx_log "ERROR: nvidia.nvshmem is unavailable"; return 1; }
+  root="$(cx_deepep_v2_root)" || return 1
+  overlay="$root/nvshmem-overlay"
+  # The subshell scopes the umask, the lock descriptor and the `exit` calls.
+  if ! (
+    umask 077
+    exec 8>"$root/nvshmem-overlay.lock" || exit 1
+    flock 8 || exit 1
+    if [ ! -d "$overlay" ]; then
+      # Build in a scratch directory and rename it into place when complete.
+      temporary="$root/.nvshmem-overlay.$$"
+      rm -rf "$temporary" || exit 1
+      mkdir -p "$temporary/lib" || exit 1
+      ln -s "$packaged/include" "$temporary/include" || exit 1
+      for path in "$packaged"/lib/*; do
+        ln -s "$path" "$temporary/lib/${path##*/}" || exit 1
+      done
+      [ ! -e "$packaged/lib/libnvshmem_host.so.3" ] \
+        || ln -sf "$packaged/lib/libnvshmem_host.so.3" \
+          "$temporary/lib/libnvshmem_host.so" || exit 1
+      mv "$temporary" "$overlay" || exit 1
+    fi
+    # Validate whichever overlay is now present; this is the subshell status.
+    [ ! -L "$overlay" ] \
+      && [ "$(readlink -f "$overlay/include")" = "$(readlink -f "$packaged/include")" ] \
+      && [ -e "$overlay/lib/libnvshmem_host.so" ] \
+      && [ -e "$overlay/lib/libnvshmem_device.a" ]
+  ); then
+    cx_log "ERROR: DeepEP V2 NVSHMEM overlay is invalid"
+    return 1
+  fi
+  NVSHMEM_DIR="$overlay"
+  export NVSHMEM_DIR
+  cx_prepare_cuda_cccl || return 1
+  # ld.so treats an empty LD_LIBRARY_PATH element as the current working
+  # directory, so the previous value is appended only when it is non-empty.
+  export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+}
+
+# Verify that the container's preinstalled DeepEP matches the pinned build.
+#
+# Requires COLLECTIVEX_IMAGE / COLLECTIVEX_IMAGE_DIGEST to equal the pinned
+# multi-architecture image values and COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1, and
+# the CUDA target to pass cx_cuda_arch. Selects the expected version, wheel
+# hash, RECORD hash and commit by runner (gb200/gb300 vs. everything else),
+# exports DEEPEP_COMMIT, then runs a Python check that asserts:
+#   - the distribution version and that deep_ep.Buffer imports;
+#   - deep_ep and the module defining Buffer are files listed by the distribution;
+#   - direct_url.json records the expected archive sha256;
+#   - RECORD hashes to the expected sha256, and every RECORD row that carries a
+#     digest matches the sha256 (and size, when given) of the file on disk.
+# Logs and returns 1 when any check fails.
+cx_probe_deepep() {
+ local expected_record_sha256 expected_version expected_wheel_sha256
+ if [ "${COLLECTIVEX_IMAGE:-}" != "$CX_IMAGE_MULTIARCH" ] \
+ || [ "${COLLECTIVEX_IMAGE_DIGEST:-}" != "$CX_IMAGE_MULTIARCH_DIGEST" ] \
+ || [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" != 1 ]; then
+ cx_log "ERROR: DeepEP V1 requires the exact pinned multi-architecture image"
+ return 1
+ fi
+ # Only the pass/fail result matters here; the printed target is discarded.
+ cx_cuda_arch >/dev/null || return 1
+ case "$CX_RUNNER" in
+ gb200|gb300)
+ expected_version="1.1.0+814e508"
+ expected_wheel_sha256="784dabec0877b6cf72619b7e93eda7e2f365648487bd37fc3ff6960e53669313"
+ expected_record_sha256="2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882"
+ DEEPEP_COMMIT="814e508537c6ffc775d59f6f1b9ba43f3a65968c"
+ ;;
+ *)
+ expected_version="1.2.1"
+ expected_wheel_sha256="7c02c29306ea0fe2dd474618e72e0f310f260187a9c0700a656d2f6964e8c307"
+ expected_record_sha256="6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac"
+ DEEPEP_COMMIT="9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee"
+ ;;
+ esac
+ export DEEPEP_COMMIT
+ python3 - "$expected_version" "$expected_wheel_sha256" "$expected_record_sha256" <<'PY' || {
+import base64
+import csv
+import hashlib
+import importlib.metadata as metadata
+import io
+import json
+from pathlib import Path
+import sys
+
+import deep_ep
+from deep_ep import Buffer
+
+distribution = metadata.distribution("deep_ep")
+assert distribution.version == sys.argv[1]
+assert Buffer.__name__ == "Buffer"
+recorded_files = {
+ Path(distribution.locate_file(entry)).resolve() for entry in distribution.files or ()
+}
+buffer_module = sys.modules.get(Buffer.__module__)
+assert Path(deep_ep.__file__).resolve() in recorded_files
+assert buffer_module is not None and Path(buffer_module.__file__).resolve() in recorded_files
+direct_url = json.loads(distribution.read_text("direct_url.json"))
+assert direct_url["archive_info"]["hashes"]["sha256"] == sys.argv[2]
+record_entry = next(
+ entry for entry in distribution.files or ()
+ if str(entry).endswith(".dist-info/RECORD")
+)
+record = distribution.locate_file(record_entry).read_bytes()
+assert hashlib.sha256(record).hexdigest() == sys.argv[3]
+for path, encoded_digest, size in csv.reader(io.StringIO(record.decode())):
+ if not encoded_digest:
+ continue
+ algorithm, expected = encoded_digest.split("=", 1)
+ assert algorithm == "sha256"
+ payload = distribution.locate_file(path).read_bytes()
+ observed = base64.urlsafe_b64encode(hashlib.sha256(payload).digest()).decode().rstrip("=")
+ assert observed == expected
+ assert not size or len(payload) == int(size)
+PY
+ cx_log "ERROR: container DeepEP build does not match its pinned image contract"
+ return 1
+ }
+ cx_log "DeepEP image build ready ($DEEPEP_COMMIT)"
+}
+
+# DeepEP V2 is PR #605's ElasticBuffer implementation with upstream PR #630's pure scale-up
+# initialization fix. Canonical launchers stage the pinned source and mount a private cluster-local
+# build cache at /cx-cache.
+cx_deepep_v2_root() {
+ local arch cpu base identity key image_digest
+ arch="$(cx_cuda_arch)" || return 1
+ cpu="$(uname -m)"
+ [[ "$cpu" =~ ^[A-Za-z0-9._-]+$ ]] || return 1
+ base="${CX_BACKEND_CACHE_ROOT:-}"
+ [[ "$base" = /* ]] || return 1
+ image_digest="${COLLECTIVEX_IMAGE_DIGEST:-manual-unverified}"
+ [[ "$image_digest" = manual-unverified || "$image_digest" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || return 1
+ # Bump the recipe generation whenever the build procedure changes. Benchmark-only
+ # source revisions must reuse the same immutable environment instead of leaking GBs.
+ identity="deepep-v2-cache-v2|$cpu|sm${arch/./}|image=$image_digest|recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2|$CX_DEEPEP_V2_COMMIT|$CX_DEEPEP_V2_TREE|$CX_DEEPEP_V2_FMT_COMMIT|pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0|numpy=2.2.6|torch=2.10.0+cu130|nccl=2.30.4|nvshmem=3.3.9|max-jobs=16"
+ key="$(printf '%s' "$identity" | sha256sum | awk '{print $1}')"
+ [[ "$key" =~ ^[0-9a-f]{64}$ ]] || return 1
+ printf '%s/deepep-v2-%s' "$base" "$key"
+}
+
+cx_activate_deepep_v2() {
+ local root venv stage_root
+ root="$(cx_deepep_v2_root)" || return 1
+ venv="$root/venv"
+ [ -x "$venv/bin/python" ] \
+ || { cx_log "ERROR: DeepEP V2 venv interpreter is unavailable"; return 1; }
+ export VIRTUAL_ENV="$venv"
+ export PATH="$venv/bin:${PATH#"$venv/bin:"}"
+ EP_NCCL_ROOT_DIR="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" \
+ || { cx_log "ERROR: DeepEP V2 NCCL package root is unavailable"; return 1; }
+ EP_NVSHMEM_ROOT_DIR="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \
+ || { cx_log "ERROR: DeepEP V2 NVSHMEM package root is unavailable"; return 1; }
+ export EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR
+ export LD_LIBRARY_PATH="$EP_NCCL_ROOT_DIR/lib:$EP_NVSHMEM_ROOT_DIR/lib:${LD_LIBRARY_PATH:-}"
+ case "${CX_BACKEND_SOURCE_ROOT:-}" in
+ /*/.cx_sources) stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}" ;;
+ *) cx_log "ERROR: DeepEP V2 job-local source root is unavailable"; return 1 ;;
+ esac
+ [ -d "$stage_root" ] && [ ! -L "$stage_root" ] \
+ || { cx_log "ERROR: DeepEP V2 job-local stage is invalid"; return 1; }
+ # JIT CUBINs are evidence from this shard, not part of the persistent AOT environment.
+ # Keeping them on the isolated staged tree prevents a prior driver/topology attempt
+ # from seeding a later run; all ranks and cases in this shard still share one cold build.
+ export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"
+ export EP_REUSE_NCCL_COMM=1
+ export DEEPEP_V2_PR=605 DEEPEP_V2_FIX_PR=630
+ DEEPEP_V2_COMMIT="$CX_DEEPEP_V2_COMMIT"
+ DEEPEP_V2_TREE="$CX_DEEPEP_V2_TREE"
+ DEEPEP_V2_FMT_COMMIT="$CX_DEEPEP_V2_FMT_COMMIT"
+ export DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
+ [ ! -L "$stage_root/.cx_backend" ] && [ ! -L "$EP_JIT_CACHE_DIR" ] \
+ || { cx_log "ERROR: DeepEP V2 JIT cache path is unsafe"; return 1; }
+ if ! mkdir -p "$EP_JIT_CACHE_DIR" \
+ || ! chmod 700 "$stage_root/.cx_backend" "$EP_JIT_CACHE_DIR"; then
+ cx_log "ERROR: DeepEP V2 JIT cache is unavailable"
+ return 1
+ fi
+ unset EP_SUPPRESS_NCCL_CHECK
+}
+
+cx_enable_deepep_v2_jit_reproducibility() {
+ local seed="collectivex-deepep-v2-fa8a9b1" cccl
+ [ -n "${CUDA_HOME:-}" ] \
+ || { cx_log "ERROR: active CUDA toolkit is unavailable"; return 1; }
+ cccl="${CX_CUDA_CCCL:-}"
+ case "$cccl" in
+ "$CUDA_HOME"/targets/*/include/cccl) ;;
+ *) cx_log "ERROR: CUDA CCCL headers differ from the active toolkit"; return 1 ;;
+ esac
+ [ -d "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; }
+ CPATH="$cccl"
+ NVCC_PREPEND_FLAGS="--frandom-seed=$seed -I$cccl"
+ DEEPEP_V2_JIT_RANDOM_SEED="$seed"
+ EP_JIT_DUMP_SASS=1
+ unset EP_JIT_DEBUG EP_JIT_DUMP_ASM EP_JIT_DUMP_PTX EP_JIT_WITH_LINEINFO
+ unset EP_JIT_PTXAS_VERBOSE EP_JIT_PRINT_COMPILER_COMMAND EP_JIT_NVCC_COMPILER
+ unset EP_JIT_CPP_STANDARD EP_JIT_PTXAS_CHECK EP_GIN_GDAKI_DEBUG EP_NUM_TOPK_IDX_BITS
+ export CPATH DEEPEP_V2_JIT_RANDOM_SEED EP_JIT_DUMP_SASS NVCC_PREPEND_FLAGS
+}
+
+# Assert that the active Python environment is the expected DeepEP V2 runtime.
+# The embedded script checks, in order:
+#   - torch 2.10.0+cu130, nvidia-nccl-cu13 2.30.4, nvidia-nvshmem-cu12 3.3.9,
+#     numpy 2.2.6;
+#   - deep_ep.__version__ 2.0.0, distribution version 2.0.0+fa8a9b1, and that
+#     deep_ep.ElasticBuffer is a class;
+#   - EP_SUPPRESS_NCCL_CHECK is not set;
+#   - exactly one libnccl.so file is mapped into the process, and its
+#     ncclGetVersion reports 23004.
+# The function's status is the Python interpreter's exit status.
+cx_probe_deepep_v2() {
+ python3 - <<'PY'
+import ctypes
+import importlib.metadata as metadata
+import inspect
+import os
+
+import torch
+
+assert torch.__version__ == "2.10.0+cu130", torch.__version__
+assert metadata.version("nvidia-nccl-cu13") == "2.30.4"
+assert metadata.version("nvidia-nvshmem-cu12") == "3.3.9"
+assert metadata.version("numpy") == "2.2.6"
+
+import deep_ep
+assert deep_ep.__version__ == "2.0.0", deep_ep.__version__
+assert metadata.version("deep_ep") == "2.0.0+fa8a9b1"
+assert inspect.isclass(deep_ep.ElasticBuffer)
+assert deep_ep.ElasticBuffer.__name__ == "ElasticBuffer"
+assert os.environ.get("EP_SUPPRESS_NCCL_CHECK") is None
+with open("/proc/self/maps", encoding="utf-8") as handle:
+ loaded_nccl = {
+ os.path.realpath(line.rstrip().split()[-1])
+ for line in handle
+ if "libnccl.so" in line and os.path.isfile(line.rstrip().split()[-1])
+ }
+assert len(loaded_nccl) == 1
+runtime_version = ctypes.c_int()
+assert ctypes.CDLL(loaded_nccl.pop()).ncclGetVersion(ctypes.byref(runtime_version)) == 0
+assert runtime_version.value == 23004, runtime_version.value
+PY
+}
+
+# Print (without a trailing newline) one sha256 over the installed deep_ep
+# distribution inside $VIRTUAL_ENV.
+#
+# Files are visited in sorted order of their recorded paths. For each file the
+# digest absorbs the recorded relative path, the file size and the file's own
+# sha256, separated by NUL bytes. The script exits 1 when:
+#   - the distribution records no files, or VIRTUAL_ENV is a symlink / not a dir;
+#   - a recorded path is absolute, contains "..", or lies outside deep_ep/ and
+#     deep_ep-*.dist-info/;
+#   - a file or its resolved target is outside the venv, any parent directory
+#     up to the venv is a symlink, or the entry is not a regular file;
+#   - the opened descriptor is not the same inode that was lstat'ed;
+#   - no deep_ep/*.so file was seen.
+cx_deepep_v2_content_sha256() {
+ python3 - <<'PY'
+import hashlib
+from importlib import metadata
+import os
+from pathlib import Path, PurePosixPath
+import stat
+
+distribution = metadata.distribution("deep_ep")
+entries = sorted(distribution.files or (), key=lambda entry: entry.as_posix())
+if not entries:
+ raise SystemExit(1)
+venv_path = Path(os.environ["VIRTUAL_ENV"]).absolute()
+if venv_path.is_symlink() or not venv_path.is_dir():
+ raise SystemExit(1)
+venv = venv_path.resolve(strict=True)
+digest = hashlib.sha256()
+extension = False
+for entry in entries:
+ relative = PurePosixPath(entry.as_posix())
+ if (
+ relative.is_absolute()
+ or ".." in relative.parts
+ or not relative.parts
+ or not (
+ relative.parts[0] == "deep_ep"
+ or relative.parts[0].startswith("deep_ep-")
+ and relative.parts[0].endswith(".dist-info")
+ )
+ ):
+ raise SystemExit(1)
+ path = Path(distribution.locate_file(entry)).absolute()
+ resolved = path.resolve(strict=True)
+ try:
+ path.relative_to(venv_path)
+ resolved.relative_to(venv)
+ except ValueError:
+ raise SystemExit(1)
+ parent = path.parent
+ while parent != venv_path:
+ if parent.is_symlink():
+ raise SystemExit(1)
+ parent = parent.parent
+ item = os.lstat(path)
+ if not stat.S_ISREG(item.st_mode):
+ raise SystemExit(1)
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (item.st_dev, item.st_ino):
+ raise SystemExit(1)
+ file_digest = hashlib.sha256()
+ while chunk := os.read(descriptor, 1024 * 1024):
+ file_digest.update(chunk)
+ finally:
+ os.close(descriptor)
+ name = relative.as_posix()
+ extension |= name.startswith("deep_ep/") and name.endswith(".so")
+ digest.update(name.encode())
+ digest.update(b"\0")
+ digest.update(str(item.st_size).encode())
+ digest.update(b"\0")
+ digest.update(file_digest.digest())
+if not extension:
+ raise SystemExit(1)
+print(digest.hexdigest(), end="")
+PY
+}
+
+# Validate the DeepEP V2 completion marker and print the content hash it records.
+#   $1 cache root   $2 marker path   $3 revision   $4 tree
+#   $5 fmt revision $6 cache key
+# The script exits 1 unless all of the following hold:
+#   - the root is a directory with mode exactly 0700;
+#   - the marker is a regular file owned by the root's owner, mode exactly
+#     0600, at most 1024 bytes, and the opened descriptor is the lstat'ed inode;
+#   - root/source and root/venv are directories owned by the root's owner and
+#     not writable by group or others;
+#   - the marker holds exactly five ASCII lines: the four expected values from
+#     $3..$6 followed by a 64-character lowercase hex digest.
+# On success that fifth line is printed without a trailing newline.
+cx_deepep_v2_marker_content_sha256() {
+ local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6"
+ python3 - "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" <<'PY'
+import os
+import re
+import stat
+import sys
+
+root, marker, revision, tree, fmt_revision, cache_key = sys.argv[1:]
+try:
+ root_item = os.lstat(root)
+ marker_item = os.lstat(marker)
+ children = [os.lstat(os.path.join(root, name)) for name in ("source", "venv")]
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ or not stat.S_ISREG(marker_item.st_mode)
+ or marker_item.st_uid != root_item.st_uid
+ or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600
+ or marker_item.st_size > 1024
+ or any(
+ not stat.S_ISDIR(child.st_mode)
+ or child.st_uid != root_item.st_uid
+ or stat.S_IMODE(child.st_mode) & 0o022
+ for child in children
+ )
+ ):
+ raise OSError
+ descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino):
+ raise OSError
+ payload = os.read(descriptor, 1025)
+ finally:
+ os.close(descriptor)
+ lines = payload.decode("ascii").splitlines()
+ if lines[:4] != [revision, tree, fmt_revision, cache_key] or len(lines) != 5:
+ raise ValueError
+ if not re.fullmatch(r"[0-9a-f]{64}", lines[4]):
+ raise ValueError
+except (OSError, UnicodeError, ValueError):
+ raise SystemExit(1)
+print(lines[4], end="")
+PY
+}
+
+cx_deepep_v2_cache_is_valid() {
+ local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6"
+ local expected_content actual_content
+ expected_content="$(
+ cx_deepep_v2_marker_content_sha256 \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key"
+ )" || return 1
+ [ -d "$root/source" ] && [ ! -L "$root/source" ] \
+ && [ "$(cx_git_in_tree "$root/source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ && [ "$(cx_git_in_tree "$root/source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt_revision" ] \
+ || return 1
+ cx_activate_deepep_v2 || return 1
+ actual_content="$(cx_deepep_v2_content_sha256)" || return 1
+ [ "$actual_content" = "$expected_content" ]
+}
+
+# Build (or reuse) the shared DeepEP V2 environment and activate it.
+#
+# Work on the cache happens in a subshell holding an exclusive flock on
+# "<cache root>.lock". Inside it, a cache with a completion marker must
+# validate and is never reset; a cache without a marker is deleted and rebuilt:
+# venv, pinned build tools, torch, NCCL override, NVSHMEM overlay, staged
+# source, DeepEP install, runtime probe, and finally the marker. After the
+# subshell, the environment is activated in the calling shell, the JIT
+# settings are pinned, and the runtime is probed again. Logs and returns 1 on
+# any failure.
+cx_build_deepep_v2() {
+ local root venv source marker marker_tmp lock_path arch cache_key cache_ready content_sha256
+ # NOTE(review): these pins are literals here, while cx_deepep_v2_root and
+ # cx_activate_deepep_v2 read CX_DEEPEP_V2_COMMIT/_TREE/_FMT_COMMIT — confirm
+ # the two sets are kept in sync.
+ local revision="fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+ local tree="29809e75c5874e6609dac4804e7b651d5226959f"
+ local fmt_revision="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+ cx_verify_backend_cache_mount \
+ || { cx_log "ERROR: DeepEP V2 cache mount identity validation failed"; return 1; }
+ arch="$(cx_cuda_arch)" || return 1
+ root="$(cx_deepep_v2_root)" || return 1
+ # The cache key is the hash suffix of the root directory name.
+ cache_key="${root##*/deepep-v2-}"
+ [[ "$cache_key" =~ ^[0-9a-f]{64}$ ]] || return 1
+ venv="$root/venv"; source="$root/source"; marker="$root/.collectivex-complete"
+ lock_path="${root}.lock"
+ command -v flock >/dev/null || { cx_log "ERROR: flock is required for DeepEP V2"; return 1; }
+ mkdir -p "${root%/*}" || return 1
+ cx_log "DeepEP V2: preparing PR #605 implementation with upstream PR #630 fix ($revision)"
+ # The subshell scopes the lock descriptor, the `exit` calls, and every
+ # environment change made while building or validating.
+ if ! (
+ [ ! -L "$lock_path" ] \
+ || { cx_log "ERROR: DeepEP V2 cache lock is unsafe"; exit 1; }
+ (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-create failed"; exit 1; }
+ exec 9<>"$lock_path" \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-open failed"; exit 1; }
+ flock 9 \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-acquire failed"; exit 1; }
+ cache_ready=0
+ # A marker means the cache was published: it must validate, and a cache
+ # that fails validation is reported rather than deleted.
+ if [ -e "$marker" ] || [ -L "$marker" ]; then
+ if (
+ cx_deepep_v2_cache_is_valid \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key"
+ ); then
+ cache_ready=1
+ else
+ cx_log "ERROR: published DeepEP V2 cache failed integrity validation; refusing reset"
+ exit 1
+ fi
+ fi
+ if [ "$cache_ready" != 1 ]; then
+ # No marker: whatever exists at the root is an unfinished build; discard it.
+ if [ -e "$root" ] || [ -L "$root" ]; then
+ rm -rf "$root" \
+ || { cx_log "ERROR: incomplete DeepEP V2 cache-reset failed"; exit 1; }
+ fi
+ mkdir -m 700 "$root" \
+ || { cx_log "ERROR: DeepEP V2 cache-create failed"; exit 1; }
+ python3 -m venv "$venv" \
+ || { cx_log "ERROR: DeepEP V2 venv creation failed"; exit 1; }
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ "pip==26.1.2" "setuptools==82.0.1" "wheel==0.47.0" "ninja==1.13.0" \
+ "numpy==2.2.6" "nvidia-nvshmem-cu12==3.3.9" >&2 2>&1 \
+ || { cx_log "ERROR: DeepEP V2 build-tool installation failed"; exit 1; }
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ --index-url https://download.pytorch.org/whl/cu130 \
+ --extra-index-url https://pypi.org/simple "torch==2.10.0" >&2 2>&1 \
+ || { cx_log "ERROR: torch 2.10.0+cu130 installation failed"; exit 1; }
+ # Torch pins NCCL 2.28.9; the PR #605 ElasticBuffer implementation requires 2.30.4.
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ --force-reinstall --no-deps "nvidia-nccl-cu13==2.30.4" >&2 2>&1 \
+ || { cx_log "ERROR: NCCL 2.30.4 installation failed"; exit 1; }
+ cx_activate_deepep_v2 \
+ || { cx_log "ERROR: DeepEP V2 environment activation failed"; exit 1; }
+ cx_prepare_deepep_toolchain \
+ || { cx_log "ERROR: DeepEP V2 toolchain preparation failed"; exit 1; }
+ # Point the build at the NVSHMEM overlay rather than the raw package root.
+ EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"
+ export EP_NVSHMEM_ROOT_DIR
+ cx_materialize_backend_source deepep-v2 "$source" \
+ || { cx_log "ERROR: DeepEP V2 staged source is invalid"; exit 1; }
+ # SOURCE_DATE_EPOCH is taken from the source checkout's HEAD commit time.
+ (cd "$source" && SOURCE_DATE_EPOCH="$(cx_git_in_tree "$source" show -s --format=%ct HEAD)" \
+ TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+ python3 -m pip install -q --no-build-isolation --no-deps --force-reinstall .) >&2 2>&1 \
+ || { cx_log "ERROR: DeepEP V2 build failed"; exit 1; }
+ cx_probe_deepep_v2 \
+ || { cx_log "ERROR: DeepEP V2 ElasticBuffer/runtime probe failed"; exit 1; }
+ content_sha256="$(cx_deepep_v2_content_sha256)" \
+ || { cx_log "ERROR: DeepEP V2 installed-content hashing failed"; exit 1; }
+ # Write the marker to a temp file in the cache root and rename it into
+ # place last, so a marker only exists once everything above succeeded.
+ marker_tmp="$(mktemp "$root/.collectivex-complete.tmp.XXXXXX")" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-create failed"; exit 1; }
+ chmod 600 "$marker_tmp" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-permission failed"; exit 1; }
+ printf '%s\n%s\n%s\n%s\n%s\n' \
+ "$revision" "$tree" "$fmt_revision" "$cache_key" "$content_sha256" > "$marker_tmp" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-write failed"; exit 1; }
+ mv -f -- "$marker_tmp" "$marker" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-publish failed"; exit 1; }
+ fi
+ # Validate again in both paths; this is the subshell's final status.
+ cx_deepep_v2_cache_is_valid \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" \
+ || { cx_log "ERROR: DeepEP V2 cache validation failed"; exit 1; }
+ ); then
+ cx_log "ERROR: shared DeepEP V2 environment is incomplete"
+ return 1
+ fi
+ # Environment changes made inside the subshell are gone; redo them here.
+ cx_activate_deepep_v2 || return 1
+ cx_prepare_deepep_toolchain || return 1
+ cx_enable_deepep_v2_jit_reproducibility || return 1
+ EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"
+ export EP_NVSHMEM_ROOT_DIR
+ cx_probe_deepep_v2 || { cx_log "ERROR: DeepEP V2 shared runtime probe failed"; return 1; }
+ cx_log "DeepEP V2 ready ($DEEPEP_V2_COMMIT, ElasticBuffer, NCCL Device API; LSA/Gin selected by adapter)"
+}
+
+# Build the pinned DeepEP `hybrid-ep` implementation for one NVLink/MNNVL domain. CUDA 13 moved
+# libcudacxx headers under cccl, but this intradomain path does not use the separate NVSHMEM
+# toolchain required by DeepEP V2.
+cx_deepep_hybrid_marker_content_sha256() {
+ # Validate the hybrid-ep completion marker and print the content hash it records.
+ #   $1 build root   $2 marker path   $3 revision   $4 tree
+ # The script exits 1 unless the root is a directory with mode exactly 0700,
+ # the marker is a regular file owned by the root's owner with mode exactly
+ # 0600 and at most 512 bytes, the opened descriptor is the lstat'ed inode, and
+ # the marker holds exactly three ASCII lines: revision, tree, and a
+ # 64-character lowercase hex digest. That digest is printed without a newline.
+ python3 - "$1" "$2" "$3" "$4" <<'PY'
+import os
+import re
+import stat
+import sys
+
+root, marker, revision, tree = sys.argv[1:]
+try:
+ root_item = os.lstat(root)
+ marker_item = os.lstat(marker)
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ or not stat.S_ISREG(marker_item.st_mode)
+ or marker_item.st_uid != root_item.st_uid
+ or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600
+ or marker_item.st_size > 512
+ ):
+ raise OSError
+ descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino):
+ raise OSError
+ payload = os.read(descriptor, 513)
+ finally:
+ os.close(descriptor)
+ lines = payload.decode("ascii").splitlines()
+ if len(lines) != 3 or lines[:2] != [revision, tree]:
+ raise ValueError
+ if not re.fullmatch(r"[0-9a-f]{64}", lines[2]):
+ raise ValueError
+except (OSError, UnicodeError, ValueError):
+ raise SystemExit(1)
+print(lines[2], end="")
+PY
+}
+
+cx_deepep_hybrid_cache_is_valid() {
+ local root="$1" marker="$2" revision="$3" tree="$4" expected actual status extra
+ expected="$(cx_deepep_hybrid_marker_content_sha256 \
+ "$root" "$marker" "$revision" "$tree")" || return 1
+ [ "$(cx_git_in_tree "$root" rev-parse HEAD 2>/dev/null)" = "$revision" ] \
+ && [ "$(cx_git_in_tree "$root" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ || return 1
+ status="$(cx_git_in_tree "$root" status --porcelain --untracked-files=no \
+ --ignore-submodules=none 2>/dev/null)" || return 1
+ [ -z "$status" ] || return 1
+ extra="$(cx_git_in_tree "$root" ls-files --others --exclude-standard -- \
+ 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1
+ [ -z "$extra" ] || return 1
+ extra="$(cx_git_in_tree "$root" ls-files --others --ignored --exclude-standard -- \
+ 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1
+ [ -z "$extra" ] || return 1
+ actual="$(cx_extension_pair_sha256 "$root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" \
+ || return 1
+ [ "$actual" = "$expected" ]
+}
+
+# Build (or reuse) the pinned hybrid-ep DeepEP tree and put it on PYTHONPATH.
+#
+# The build lives under $PWD/.cx_backend/deepep-hybrid-<arch without the dot>
+# and is guarded by an flock on "<build root>.lock". A tree with a completion
+# marker must validate or the function fails; a tree without one is
+# materialized, built in place with `setup.py build_ext --inplace`, hashed, and
+# then marked complete. Afterwards the tree is prepended to PYTHONPATH and the
+# import of deep_ep.HybridEPBuffer is verified. Logs and returns 1 on failure.
+cx_build_deepep_hybrid() {
+ local arch revision="$CX_DEEPEP_HYBRID_COMMIT" tree="$CX_DEEPEP_HYBRID_TREE"
+ local build_root marker marker_tmp lock_path content_sha256 cache_ready
+ export DEEPEP_COMMIT="$revision" DEEPEP_TREE="$tree"
+ arch="$(cx_cuda_arch)" || return 1
+ build_root="$PWD/.cx_backend/deepep-hybrid-${arch/./}"
+ marker="$build_root/.collectivex-complete"
+ lock_path="${build_root}.lock"
+ cx_log "DeepEP hybrid-ep: building $revision for CUDA target $arch"
+ # Clear these so values inherited from the environment cannot affect the build.
+ unset NVSHMEM_DIR HYBRID_EP_MULTINODE USE_NIXL
+ cx_prepare_cuda_cccl || return 1
+ command -v flock >/dev/null || { cx_log "ERROR: flock is required for hybrid-ep"; return 1; }
+ mkdir -p "$PWD/.cx_backend" || return 1
+ # The subshell scopes the lock descriptor and the `exit` calls.
+ if ! (
+ [ ! -L "$lock_path" ] || exit 1
+ (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" || exit 1
+ exec 9<>"$lock_path" || exit 1
+ flock 9 || exit 1
+ cache_ready=0
+ # A marker means the build was published: it must validate, otherwise fail
+ # without rebuilding.
+ if [ -e "$marker" ] || [ -L "$marker" ]; then
+ cx_deepep_hybrid_cache_is_valid "$build_root" "$marker" "$revision" "$tree" \
+ || exit 1
+ cache_ready=1
+ fi
+ if [ "$cache_ready" != 1 ]; then
+ cx_materialize_backend_source deepep-hybrid "$build_root" \
+ || { cx_log "ERROR: hybrid-ep staged source is invalid"; exit 1; }
+ # SOURCE_DATE_EPOCH is taken from the checkout's HEAD commit time.
+ (cd "$build_root" && \
+ SOURCE_DATE_EPOCH="$(cx_git_in_tree "$build_root" show -s --format=%ct HEAD)" \
+ TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+ python3 setup.py build_ext --inplace) >&2 2>&1 \
+ || { cx_log "ERROR: hybrid-ep build failed"; exit 1; }
+ content_sha256="$(cx_extension_pair_sha256 \
+ "$build_root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" || exit 1
+ # Write the marker to a temp file and rename it into place as the last step.
+ marker_tmp="$(mktemp "$build_root/.collectivex-complete.tmp.XXXXXX")" || exit 1
+ chmod 600 "$marker_tmp" || exit 1
+ printf '%s\n%s\n%s\n' "$revision" "$tree" "$content_sha256" > "$marker_tmp" \
+ || exit 1
+ mv -f -- "$marker_tmp" "$marker" || exit 1
+ fi
+ # Validate again in both paths; this is the subshell's final status.
+ cx_deepep_hybrid_cache_is_valid "$build_root" "$marker" "$revision" "$tree"
+ ); then
+ cx_log "ERROR: shared hybrid-ep build is incomplete"
+ return 1
+ fi
+ export PYTHONPATH="$build_root:${PYTHONPATH:-}"
+ python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+ || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; }
+ cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT)"
+}
+
+# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13
+# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668
+# images need PIP_BREAK_SYSTEM_PACKAGES. Returns 0 when ready; logs an ERROR and returns 1 otherwise.
+cx_build_uccl() {
+  if [ -f /tmp/.cx_built_uccl ]; then
+    cx_log "UCCL EP already prepared this allocation — skip rebuild"
+    python3 -c "import torch; from uccl_deepep import Buffer" 2>/dev/null || { cx_log "ERROR: cached uccl_deepep wrapper import failed"; return 1; }
+    return 0
+  fi
+  local version="0.1.1" tag="v0.1.1" cu12lib installed
+  local wheel_sha256="390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec"
+  local wrapper_commit="73ee4f12ba71717d6de34ba06806e1baaabe3f42"  # commit $tag must resolve to; checked after clone, exported below
+  cx_log "UCCL EP: installing uccl==$version + cu12 runtime shim"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q --no-deps "sortedcontainers==2.4.0" "intervaltree==3.1.0" >&2 2>&1 \
+    || { cx_log "ERROR: UCCL support dependency installation failed"; return 1; }
+  printf 'uccl==%s --hash=sha256:%s\n' "$version" "$wheel_sha256" \
+    | pip install -q --no-deps --only-binary=:all: --require-hashes -r /dev/stdin >&2 2>&1 \
+    || { cx_log "ERROR: pip install uccl==$version failed"; return 1; }
+  pip install -q --no-deps "nvidia-cuda-runtime-cu12==12.9.79" >&2 2>&1 \
+    || { cx_log "ERROR: CUDA 12 runtime shim install failed"; return 1; }
+  cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+  [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"
+  installed="$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))')" \
+    || { cx_log "ERROR: cannot read installed UCCL version"; return 1; }
+  [ "$installed" = "$version" ] \
+    || { cx_log "ERROR: expected UCCL $version, installed $installed"; return 1; }
+  UCCL_COMMIT="pkg-$installed"
+  export UCCL_COMMIT
+  # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+  # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+  python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+    || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+  # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+  # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+  # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+  # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+  rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg
+  # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+  # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+  # mismatches signatures. Match them.
+  if git clone --depth 1 --branch "$tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+    && [ "$(git -C /tmp/uccl_src rev-parse HEAD)" = "$wrapper_commit" ] \
+    && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then
+    mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+    cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null
+    export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+    python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2 \
+      || { cx_log "ERROR: uccl_deepep wrapper import failed"; return 1; }
+    export CX_UCCL_WRAPPER=1
+    export UCCL_WRAPPER_COMMIT="$wrapper_commit"
+  else
+    cx_log "ERROR: uccl deep_ep_wrapper not available"
+    return 1
+  fi
+  : > /tmp/.cx_built_uccl
+  cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+# Rack build and rank steps may enter different container instances. Snapshot this node's
+# loader/import path and build identity into a per-node file on the shared staged mount.
+cx_persist_backend_env() {
+  local env_dir="$PWD/.cx_backend/env" node="${SLURM_NODEID:-0}" staged var
+  # Candidate variables; only those currently declared are written to the snapshot.
+  local -a tracked=(
+    PATH VIRTUAL_ENV LD_LIBRARY_PATH PYTHONPATH CUDA_HOME CPATH NVCC_PREPEND_FLAGS
+    NVSHMEM_DIR DEEPEP_COMMIT DEEPEP_TREE
+    EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR EP_JIT_CACHE_DIR EP_REUSE_NCCL_COMM EP_JIT_DUMP_SASS
+    DEEPEP_V2_PR DEEPEP_V2_FIX_PR DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
+    DEEPEP_V2_JIT_RANDOM_SEED UCCL_COMMIT UCCL_WRAPPER_COMMIT CX_UCCL_WRAPPER
+  )
+  # The node id becomes part of a file name, so accept decimal digits only.
+  case "$node" in ''|*[!0-9]*) return 1 ;; esac
+  mkdir -p "$env_dir" && chmod 700 "$env_dir" || return 1
+  staged="$(mktemp "$env_dir/.node-${node}.XXXXXX")" || return 1
+  if ! chmod 600 "$staged"; then rm -f "$staged"; return 1; fi
+  for var in "${tracked[@]}"; do
+    declare -p "$var" >/dev/null 2>&1 || continue
+    # %q shell-quotes the value so the generated line can be sourced back verbatim.
+    if ! printf 'export %s=%q\n' "$var" "${!var}" >> "$staged"; then rm -f "$staged"; return 1; fi
+  done
+  # Rename into place last so the final path only ever holds a fully written file.
+  mv -f -- "$staged" "$env_dir/node-${node}.sh" || { rm -f "$staged"; return 1; }
+}
+
+# Prepare and probe one backend without running a benchmark. The same hook is used
+# by normal in-container runs and by rack launchers' persistent build-only step.
+cx_prepare_backend() {
+ local backend="${1:-}"
+ case "$backend" in
+ deepep)
+ cx_probe_deepep || return 1
+ ;;
+ deepep-v2)
+ cx_build_deepep_v2 || return 1
+ ;;
+ deepep-hybrid)
+ cx_build_deepep_hybrid || return 1
+ ;;
+ uccl)
+ cx_build_uccl || return 1
+ ;;
+ mori)
+ python3 -c "import mori" 2>/dev/null || return 1
+ ;;
+ nccl-ep)
+ ;;
+ *)
+ cx_log "ERROR: unknown backend preparation request"
+ return 1
+ ;;
+ esac
+}
+
+prepare_backend_or_record() {  # $1 = backend name; returns 0 when prepared, otherwise 1
+  local backend="$1" phases="${CX_PHASE:-decode}" phase
+  # Stage bookkeeping has to succeed before any preparation is attempted.
+  cx_write_runtime_stage backend-setup || return 1
+  cx_prepare_backend "$backend" && return 0
+  cx_log "WARN: $backend preparation failed"
+  if [ "$phases" = "both" ]; then phases="decode prefill"; fi
+  # Unquoted on purpose: word splitting yields one failed-case record per phase.
+  for phase in $phases; do
+    CX_FAILURE_MODE=backend-setup emit_failed_case "$backend" "$phase" 6
+  done
+  return 1
+}
+
+# dispatch_bench performs ONE run of whatever CX_BENCH (+ CX_* config env) currently selects. In
+# SHARD mode the caller invokes it once per case inside the same container and allocation.
+dispatch_bench() {
+  case "$CX_BENCH" in
+    nccl-ep) run_ep_suite "$CX_BENCH" ;;  # goes straight to the suite, no preparation call
+    deepep|deepep-v2|deepep-hybrid|mori|uccl)
+      # A failed preparation skips the suite; its status is handed back unchanged.
+      prepare_backend_or_record "$CX_BENCH" || return
+      run_ep_suite "$CX_BENCH"
+      ;;
+    *)
+      cx_die "unknown CX_BENCH=$CX_BENCH (want deepep|deepep-v2|mori|uccl|nccl-ep|deepep-hybrid)"
+      ;;
+  esac
+}
+
+rc=0
+cx_validate_shard_control "$PWD"
+# Build-only mode: rack launchers run the shared backend preparation hook once per
+# node inside a persistent named container, then direct rank processes reuse it.
+if [ -n "${CX_BUILD_ONLY:-}" ]; then
+  if cx_prepare_backend "${CX_BENCH:-}"; then
+    cx_persist_backend_env || rc=1
+  else
+    rc=1
+  fi
+  cx_log "backend preparation: bench=${CX_BENCH:-unknown} rc=$rc"
+  exit "$rc"
+fi
+if [ -n "${CX_SHARD_FILE:-}" ]; then
+  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
+  # All cases share (sku, backend, nodes), so backend preparation is paid once and cached.
+  # The shard path travels as argv, never spliced into Python source, so any file name is safe.
+  ncases="$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))["cases"]))' "$CX_SHARD_FILE")"
+  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
+  _cx_ts_base="$CX_TS" # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
+                       # cases sharing backend+phase overwrite each other at the same timestamp).
+  ci=0 failed_cases=0
+  while [ "$ci" -lt "$ncases" ]; do
+    CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
+    export CX_TS
+    # Map varying case fields plus the frozen v1 defaults into CX_* env.
+    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
+import json, sys, shlex
+c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
+def g(k, d=""):
+    v = c.get(k, d); return "" if v is None else str(v)
+env = {
+    "CX_BENCH": g("backend"),
+    "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
+    "CX_EP": g("ep", "1"),
+    "CX_EPLB": "1" if c.get("eplb") else "",
+    "CX_CASE_ID": g("case_id"), "CX_SUITE": g("suite"), "CX_WORKLOAD_NAME": g("workload"),
+    "CX_REQUIRED_PUBLICATION": g("required_publication"),
+    "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
+    "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
+}
+lines = [f"export {k}={shlex.quote(v)}" for k, v in env.items()]
+# Per-case timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32 everywhere).
+# A missing or empty field must fall back to the harness default, so UNSET it rather than
+# export-empty (an empty CX_ITERS would defeat the 8-iter default and break the run_ep argparse)
+# and rather than skip it (the value set by the previous case would leak into this one).
+# NOTE no apostrophes in this heredoc — bash command-substitution scanning chokes on them.
+timing = g("timing")
+parts = (timing.split(":") + ["", "", ""])[:3]
+for k, v in zip(("CX_ITERS", "CX_TRIALS", "CX_WARMUP"), parts):
+    if v:
+        lines.append(f"export {k}={shlex.quote(v)}")
+    else:
+        lines.append(f"unset {k} 2>/dev/null || true")
+print("\n".join(lines))
+PY
+)"
+    eval "$_exports"
+    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
+    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
+    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
+    # (FileNotFoundError .cx_workloads/.manifest.json). Unset so every case re-stages its own.
+    unset CX_WORKLOAD_DIR 2>/dev/null || true
+    cx_log "  [$((ci+1))/$ncases] $CX_BENCH $CX_PHASE routing=$CX_ROUTING eplb=${CX_EPLB:-0}"
+    _cx_case_ts="$CX_TS"
+    CX_TS="${_cx_case_ts}-a01"
+    export CX_ATTEMPT_ID=1 CX_TS
+    dispatch_bench || {
+      failed_cases=$((failed_cases+1))
+      cx_log "  [$((ci+1))/$ncases] $CX_BENCH case FAILED; failed-case record preserved"
+    }
+    export CX_TS="$_cx_case_ts"
+    ci=$((ci + 1))
+  done
+  if [ "${failed_cases:-0}" -gt 0 ]; then
+    cx_log "SHARD done: $failed_cases/$ncases case(s) failed"
+    rc=1
+  fi
+  # The base timestamp matches every per-case file, so the final summary covers the whole shard.
+  export CX_TS="$_cx_ts_base"
+else
+  _cx_single_ts="$CX_TS"
+  CX_TS="${_cx_single_ts}-a01"
+  export CX_ATTEMPT_ID=1 CX_TS
+  dispatch_bench || rc=1
+fi
+
+# Summary table for the log; also fails the job if no valid results were produced.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1
+exit "$rc"
diff --git a/experimental/CollectiveX/schemas/channel-v1.schema.json b/experimental/CollectiveX/schemas/channel-v1.schema.json
new file mode 100644
index 0000000000..663e22914b
--- /dev/null
+++ b/experimental/CollectiveX/schemas/channel-v1.schema.json
@@ -0,0 +1,23 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/channel-v1.schema.json",
+ "title": "CollectiveX public channel v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["format","channel","dataset","generated_at"],
+ "properties": {
+ "format": {"const": "collectivex.channel.v1"},
+ "channel": {"enum": ["latest-attempt","dev-latest"]},
+ "dataset": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes"],
+ "properties": {
+ "path": {"type": "string","pattern": "^datasets/[0-9a-f]{64}/dataset\\.json$"},
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "bytes": {"type": "integer","minimum": 1,"maximum": 33554432}
+ }
+ },
+ "generated_at": {"type": "string","format": "date-time"}
+ }
+}
diff --git a/experimental/CollectiveX/schemas/private-bundle-v1.schema.json b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json
new file mode 100644
index 0000000000..166c808930
--- /dev/null
+++ b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json
@@ -0,0 +1,162 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/private-bundle-v1.schema.json",
+ "title": "CollectiveX private attempt bundle v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "format",
+ "schema_version",
+ "created_at",
+ "ingest_id",
+ "run",
+ "matrix",
+ "sources",
+ "attempts",
+ "coverage",
+ "runtime_fingerprints",
+ "checksums",
+ "validation"
+ ],
+ "properties": {
+ "format": {"const": "collectivex.private.bundle.v1"},
+ "schema_version": {"const": 1},
+ "created_at": {"type": "string","format": "date-time"},
+ "ingest_id": {"$ref": "#/$defs/sha256"},
+ "run": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["repository","run_id","run_attempt","source_sha"],
+ "properties": {
+ "repository": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_attempt": {"type": "integer","minimum": 1},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"}
+ }
+ },
+ "matrix": {"$ref": "#/$defs/file"},
+ "sources": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/source"}},
+ "attempts": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "attempt_id",
+ "allocation_id",
+ "case_id",
+ "outcome",
+ "reason",
+ "selected",
+ "document",
+ "samples",
+ "runtime_fingerprint_sha256",
+ "series_ids",
+ "evidence_ids"
+ ],
+ "properties": {
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "selected": {"type": "boolean"},
+ "document": {"$ref": "#/$defs/file"},
+ "samples": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/file"}]},
+ "runtime_fingerprint_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "series_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}},
+ "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}}
+ }
+ }
+ },
+ "coverage": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["expected_cases","terminal_cases","complete","outcome_counts","selections"],
+ "properties": {
+ "expected_cases": {"type": "integer","minimum": 1},
+ "terminal_cases": {"type": "integer","minimum": 0},
+ "complete": {"type": "boolean"},
+ "outcome_counts": {"$ref": "#/$defs/outcomeCounts"},
+ "selections": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case_id","selected_attempt_id","outcome"],
+ "properties": {
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "selected_attempt_id": {"$ref": "#/$defs/attemptId"},
+ "outcome": {"$ref": "#/$defs/outcome"}
+ }
+ }
+ }
+ }
+ },
+ "runtime_fingerprints": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}},
+ "checksums": {"$ref": "#/$defs/file"},
+ "validation": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["policy","passed","checks"],
+ "properties": {
+ "policy": {"const": "collectivex-publisher-v1"},
+ "passed": {"const": true},
+ "checks": {
+ "type": "array",
+ "minItems": 1,
+ "uniqueItems": true,
+ "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$"}
+ }
+ }
+ }
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]},
+ "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]},
+ "outcomeCounts": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["success","unsupported","failed","invalid","diagnostic"],
+ "properties": {
+ "success": {"type": "integer","minimum": 0},
+ "unsupported": {"type": "integer","minimum": 0},
+ "failed": {"type": "integer","minimum": 0},
+ "invalid": {"type": "integer","minimum": 0},
+ "diagnostic": {"type": "integer","minimum": 0}
+ }
+ },
+ "file": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes"],
+ "properties": {
+ "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"},
+ "sha256": {"$ref": "#/$defs/sha256"},
+ "bytes": {"type": "integer","minimum": 1}
+ }
+ },
+ "source": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes","artifact_name"],
+ "properties": {
+ "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"},
+ "sha256": {"$ref": "#/$defs/sha256"},
+ "bytes": {"type": "integer","minimum": 1},
+ "artifact_name": {
+ "type": "string",
+ "pattern": "^cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*$"
+ }
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/public-dataset-v1.schema.json b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json
new file mode 100644
index 0000000000..87abf403d1
--- /dev/null
+++ b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json
@@ -0,0 +1,562 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/public-dataset-v1.schema.json",
+ "title": "CollectiveX sanitized public dataset v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "format",
+ "schema_version",
+ "generated_at",
+ "source_bundle_ids",
+ "promotion",
+ "coverage",
+ "attempts",
+ "series",
+ "cohorts",
+ "rankings",
+ "recommendations",
+ "sensitivities"
+ ],
+ "properties": {
+ "format": {"const": "collectivex.public.v1"},
+ "schema_version": {"const": 1},
+ "generated_at": {"type": "string","format": "date-time"},
+ "source_bundle_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}},
+ "promotion": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "status",
+ "reason",
+ "matrix_id",
+ "allocation_ids",
+ "required_allocations",
+ "requested_cases",
+ "terminal_cases",
+ "policy"
+ ],
+ "properties": {
+ "status": {"enum": ["promoted","diagnostic","quarantined"]},
+ "reason": {"$ref": "#/$defs/reason"},
+ "matrix_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "required_allocations": {"const": 3},
+ "requested_cases": {"type": "integer","minimum": 0},
+ "terminal_cases": {"type": "integer","minimum": 0},
+ "policy": {"const": "collectivex-decision-grade-v1"}
+ }
+ },
+ "coverage": {"type": "array","items": {"$ref": "#/$defs/coverage"}},
+ "attempts": {"type": "array","items": {"$ref": "#/$defs/attempt"}},
+ "series": {"type": "array","items": {"$ref": "#/$defs/series"}},
+ "cohorts": {"type": "array","items": {"$ref": "#/$defs/cohort"}},
+ "rankings": {"type": "array","items": {"$ref": "#/$defs/ranking"}},
+ "recommendations": {"type": "array","items": {"$ref": "#/$defs/recommendation"}},
+ "sensitivities": {"type": "array","items": {"$ref": "#/$defs/sensitivity"}}
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "workloadId": {"type": "string","pattern": "^cxwork-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128},
+ "publicationTier": {"enum": ["official","comparable-experimental"]},
+ "label": {"type": "string","minLength": 1,"maxLength": 160},
+ "nullableLabel": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/label"}]},
+ "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]},
+ "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]},
+ "coverage": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "case_id",
+ "label",
+ "required",
+ "sku",
+ "backend",
+ "phase",
+ "disposition",
+ "selected_attempt_id",
+ "outcome",
+ "failure_mode",
+ "reason",
+ "attempt_ids"
+ ],
+ "properties": {
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "label": {"$ref": "#/$defs/label"},
+ "required": {"type": "boolean"},
+ "sku": {"$ref": "#/$defs/safeId"},
+ "backend": {"$ref": "#/$defs/safeId"},
+ "phase": {"enum": ["decode","prefill"]},
+ "disposition": {"enum": ["runnable","unsupported"]},
+ "selected_attempt_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/attemptId"}]},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "failure_mode": {"$ref": "#/$defs/reason"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "attempt_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/attemptId"}}
+ }
+ },
+ "attempt": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "attempt_id",
+ "evidence",
+ "case_id",
+ "allocation_id",
+ "run_id",
+ "run_attempt",
+ "attempt_index",
+ "selected",
+ "outcome",
+ "failure_mode",
+ "reason",
+ "series_id",
+ "completed_at"
+ ],
+ "properties": {
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "evidence": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["evidence_id","point_id"],
+ "properties": {"evidence_id": {"$ref": "#/$defs/evidenceId"},"point_id": {"$ref": "#/$defs/pointId"}}
+ }
+ },
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_attempt": {"type": "integer","minimum": 1},
+ "attempt_index": {"type": "integer","minimum": 1},
+ "selected": {"type": "boolean"},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "failure_mode": {"$ref": "#/$defs/reason"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "series_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/seriesId"}]},
+ "completed_at": {"oneOf": [{"type": "null"},{"type": "string","format": "date-time"}]}
+ }
+ },
+ "eligibility": {
+ "type": "object",
+ "additionalProperties": false,
+ "allOf": [{
+ "if": {"properties": {"decision_grade": {"const": true}},"required": ["decision_grade"]},
+ "then": {"properties": {"reasons": {"maxItems": 0}}},
+ "else": {"properties": {"reasons": {"minItems": 1}}}
+ }],
+ "required": [
+ "decision_grade",
+ "allocation_ids",
+ "complete",
+ "correct",
+ "measured_roundtrip_p99",
+ "stable_p50",
+ "stable_p99",
+ "stable_ordering",
+ "p50_max_min_ratio",
+ "p99_max_min_ratio",
+ "reasons"
+ ],
+ "properties": {
+ "decision_grade": {"type": "boolean"},
+ "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "complete": {"type": "boolean"},
+ "correct": {"type": "boolean"},
+ "measured_roundtrip_p99": {"type": "boolean"},
+ "stable_p50": {"type": "boolean"},
+ "stable_p99": {"type": "boolean"},
+ "stable_ordering": {"type": "boolean"},
+ "p50_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]},
+ "p99_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]},
+ "reasons": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}
+ }
+ }
+ },
+ "percentiles": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["p50","p90","p95","p99"],
+ "properties": {
+ "p50": {"type": "number","exclusiveMinimum": 0},
+ "p90": {"type": "number","exclusiveMinimum": 0},
+ "p95": {"type": "number","exclusiveMinimum": 0},
+ "p99": {"type": "number","exclusiveMinimum": 0}
+ }
+ },
+ "component": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["origin","latency_us","logical_bytes","logical_payload_rate_gbps_at_latency_percentile","sample_count"],
+ "properties": {
+ "origin": {"enum": ["measured","derived"]},
+ "latency_us": {"$ref": "#/$defs/percentiles"},
+ "logical_bytes": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]},
+ "logical_payload_rate_gbps_at_latency_percentile": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/percentiles"}]},
+ "sample_count": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}
+ }
+ },
+ "nullableComponent": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/component"}]},
+ "point": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "point_id",
+ "tokens_per_rank",
+ "global_tokens",
+ "correct",
+ "routing",
+ "components",
+ "roundtrip_token_rate_at_latency_percentile",
+ "evidence_ids"
+ ],
+ "properties": {
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "tokens_per_rank": {"type": "integer","minimum": 1},
+ "global_tokens": {"type": "integer","minimum": 1},
+ "correct": {"type": "boolean"},
+ "routing": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "fanout_mean",
+ "recv_tokens_max",
+ "expert_load_cv",
+ "payload_rank_cv",
+ "hotspot_ratio",
+ "empty_expert_count",
+ "empty_rank_count",
+ "routed_copies"
+ ],
+ "properties": {
+ "fanout_mean": {"type": "number","minimum": 0},
+ "recv_tokens_max": {"type": "integer","minimum": 0},
+ "expert_load_cv": {"type": "number","minimum": 0},
+ "payload_rank_cv": {"type": "number","minimum": 0},
+ "hotspot_ratio": {"type": "number","minimum": 0},
+ "empty_expert_count": {"type": "integer","minimum": 0},
+ "empty_rank_count": {"type": "integer","minimum": 0},
+ "routed_copies": {"type": "integer","minimum": 1}
+ }
+ },
+ "components": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["dispatch","combine","roundtrip","isolated_sum"],
+ "properties": {
+ "dispatch": {"$ref": "#/$defs/nullableComponent"},
+ "combine": {"$ref": "#/$defs/nullableComponent"},
+ "roundtrip": {"$ref": "#/$defs/nullableComponent"},
+ "isolated_sum": {"$ref": "#/$defs/nullableComponent"}
+ }
+ },
+ "roundtrip_token_rate_at_latency_percentile": {"$ref": "#/$defs/percentiles"},
+ "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}}
+ }
+ },
+ "series": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "series_id",
+ "label",
+ "status",
+ "case_ids",
+ "allocation_ids",
+ "model",
+ "suite",
+ "phase",
+ "publication_tier",
+ "backend",
+ "build",
+ "system",
+ "workload",
+ "eplb",
+ "resource",
+ "measurement",
+ "points",
+ "eligibility"
+ ],
+ "properties": {
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "label": {"$ref": "#/$defs/label"},
+ "status": {"enum": ["decision-grade","diagnostic"]},
+ "case_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/caseId"}},
+ "allocation_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "model": {"$ref": "#/$defs/safeId"},
+ "suite": {"$ref": "#/$defs/safeId"},
+ "phase": {"enum": ["decode","prefill"]},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "backend": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["id","label","role","generation","version"],
+ "properties": {
+ "id": {"$ref": "#/$defs/safeId"},
+ "label": {"$ref": "#/$defs/label"},
+ "role": {"enum": ["library","reference"]},
+ "generation": {"$ref": "#/$defs/nullableLabel"},
+ "version": {"$ref": "#/$defs/nullableLabel"}
+ }
+ },
+ "build": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["implementation_contract_sha256","public_config_sha256","routing_control_sha256","runtime_fingerprint_sha256","image_digest","source_sha","squash_sha256"],
+ "properties": {
+ "implementation_contract_sha256": {"$ref": "#/$defs/sha256"},
+ "public_config_sha256": {"$ref": "#/$defs/sha256"},
+ "routing_control_sha256": {"$ref": "#/$defs/sha256"},
+ "runtime_fingerprint_sha256": {"$ref": "#/$defs/sha256"},
+ "image_digest": {"type": "string","pattern": "^sha256:[0-9a-f]{64}$"},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40,64}$"},
+ "squash_sha256": {"$ref": "#/$defs/sha256"}
+ }
+ },
+ "system": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["sku","label","vendor","topology_class","transport","world_size","ep_size","placement"],
+ "properties": {
+ "sku": {"$ref": "#/$defs/safeId"},
+ "label": {"$ref": "#/$defs/label"},
+ "vendor": {"enum": ["nvidia","amd"]},
+ "topology_class": {"$ref": "#/$defs/safeId"},
+ "transport": {"$ref": "#/$defs/safeId"},
+ "world_size": {"type": "integer","minimum": 1},
+ "ep_size": {"type": "integer","minimum": 1},
+ "placement": {"enum": ["packed"]}
+ }
+ },
+ "workload": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "workload_id",
+ "hidden",
+ "top_k",
+ "experts",
+ "routing",
+ "eplb",
+ "dispatch_dtype",
+ "combine_dtype",
+ "activation_profile"
+ ],
+ "properties": {
+ "workload_id": {"$ref": "#/$defs/workloadId"},
+ "hidden": {"type": "integer","minimum": 1},
+ "top_k": {"type": "integer","minimum": 1},
+ "experts": {"type": "integer","minimum": 1},
+ "routing": {"enum": ["uniform","zipf"]},
+ "eplb": {"type": "boolean"},
+ "dispatch_dtype": {"const": "bf16"},
+ "combine_dtype": {"const": "bf16"},
+ "activation_profile": {"const": "canonical-counter-source-v3"}
+ }
+ },
+ "eplb": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "enabled",
+ "planner",
+ "mapping_sha256",
+ "logical_experts",
+ "physical_experts",
+ "redundant_experts",
+ "reference_tokens_per_rank",
+ "replicated_experts",
+ "max_replicas",
+ "imbalance_before",
+ "imbalance_after"
+ ],
+ "properties": {
+ "enabled": {"type": "boolean"},
+ "planner": {"$ref": "#/$defs/nullableLabel"},
+ "mapping_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "logical_experts": {"type": "integer","minimum": 1},
+ "physical_experts": {"type": "integer","minimum": 1},
+ "redundant_experts": {"type": "integer","minimum": 0},
+ "reference_tokens_per_rank": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]},
+ "replicated_experts": {"type": "integer","minimum": 0},
+ "max_replicas": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 0}]},
+ "imbalance_before": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]},
+ "imbalance_after": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]}
+ }
+ },
+ "resource": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["mode","profile","comm_units_kind","configured_units"],
+ "properties": {
+ "mode": {"const": "tuned"},
+ "profile": {"$ref": "#/$defs/safeId"},
+ "comm_units_kind": {"$ref": "#/$defs/nullableLabel"},
+ "configured_units": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}
+ }
+ },
+ "measurement": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "contract",
+ "sampling_contract",
+ "iters",
+ "trials",
+ "warmups",
+ "samples_per_component",
+ "headline_component",
+ "headline_percentile"
+ ],
+ "properties": {
+ "contract": {"const": "layout-and-dispatch-v1"},
+ "sampling_contract": {"const": "fixed-512-v1"},
+ "iters": {"const": 8},
+ "trials": {"const": 64},
+ "warmups": {"const": 32},
+ "samples_per_component": {"const": 512},
+ "headline_component": {"const": "roundtrip"},
+ "headline_percentile": {"const": "p99"}
+ }
+ },
+ "points": {"type": "array","minItems": 1,"items": {"$ref": "#/$defs/point"}},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "cohort": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "cohort_id",
+ "kind",
+ "label",
+ "description",
+ "series_ids",
+ "controlled_factors",
+ "varying_factors",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "kind": {"enum": ["library","chip","system","routing"]},
+ "label": {"$ref": "#/$defs/label"},
+ "description": {"$ref": "#/$defs/label"},
+ "series_ids": {"type": "array","minItems": 2,"uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}},
+ "controlled_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}},
+ "varying_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "metric": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["operation","statistic","measure","objective","tokens_per_rank","phase"],
+ "properties": {
+ "operation": {"const": "roundtrip"},
+ "statistic": {"enum": ["p50","p99"]},
+ "measure": {"enum": ["latency_us","logical_payload_rate_gbps_at_latency_percentile"]},
+ "objective": {"enum": ["min","max"]},
+ "tokens_per_rank": {"type": "integer","minimum": 1},
+ "phase": {"enum": ["decode","prefill"]}
+ }
+ },
+ "ranking": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["ranking_id","cohort_id","label","metric","entries","publication_tier","eligibility"],
+ "properties": {
+ "ranking_id": {"type": "string","pattern": "^cxranking-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "metric": {"$ref": "#/$defs/metric"},
+ "entries": {
+ "type": "array",
+ "minItems": 2,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["rank","series_id","point_id","value","unit"],
+ "properties": {
+ "rank": {"type": "integer","minimum": 1},
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "value": {"type": "number","exclusiveMinimum": 0},
+ "unit": {"enum": ["us","GB/s"]}
+ }
+ }
+ },
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "recommendation": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "recommendation_id",
+ "cohort_id",
+ "label",
+ "objective",
+ "series_id",
+ "point_id",
+ "value",
+ "unit",
+ "rationale",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "recommendation_id": {"type": "string","pattern": "^cxrecommendation-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "objective": {"enum": ["min-p50-latency","min-p99-latency","max-payload-rate-at-p50-latency","max-payload-rate-at-p99-latency"]},
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "value": {"type": "number","exclusiveMinimum": 0},
+ "unit": {"enum": ["us","GB/s"]},
+ "rationale": {"$ref": "#/$defs/label"},
+ "publication_tier": {"const": "official"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "sensitivity": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "sensitivity_id",
+ "cohort_id",
+ "label",
+ "baseline_series_id",
+ "candidate_series_id",
+ "metric",
+ "signed_change_ratio",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "sensitivity_id": {"type": "string","pattern": "^cxsensitivity-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "baseline_series_id": {"$ref": "#/$defs/seriesId"},
+ "candidate_series_id": {"$ref": "#/$defs/seriesId"},
+ "metric": {"$ref": "#/$defs/metric"},
+ "signed_change_ratio": {"type": "number"},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/raw-case-v1.schema.json b/experimental/CollectiveX/schemas/raw-case-v1.schema.json
new file mode 100644
index 0000000000..ccf85b19ad
--- /dev/null
+++ b/experimental/CollectiveX/schemas/raw-case-v1.schema.json
@@ -0,0 +1,1142 @@
+{
+ "$id": "https://inferencex.com/schemas/collectivex/raw-case-v1.schema.json",
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$defs": {
+ "deepep_v2_jit_cubin": {
+ "additionalProperties": false,
+ "properties": {
+ "cache_key": {
+ "pattern":"^kernel\\.[A-Za-z0-9_+-]+\\.[0-9a-f]{32}$",
+ "type":"string"
+ },
+ "cubin_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "sass_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "source_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["cache_key","cubin_sha256","sass_sha256","source_sha256"],
+ "type": "object"
+ },
+ "hybrid_jit_rank_artifact": {
+ "additionalProperties": false,
+ "properties": {
+ "bytes": {"minimum":1,"type":"integer"},
+ "rank": {"minimum":0,"type":"integer"},
+ "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["bytes","rank","sha256"],
+ "type": "object"
+ },
+ "hybrid_realized_config": {
+ "additionalProperties": false,
+ "properties": {
+ "backward_combine_api": {"type":"boolean"},
+ "device_side_sync_combine_api": {"type":"boolean"},
+ "device_side_sync_dispatch_api": {"type":"boolean"},
+ "forward_dispatch_api": {"type":"boolean"},
+ "hidden_dim": {"minimum":1,"type":"integer"},
+ "max_num_of_tokens_per_rank": {"minimum":1,"type":"integer"},
+ "num_of_additional_in_flight_s2g_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_additional_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_permute": {"minimum":0,"type":"integer"},
+ "num_of_blocks_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_unpermute": {"minimum":0,"type":"integer"},
+ "num_of_experts_per_rank": {"minimum":1,"type":"integer"},
+ "num_of_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_in_flight_s2g_permute_block_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_nodes": {"minimum":1,"type":"integer"},
+ "num_of_ranks_per_node": {"minimum":1,"type":"integer"},
+ "num_of_stages_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_g2s_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_permute_block_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_s2g_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_threads_per_block_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_group_combine_api": {"minimum":0,"type":"integer"},
+ "pad_multiple": {"minimum":0,"type":"integer"},
+ "token_data_type": {"enum":["UINT8","UINT16"]}
+ },
+ "required": [
+ "backward_combine_api","device_side_sync_combine_api","device_side_sync_dispatch_api",
+ "forward_dispatch_api","hidden_dim","max_num_of_tokens_per_rank",
+ "num_of_additional_in_flight_s2g_combine_api",
+ "num_of_additional_in_flight_s2g_dispatch_api","num_of_blocks_combine_api",
+ "num_of_blocks_dispatch_api","num_of_blocks_permute","num_of_blocks_preprocessing_api",
+ "num_of_blocks_unpermute","num_of_experts_per_rank",
+ "num_of_in_flight_s2g_dispatch_api","num_of_in_flight_s2g_permute_block_dispatch_api",
+ "num_of_nodes","num_of_ranks_per_node","num_of_stages_dispatch_api",
+ "num_of_stages_g2s_combine_api","num_of_stages_permute_block_dispatch_api",
+ "num_of_stages_s2g_combine_api","num_of_threads_per_block_preprocessing_api",
+ "num_of_tokens_per_chunk_combine_api","num_of_tokens_per_chunk_dispatch_api",
+ "num_of_tokens_per_chunk_preprocessing_api","num_of_tokens_per_group_combine_api",
+ "pad_multiple","token_data_type"
+ ],
+ "type": "object"
+ },
+ "nullable_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "oracle": {
+ "additionalProperties": false,
+ "properties": {
+ "checks": {
+ "additionalProperties": false,
+ "properties": {
+ "combine_values": {"type":"boolean"},
+ "counts": {"type":"boolean"},
+ "metadata": {"type":"boolean"},
+ "multiplicity": {"type":"boolean"},
+ "payload": {"type":"boolean"},
+ "source_set": {"type":"boolean"},
+ "weights": {"type":"boolean"}
+ },
+ "required": ["combine_values","counts","metadata","multiplicity","payload","source_set","weights"],
+ "type": "object"
+ },
+ "atol": {"const":0.02},
+ "combine_weight_semantics": {"const":"unweighted-rank-sum"},
+ "contract": {"const":"expert-specific-transform-v1"},
+ "dispatch_sha256": {"$ref":"#/$defs/nullable_sha256"},
+ "max_absolute_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_elementwise_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_weight_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "order_sha256": {"$ref":"#/$defs/nullable_sha256"},
+ "ordering_contract": {"minLength":1,"type":"string"},
+ "passed": {"type":"boolean"},
+ "receive_count": {"minimum":0,"type":"integer"},
+ "rtol": {"const":0.05}
+ },
+ "required": [
+ "atol",
+ "checks",
+ "combine_weight_semantics",
+ "contract",
+ "dispatch_sha256",
+ "max_absolute_error",
+ "max_elementwise_relative_error",
+ "max_relative_error",
+ "max_weight_error",
+ "order_sha256",
+ "ordering_contract",
+ "passed",
+ "receive_count",
+ "rtol"
+ ],
+ "type": "object"
+ },
+ "percentiles": {
+ "additionalProperties": false,
+ "properties": {
+ "p50": {"minimum":0,"type":"number"},
+ "p90": {"minimum":0,"type":"number"},
+ "p95": {"minimum":0,"type":"number"},
+ "p99": {"minimum":0,"type":"number"}
+ },
+ "required": ["p50","p90","p95","p99"],
+ "type": "object"
+ },
+ "component": {
+ "additionalProperties": false,
+ "properties": {
+ "availability": {"enum":["measured","derived","unavailable"]},
+ "origin": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "percentiles_us": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]},
+ "sample_count": {"minimum":0,"type":"integer"}
+ },
+ "required": ["availability","origin","percentiles_us","sample_count"],
+ "type": "object"
+ },
+ "histogram": {
+ "additionalProperties": false,
+ "properties": {
+ "bins": {"minimum":1,"type":"integer"},
+ "counts": {"items":{"minimum":0,"type":"integer"},"minItems":1,"type":"array"},
+ "max": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"number"},
+ "n": {"minimum":1,"type":"integer"}
+ },
+ "required": ["n","min","max","bins","counts"],
+ "type": "object"
+ },
+ "scheduled_case": {
+ "additionalProperties": false,
+ "properties": {
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "canonical": {"const":true},
+ "ep": {"minimum":1,"type":"integer"},
+ "eplb": {"type":"boolean"},
+ "experts": {"minimum":1,"type":"integer"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "hidden": {"minimum":1,"type":"integer"},
+ "ladder": {"pattern":"^[1-9][0-9]*( [1-9][0-9]*)*$","type":"string"},
+ "nodes": {"minimum":1,"type":"integer"},
+ "phase": {"enum":["decode","prefill"]},
+ "required_publication": {"enum":["official","comparable-experimental"]},
+ "routing": {"enum":["uniform","zipf"]},
+ "samples_per_point": {"const":512},
+ "scale_up_domain": {"minimum":1,"type":"integer"},
+ "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "timing": {"const":"8:64:32"},
+ "topk": {"minimum":1,"type":"integer"},
+ "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"},
+ "workload": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": [
+ "backend",
+ "canonical",
+ "eplb",
+ "ep",
+ "experts",
+ "gpus_per_node",
+ "hidden",
+ "ladder",
+ "nodes",
+ "phase",
+ "required_publication",
+ "routing",
+ "samples_per_point",
+ "scale_up_domain",
+ "suite",
+ "timing",
+ "topk",
+ "warmup_semantics",
+ "workload"
+ ],
+ "type": "object"
+ },
+ "git_run": {
+ "additionalProperties": false,
+ "properties": {
+ "artifact": {"minLength":1,"type":"string"},
+ "job": {"minLength":1,"type":"string"},
+ "ref": {"minLength":1,"type":"string"},
+ "repo": {"pattern":"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$","type":"string"},
+ "run_attempt": {"pattern":"^[1-9][0-9]*$","type":"string"},
+ "run_id": {"pattern":"^[1-9][0-9]*$","type":"string"},
+ "source_sha": {"pattern":"^[0-9a-f]{40}$","type":"string"}
+ },
+ "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"],
+ "type": "object"
+ }
+ },
+ "additionalProperties": false,
+ "properties": {
+ "case": {
+ "additionalProperties": false,
+ "properties": {
+ "attempt_ordinal": {"minimum":1,"type":"integer"},
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "ep_size": {"minimum":1,"type":"integer"},
+ "eplb": {
+ "additionalProperties": false,
+ "properties": {
+ "enabled": {"type":"boolean"},
+ "imbalance_after": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "imbalance_before": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "mapping_hash": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "max_replicas": {"oneOf":[{"type":"null"},{"minimum":0,"type":"integer"}]},
+ "num_logical_experts": {"minimum":1,"type":"integer"},
+ "num_physical_experts": {"minimum":1,"type":"integer"},
+ "num_redundant": {"minimum":0,"type":"integer"},
+ "planner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "reference_tokens_per_rank": {"oneOf":[{"type":"null"},{"minimum":1,"type":"integer"}]},
+ "replicated_experts": {"minimum":0,"type":"integer"}
+ },
+ "required": [
+ "enabled",
+ "imbalance_after",
+ "imbalance_before",
+ "mapping_hash",
+ "max_replicas",
+ "num_logical_experts",
+ "num_physical_experts",
+ "num_redundant",
+ "planner",
+ "reference_tokens_per_rank",
+ "replicated_experts"
+ ],
+ "type": "object"
+ },
+ "mode": {"const":"normal"},
+ "phase": {"enum":["decode","prefill"]},
+ "required_publication": {"enum":["official","comparable-experimental"]},
+ "resource_mode": {"const":"tuned"},
+ "runner": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "shape": {
+ "additionalProperties": false,
+ "properties": {
+ "activation_profile": {"const":"canonical-counter-source-v3"},
+ "dispatch_dtype": {"const":"bf16"},
+ "eplb": {"type":"boolean"},
+ "experts": {"minimum":1,"type":"integer"},
+ "experts_per_rank": {"minimum":1,"type":"integer"},
+ "hidden": {"minimum":1,"type":"integer"},
+ "kernel_gen": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "num_logical_experts": {"minimum":1,"type":"integer"},
+ "quant": {
+ "additionalProperties": false,
+ "properties": {
+ "combine_accum_dtype": {"minLength":1,"type":"string"},
+ "combine_input_dtype": {"const":"bf16"},
+ "combine_output_dtype": {"const":"bf16"},
+ "combine_quant_mode": {"const":"none"},
+ "scale_layout": {"type":"null"}
+ },
+ "required": [
+ "combine_accum_dtype",
+ "combine_input_dtype",
+ "combine_output_dtype",
+ "combine_quant_mode",
+ "scale_layout"
+ ],
+ "type": "object"
+ },
+ "routing": {"enum":["uniform","zipf"]},
+ "topk": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "activation_profile",
+ "dispatch_dtype",
+ "eplb",
+ "experts",
+ "experts_per_rank",
+ "hidden",
+ "kernel_gen",
+ "num_logical_experts",
+ "quant",
+ "routing",
+ "topk"
+ ],
+ "type": "object"
+ },
+ "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "workload_name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": [
+ "attempt_ordinal",
+ "backend",
+ "eplb",
+ "ep_size",
+ "mode",
+ "phase",
+ "required_publication",
+ "resource_mode",
+ "runner",
+ "shape",
+ "suite",
+ "workload_name"
+ ],
+ "type": "object"
+ },
+ "format": {"const":"collectivex.ep.v1"},
+ "generated_at": {"format":"date-time","type":"string"},
+ "identity": {
+ "additionalProperties": false,
+ "properties": {
+ "allocation_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "artifact": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "execution_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "job": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "repo": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "run_attempt": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "run_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "runner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "source_sha": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"],
+ "type": "object"
+ },
+ "allocation_id": {"pattern":"^cxallocation-v1-[0-9a-f]{64}$","type":"string"},
+ "attempt_id": {"pattern":"^cxattempt-v1-[0-9a-f]{64}$","type":"string"},
+ "attempt_ordinal": {"minimum":1,"type":"integer"},
+ "case_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "case": {"$ref":"#/$defs/scheduled_case"},
+ "profile": {
+ "const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }
+ },
+ "sku": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": ["case","profile","sku"],
+ "type": "object"
+ },
+ "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"},
+ "series_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "implementation_contract_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "public_config_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "routing_control_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"},
+ "image_digest": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "runtime_fingerprint_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "source_sha": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{40}$","type":"string"}]},
+ "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "workload_id": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}
+ },
+ "required": [
+ "backend",
+ "implementation_contract_sha256",
+ "public_config_sha256",
+ "routing_control_sha256",
+ "case_id",
+ "image_digest",
+ "runtime_fingerprint_sha256",
+ "source_sha",
+ "squash_sha256",
+ "workload_id"
+ ],
+ "type": "object"
+ },
+ "series_id": {"pattern":"^cxseries-v1-[0-9a-f]{64}$","type":"string"}
+ },
+ "required": [
+ "allocation_factors",
+ "allocation_id",
+ "attempt_id",
+ "attempt_ordinal",
+ "case_factors",
+ "case_id",
+ "series_factors",
+ "series_id"
+ ],
+ "type": "object"
+ },
+ "implementation": {
+ "additionalProperties": false,
+ "properties": {
+ "kernel_generation": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "provenance": {
+ "properties": {
+ "allow_hybrid_mode": {"const":false},
+ "communication_backend": {"const":"nccl-device-lsa"},
+ "deepep_fix_pr": {"const":630},
+ "deepep_pr": {"const":605},
+ "deterministic": {"type": "boolean"},
+ "gin_enabled": {"const":false},
+ "jit_cubins": {
+ "items": {"$ref":"#/$defs/deepep_v2_jit_cubin"},
+ "maxItems": 5,
+ "minItems": 5,
+ "type": "array",
+ "uniqueItems": true
+ },
+ "jit_kernel_keys": {
+ "items": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"},
+ "maxItems": 3,
+ "minItems": 3,
+ "type": "array",
+ "uniqueItems": true
+ },
+ "jit_random_seed": {"const":"collectivex-deepep-v2-fa8a9b1"},
+ "jit_shared_objects": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "kernel_key": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"},
+ "rank_artifacts": {
+ "items": {"$ref":"#/$defs/hybrid_jit_rank_artifact"},
+ "minItems": 1,
+ "type": "array"
+ }
+ },
+ "required": ["kernel_key","rank_artifacts"],
+ "type": "object"
+ },
+ "maxItems": 3,
+ "minItems": 3,
+ "type": "array"
+ },
+ "num_experts": {"minimum": 1, "type": "integer"},
+ "realized_config": {"$ref":"#/$defs/hybrid_realized_config"},
+ "tuning_num_experts": {"minimum": 1, "type": "integer"},
+ "uccl_dependency_versions": {
+ "additionalProperties": false,
+ "properties": {
+ "intervaltree": {"const":"3.1.0"},
+ "nvidia-cuda-runtime-cu12": {"const":"12.9.79"},
+ "sortedcontainers": {"const":"2.4.0"}
+ },
+ "required": ["intervaltree","nvidia-cuda-runtime-cu12","sortedcontainers"],
+ "type": "object"
+ }
+ },
+ "type": "object",
+ "propertyNames": {
+ "enum": [
+ "allocated_qps",
+ "allow_hybrid_mode",
+ "allow_mnnvl",
+ "allow_multiple_reduction",
+ "api",
+ "api_signature_sha256",
+ "backend",
+ "backend_lineage",
+ "block_num",
+ "block_num_floored",
+ "block_num_target",
+ "branch",
+ "collective_library",
+ "combine_dtype",
+ "combine_warps",
+ "communication_backend",
+ "cuda_version",
+ "deepep_commit",
+ "deepep_distribution_version",
+ "deepep_fix_pr",
+ "deepep_pr",
+ "deepep_tree",
+ "deepep_version",
+ "deterministic",
+ "device_cus",
+ "device_sms",
+ "dispatch_dtype",
+ "dispatch_warps",
+ "enable_sdma",
+ "fmt_commit",
+ "gpus_per_node",
+ "gin_enabled",
+ "heap_size",
+ "impl",
+ "jit_cache_key",
+ "jit_cubins",
+ "jit_kernel_keys",
+ "jit_random_seed",
+ "jit_shared_objects",
+ "kernel_type",
+ "loaded_libraries",
+ "local_experts",
+ "logical_scaleout_ranks",
+ "logical_scaleup_ranks",
+ "mapping_variant",
+ "max_num_inp_token_per_rank",
+ "max_num_tokens",
+ "max_total_recv_tokens",
+ "mnnvl_comm",
+ "mode",
+ "mori_commit",
+ "nccl_communicator",
+ "nccl_package_version",
+ "nccl_version",
+ "num_experts",
+ "num_max_tokens_per_rank",
+ "num_nvl_bytes",
+ "num_qps",
+ "num_sms",
+ "nvshmem_package_version",
+ "path",
+ "physical_nvlink_ranks",
+ "physical_rdma_ranks",
+ "prefer_overlap_with_compute",
+ "reference_semantics",
+ "realized_config",
+ "requested_num_sms",
+ "resource_mode",
+ "routing_factor",
+ "routing_metadata",
+ "sm_fraction",
+ "top_k",
+ "torch_git_version",
+ "torch_version",
+ "transport",
+ "trtllm",
+ "tuned_source",
+ "tuning_num_experts",
+ "uccl_commit",
+ "uccl_dependency_versions",
+ "uccl_version",
+ "uccl_wrapper_commit",
+ "workspace"
+ ]
+ }
+ },
+ "resource_profile": {
+ "additionalProperties": false,
+ "properties": {
+ "achieved_fraction": {},
+ "comm_units_kind": {},
+ "configured_units": {},
+ "conformance_class": {},
+ "device_units": {},
+ "fixed_kernel": {},
+ "nonconforming": {},
+ "pareto_eligible": {},
+ "persistent_bytes": {},
+ "qps_per_rank": {},
+ "requested_fraction": {},
+ "tuned_source": {},
+ "target_achieved_within_tol": {},
+ "tolerance": {},
+ "resource_class": {},
+ "warps_combine": {},
+ "warps_dispatch": {}
+ },
+ "required": [
+ "comm_units_kind",
+ "requested_fraction",
+ "configured_units",
+ "device_units",
+ "achieved_fraction",
+ "warps_dispatch",
+ "warps_combine",
+ "qps_per_rank",
+ "persistent_bytes",
+ "tuned_source",
+ "resource_class",
+ "conformance_class",
+ "tolerance",
+ "target_achieved_within_tol",
+ "nonconforming",
+ "fixed_kernel",
+ "pareto_eligible"
+ ],
+ "type": "object"
+ }
+ },
+ "required": ["kernel_generation","name","provenance","resource_profile"],
+ "type": "object"
+ },
+ "measurement": {
+ "additionalProperties": false,
+ "properties": {
+ "component_order_contract": {"const":"roundtrip-dispatch-activation-only-combine-v2"},
+ "conditioning": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"const":"fixed-phase-ramp-8-roundtrips-v1"},
+ "ladder": {"items":{"minimum":1,"type":"integer"},"minItems":1,"type":"array"},
+ "roundtrips_per_shape": {"const":8}
+ },
+ "required": ["contract","ladder","roundtrips_per_shape"],
+ "type": "object"
+ },
+ "contract": {"const":"layout-and-dispatch-v1"},
+ "rows": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "anomalies": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "T": {"minimum":1,"type":"integer"},
+ "component_floor_p50": {"minimum":0,"type":"number"},
+ "isolated_sum_p99": {"minimum":0,"type":"number"},
+ "ratio": {"minimum":0,"type":"number"},
+ "roundtrip_p50": {"minimum":0,"type":"number"},
+ "roundtrip_p99": {"minimum":0,"type":"number"},
+ "threshold": {"minimum":0,"type":"number"},
+ "type": {"enum":["roundtrip_gt_isolated_sum","roundtrip_lt_component_floor"]}
+ },
+ "required": ["type","T"],
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "components": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"$ref":"#/$defs/component"},
+ "dispatch": {"$ref":"#/$defs/component"},
+ "isolated_sum": {"$ref":"#/$defs/component"},
+ "roundtrip": {"$ref":"#/$defs/component"}
+ },
+ "required": ["combine","dispatch","isolated_sum","roundtrip"],
+ "type": "object"
+ },
+ "correctness": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"const":"expert-specific-transform-v1"},
+ "max_relative_error": {"minimum":0,"type":"number"},
+ "passed": {"type":"boolean"},
+ "rank_evidence": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "input_unchanged": {"type":"boolean"},
+ "order_stable": {"type":"boolean"},
+ "post_timing": {"$ref":"#/$defs/oracle"},
+ "pre_timing": {"$ref":"#/$defs/oracle"},
+ "rank": {"minimum":0,"type":"integer"}
+ },
+ "required": ["input_unchanged","order_stable","post_timing","pre_timing","rank"],
+ "type": "object"
+ },
+ "minItems": 1,
+ "type": "array"
+ },
+ "scope": {"const":"dispatch-metadata-and-transformed-combine"}
+ },
+ "required": ["contract","max_relative_error","passed","rank_evidence","scope"],
+ "type": "object"
+ },
+ "evidence_id": {"pattern":"^cxevidence-v1-[0-9a-f]{64}$","type":"string"},
+ "global_tokens": {"minimum":1,"type":"integer"},
+ "logical_bytes": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"minimum":1,"type":"integer"},
+ "dispatch": {"minimum":1,"type":"integer"},
+ "roundtrip": {"minimum":1,"type":"integer"}
+ },
+ "required": ["combine","dispatch","roundtrip"],
+ "type": "object"
+ },
+ "point_id": {"pattern":"^cxpoint-v1-[0-9a-f]{64}$","type":"string"},
+ "receive": {
+ "additionalProperties": false,
+ "properties": {
+ "max": {"minimum":0,"type":"integer"},
+ "mean": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"integer"},
+ "total": {"minimum":0,"type":"integer"}
+ },
+ "required": ["max","mean","min","total"],
+ "type": "object"
+ },
+ "routing": {
+ "additionalProperties": false,
+ "properties": {
+ "empty_expert_count": {"minimum":0,"type":"integer"},
+ "empty_rank_count": {"minimum":0,"type":"integer"},
+ "expert_assignment_rank_cv": {"minimum":0,"type":"number"},
+ "expert_assignments_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "expert_load_cv": {"minimum":0,"type":"number"},
+ "expert_load_max": {"minimum":0,"type":"integer"},
+ "expert_load_mean": {"minimum":0,"type":"number"},
+ "expert_load_min": {"minimum":0,"type":"integer"},
+ "fanout_histogram": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "fanout_max": {"minimum":1,"type":"integer"},
+ "fanout_mean": {"minimum":0,"type":"number"},
+ "fanout_min": {"minimum":1,"type":"integer"},
+ "hash": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "hotspot_ratio": {"minimum":0,"type":"number"},
+ "locality": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": false,
+ "properties": {
+ "copies": {"minimum":0,"type":"integer"},
+ "cross_domain_fraction": {"minimum":0,"type":"number"},
+ "cross_node_fraction": {"minimum":0,"type":"number"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "local_rank_fraction": {"minimum":0,"type":"number"},
+ "placement": {"const":"packed"},
+ "same_node_fraction": {"minimum":0,"type":"number"},
+ "same_scaleup_domain_fraction": {"minimum":0,"type":"number"},
+ "scale_up_domain": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "placement",
+ "local_rank_fraction",
+ "same_node_fraction",
+ "same_scaleup_domain_fraction",
+ "cross_node_fraction",
+ "cross_domain_fraction",
+ "gpus_per_node",
+ "scale_up_domain",
+ "copies"
+ ],
+ "type": "object"
+ }
+ ]
+ },
+ "payload_copies_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "payload_rank_cv": {"minimum":0,"type":"number"},
+ "routed_copies": {"minimum":1,"type":"integer"},
+ "source_token_stats": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": false,
+ "properties": {
+ "cv": {"minimum":0,"type":"number"},
+ "empty_ranks": {"minimum":0,"type":"integer"},
+ "max": {"minimum":0,"type":"integer"},
+ "mean": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"integer"},
+ "ranks": {"minimum":1,"type":"integer"},
+ "total": {"minimum":0,"type":"integer"}
+ },
+ "required": ["min","mean","max","cv","empty_ranks","total","ranks"],
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "required": [
+ "empty_expert_count",
+ "empty_rank_count",
+ "expert_assignment_rank_cv",
+ "expert_assignments_per_rank",
+ "expert_load_cv",
+ "expert_load_max",
+ "expert_load_mean",
+ "expert_load_min",
+ "fanout_histogram",
+ "fanout_max",
+ "fanout_mean",
+ "fanout_min",
+ "hash",
+ "hotspot_ratio",
+ "locality",
+ "payload_copies_per_rank",
+ "payload_rank_cv",
+ "routed_copies",
+ "source_token_stats"
+ ],
+ "type": "object"
+ },
+ "sample_histograms": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]},
+ "dispatch": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]},
+ "roundtrip": {"$ref":"#/$defs/histogram"}
+ },
+ "required": ["dispatch","combine","roundtrip"],
+ "type": "object"
+ },
+ "sample_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "token_rate_at_latency_percentile": {"$ref":"#/$defs/percentiles"},
+ "tokens_per_rank": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "anomalies",
+ "components",
+ "correctness",
+ "evidence_id",
+ "global_tokens",
+ "logical_bytes",
+ "point_id",
+ "receive",
+ "routing",
+ "sample_histograms",
+ "sample_sha256",
+ "token_rate_at_latency_percentile",
+ "tokens_per_rank"
+ ],
+ "type": "object"
+ },
+ "minItems": 1,
+ "type": "array"
+ },
+ "sampling": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"const":"fixed-512-v1"},
+ "iterations_per_trial": {"const":8},
+ "percentile_method": {"const":"nearest-rank"},
+ "reduction": {"const":"cross-rank-max-per-iteration"},
+ "samples_per_component": {"const":512},
+ "trials": {"const":64},
+ "warmup_iterations": {"const":32},
+ "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"}
+ },
+ "required": [
+ "contract",
+ "iterations_per_trial",
+ "percentile_method",
+ "reduction",
+ "samples_per_component",
+ "trials",
+ "warmup_iterations",
+ "warmup_semantics"
+ ],
+ "type": "object"
+ },
+ "source_allocation": {"const":"even"}
+ },
+ "required": [
+ "component_order_contract",
+ "conditioning",
+ "contract",
+ "rows",
+ "sampling",
+ "source_allocation"
+ ],
+ "type": "object"
+ },
+ "outcome": {
+ "additionalProperties": false,
+ "properties": {
+ "publication_status": {"enum":["diagnostic","invalid"]},
+ "reasons": {"items":{"type":"string"},"type":"array"},
+ "status": {"enum":["success","invalid"]},
+ "validity": {
+ "additionalProperties": false,
+ "properties": {
+ "anomaly_free": {"type":"boolean"},
+ "execution_status": {"enum":["complete","failed"]},
+ "measurement_conformance": {"enum":["conformant","nonconformant"]},
+ "provenance_complete": {"type":"boolean"},
+ "resource_conformance": {"minLength":1,"type":"string"},
+ "sampling_conformance": {"enum":["conformant","nonconformant"]},
+ "semantic_correctness": {"enum":["pass","fail"]},
+ "workload_identity": {"enum":["consistent-across-ranks","inconsistent"]},
+ "workload_source": {"enum":["canonical-serialized","seeded-runtime"]}
+ },
+ "required": [
+ "execution_status",
+ "semantic_correctness",
+ "workload_identity",
+ "workload_source",
+ "measurement_conformance",
+ "sampling_conformance",
+ "resource_conformance",
+ "provenance_complete",
+ "anomaly_free"
+ ],
+ "type": "object"
+ }
+ },
+ "required": ["publication_status","reasons","status","validity"],
+ "type": "object"
+ },
+ "provenance": {
+ "additionalProperties": false,
+ "properties": {
+ "command": {"minLength":1,"type":"string"},
+ "distributed_launcher": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "git_run": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/git_run"}]},
+ "image": {
+ "additionalProperties": false,
+ "properties": {
+ "arch": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "digest": {
+ "oneOf": [{"type":"null"},{"pattern":"^sha256:[0-9a-f]{64}$","type":"string"}]
+ },
+ "digest_verified": {"type":"boolean"},
+ "reference": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}
+ },
+ "required": ["arch","digest","digest_verified","reference","squash_sha256"],
+ "type": "object"
+ },
+ "redaction": {"const":"sanitized-v1"}
+ },
+ "required": ["command","distributed_launcher","git_run","image","redaction"],
+ "type": "object"
+ },
+ "record_type": {"const":"case-attempt"},
+ "runtime_fingerprint": {
+ "additionalProperties": false,
+ "properties": {
+ "accelerator_runtime": {
+ "additionalProperties": false,
+ "properties": {
+ "kind": {"enum":["cuda","hip"]},
+ "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "collective_library": {
+ "additionalProperties": false,
+ "properties": {
+ "kind": {"enum":["nccl","rccl"]},
+ "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "device": {
+ "additionalProperties": false,
+ "properties": {
+ "arch": {"minLength":1,"type":"string"},
+ "compute_units": {"minimum":1,"type":"integer"},
+ "memory_bytes": {"minimum":1,"type":"integer"},
+ "product": {"minLength":1,"type":"string"},
+ "warp_size": {"minimum":1,"type":"integer"}
+ },
+ "required": ["arch","compute_units","memory_bytes","product","warp_size"],
+ "type": "object"
+ },
+ "driver_version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "framework": {
+ "additionalProperties": false,
+ "properties": {"kind":{"const":"torch"},"version":{"minLength":1,"type":"string"}},
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "machine": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "python_version": {"minLength":1,"type":"string"},
+ "vendor": {"enum":["nvidia","amd"]}
+ },
+ "required": [
+ "accelerator_runtime",
+ "collective_library",
+ "device",
+ "driver_version",
+ "framework",
+ "machine",
+ "python_version",
+ "vendor"
+ ],
+ "type": "object"
+ },
+ "sample_artifact": {
+ "additionalProperties": false,
+ "properties": {
+ "bytes": {"minimum":1,"type":"integer"},
+ "format": {"const":"collectivex.samples.v1"},
+ "path": {"pattern":"^[A-Za-z0-9_.-]+$","type":"string"},
+ "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["bytes","format","path","sha256"],
+ "type": "object"
+ },
+ "schema_version": {"const":1},
+ "topology": {
+ "additionalProperties": false,
+ "properties": {
+ "device_count": {"minimum":1,"type":"integer"},
+ "device_product": {"minLength":1,"type":"string"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "nodes": {"minimum":1,"type":"integer"},
+ "placement": {"const":"packed"},
+ "realized_placement": {
+ "additionalProperties": false,
+ "properties": {
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "nodes": {"minimum":1,"type":"integer"},
+ "ranks_per_node": {"minimum":1,"type":"integer"},
+ "unique_local_ranks": {"const":true},
+ "valid": {"const":true}
+ },
+ "required": ["gpus_per_node","nodes","ranks_per_node","unique_local_ranks","valid"],
+ "type": "object"
+ },
+ "scale_up_domain": {"minimum":1,"type":"integer"},
+ "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "world_size": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "device_count",
+ "device_product",
+ "gpus_per_node",
+ "nodes",
+ "placement",
+ "realized_placement",
+ "scale_up_domain",
+ "topology_class",
+ "transport",
+ "world_size"
+ ],
+ "type": "object"
+ },
+ "workload": {
+ "additionalProperties": false,
+ "properties": {
+ "activation_generator": {"const":"collectivex-activation-counter-v3"},
+ "activation_identity": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "activation_profile": {"const":"canonical-counter-source-v3"},
+ "cross_rank_consistent": {"const":true},
+ "manifest_checksums": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": {
+ "additionalProperties": false,
+ "properties": {
+ "topk_idx": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "topk_weights": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "trace": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["topk_idx", "topk_weights", "trace"],
+ "type": "object"
+ },
+ "type": "object"
+ }
+ ]
+ },
+ "members": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "items": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"},
+ "minItems": 1,
+ "uniqueItems": true,
+ "type": "array"
+ }
+ ]
+ },
+ "routing_generator": {"const":"collectivex-routing-counter-v3"},
+ "source": {"enum":["canonical-serialized","seeded-runtime"]},
+ "trace_hashes": {
+ "items": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "minItems": 1,
+ "type": "array"
+ },
+ "trace_signature": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "workload_id": {
+ "oneOf": [{"type":"null"},{"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}]
+ }
+ },
+ "required": [
+ "activation_generator",
+ "activation_identity",
+ "activation_profile",
+ "cross_rank_consistent",
+ "manifest_checksums",
+ "members",
+ "routing_generator",
+ "source",
+ "trace_hashes",
+ "trace_signature",
+ "workload_id"
+ ],
+ "type": "object"
+ }
+ },
+ "required": [
+ "case",
+ "format",
+ "generated_at",
+ "identity",
+ "implementation",
+ "measurement",
+ "outcome",
+ "provenance",
+ "record_type",
+ "runtime_fingerprint",
+ "sample_artifact",
+ "schema_version",
+ "topology",
+ "workload"
+ ],
+ "title": "CollectiveX raw case attempt v1",
+ "type": "object"
+}
diff --git a/experimental/CollectiveX/schemas/samples-v1.schema.json b/experimental/CollectiveX/schemas/samples-v1.schema.json
new file mode 100644
index 0000000000..b9a1df0541
--- /dev/null
+++ b/experimental/CollectiveX/schemas/samples-v1.schema.json
@@ -0,0 +1,80 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/samples-v1.schema.json",
+ "title": "CollectiveX exact private samples v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["allocation_id","attempt_id","case_id","format","points","sampling","schema_version","series_id"],
+ "properties": {
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "format": {"const": "collectivex.samples.v1"},
+ "points": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["components","evidence_id","point_id","sample_sha256","tokens_per_rank"],
+ "properties": {
+ "components": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["combine","dispatch","roundtrip"],
+ "properties": {
+ "combine": {"$ref": "#/$defs/component"},
+ "dispatch": {"$ref": "#/$defs/component"},
+ "roundtrip": {"$ref": "#/$defs/component"}
+ }
+ },
+ "evidence_id": {"$ref": "#/$defs/evidenceId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "sample_sha256": {"$ref": "#/$defs/sha256"},
+ "tokens_per_rank": {"type": "integer","minimum": 1}
+ }
+ }
+ },
+ "sampling": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["iterations_per_trial","reduction","trials"],
+ "properties": {
+ "iterations_per_trial": {"const": 8},
+ "reduction": {"const": "cross-rank-max-per-iteration"},
+ "trials": {"const": 64}
+ }
+ },
+ "schema_version": {"const": 1},
+ "series_id": {"$ref": "#/$defs/seriesId"}
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "component": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["availability","sample_count","trials"],
+ "properties": {
+ "availability": {"enum": ["measured","unavailable"]},
+ "sample_count": {"type": "integer","minimum": 0,"maximum": 512},
+ "trials": {
+ "oneOf": [
+ {"type": "null"},
+ {
+ "type": "array",
+ "minItems": 64,
+ "maxItems": 64,
+ "items": {"type": "array","minItems": 8,"maxItems": 8,"items": {"type": "number","minimum": 0}}
+ }
+ ]
+ }
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json
new file mode 100644
index 0000000000..9c28613064
--- /dev/null
+++ b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json
@@ -0,0 +1,246 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/terminal-outcome-v1.schema.json",
+ "title": "CollectiveX terminal outcome v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case","format","generated_at","identity","outcome","provenance","record_type","schema_version"],
+ "properties": {
+ "case": {"$ref": "#/$defs/case"},
+ "format": {"const": "collectivex.terminal.v1"},
+ "generated_at": {"type": "string","format": "date-time"},
+ "identity": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["allocation_factors","allocation_id","attempt_id","attempt_ordinal","case_factors","case_id"],
+ "properties": {
+ "allocation_factors": {"$ref": "#/$defs/allocationFactors"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "attempt_ordinal": {"type": "integer","minimum": 1},
+ "case_factors": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case","profile","sku"],
+ "properties": {
+ "case": {"$ref": "#/$defs/case"},
+ "profile": {
+ "const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }
+ },
+ "sku": {"$ref": "#/$defs/safeId"}
+ }
+ },
+ "case_id": {"$ref": "#/$defs/caseId"}
+ }
+ },
+ "outcome": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["failure_mode","reason","return_code","status"],
+ "properties": {
+ "failure_mode": {"$ref": "#/$defs/safeId"},
+ "reason": {"type": "string","minLength": 1,"maxLength": 240},
+ "return_code": {"type": "integer","minimum": 0},
+ "status": {"enum": ["failed","invalid","unsupported"]}
+ }
+ },
+ "provenance": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["git_run","control_sha256","redaction","source"],
+ "properties": {
+ "git_run": {"$ref": "#/$defs/gitRun"},
+ "control_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "redaction": {"const": "sanitized-v1"},
+ "source": {
+ "enum": [
+ "runtime-emitter",
+ "post-emit-command",
+ "matrix-capability-resolver"
+ ]
+ }
+ }
+ },
+ "record_type": {"const": "terminal-outcome"},
+ "schema_version": {"const": 1}
+ },
+ "allOf": [
+ {
+ "oneOf": [
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "runtime-emitter"}}
+ },
+ "outcome": {"$ref": "#/$defs/runtimeOutcome"}
+ }
+ },
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "post-emit-command"}}
+ },
+ "outcome": {"$ref": "#/$defs/postEmitOutcome"}
+ }
+ },
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "matrix-capability-resolver"}}
+ },
+ "outcome": {"$ref": "#/$defs/capabilityOutcome"}
+ }
+ }
+ ]
+ }
+ ],
+ "$defs": {
+ "runtimeOutcome": {
+ "type": "object",
+ "properties": {"status": {"const": "failed"}},
+ "allOf": [
+ {
+ "oneOf": [
+ {"properties": {"failure_mode": {"const": "setup"}, "reason": {"const": "launcher-setup-failed"}}},
+ {"properties": {"failure_mode": {"const": "repository-stage"}, "reason": {"const": "repository-staging-failed"}}},
+ {"properties": {"failure_mode": {"const": "registry-verification"}, "reason": {"const": "container-registry-verification-failed"}}},
+ {"properties": {"failure_mode": {"const": "scheduler-allocation"}, "reason": {"const": "scheduler-allocation-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-import"}, "reason": {"const": "container-image-preparation-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-hash"}, "reason": {"const": "container-image-identity-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-launch"}, "reason": {"const": "container-runtime-launch-failed"}}},
+ {"properties": {"failure_mode": {"const": "backend-setup"}, "reason": {"const": "backend-setup-failed"}}},
+ {"properties": {"failure_mode": {"const": "artifact-collection"}, "reason": {"const": "artifact-collection-failed"}}},
+ {"properties": {"failure_mode": {"const": "runtime-identity"}, "reason": {"const": "runtime-identity-mismatch"}}},
+ {"properties": {"failure_mode": {"const": "timeout"}, "reason": {"const": "execution-timeout"}}},
+ {"properties": {"failure_mode": {"const": "deadlock"}, "reason": {"const": "execution-deadlock"}}},
+ {"properties": {"failure_mode": {"const": "execution"}, "reason": {"const": "distributed-command-failed"}}}
+ ]
+ }
+ ]
+ },
+ "postEmitOutcome": {
+ "type": "object",
+ "properties": {
+ "status": {"const": "failed"},
+ "failure_mode": {"enum": ["runtime-identity", "timeout", "deadlock", "execution"]},
+ "reason": {"const": "post-emit-distributed-command-failed"}
+ }
+ },
+ "capabilityOutcome": {
+ "type": "object",
+ "properties": {
+ "status": {"const": "unsupported"},
+ "failure_mode": {"const": "capability"},
+ "reason": {
+ "enum": [
+ "backend-platform-unsupported",
+ "backend-token-capacity"
+ ]
+ }
+ }
+ },
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "nullableText": {"oneOf": [{"type": "null"},{"type": "string","minLength": 1}]},
+ "allocationFactors": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"],
+ "properties": {
+ "artifact": {"$ref": "#/$defs/nullableText"},
+ "execution_id": {"$ref": "#/$defs/nullableText"},
+ "job": {"$ref": "#/$defs/nullableText"},
+ "repo": {"$ref": "#/$defs/nullableText"},
+ "run_attempt": {"$ref": "#/$defs/nullableText"},
+ "run_id": {"$ref": "#/$defs/nullableText"},
+ "runner": {"$ref": "#/$defs/nullableText"},
+ "source_sha": {"$ref": "#/$defs/nullableText"}
+ }
+ },
+ "gitRun": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"],
+ "properties": {
+ "artifact": {"type": "string","minLength": 1},
+ "job": {"type": "string","minLength": 1},
+ "ref": {"type": "string","minLength": 1},
+ "repo": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"},
+ "run_attempt": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"}
+ }
+ },
+ "case": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "backend",
+ "canonical",
+ "eplb",
+ "ep",
+ "experts",
+ "gpus_per_node",
+ "hidden",
+ "ladder",
+ "nodes",
+ "phase",
+ "required_publication",
+ "routing",
+ "samples_per_point",
+ "scale_up_domain",
+ "suite",
+ "timing",
+ "topk",
+ "warmup_semantics",
+ "workload"
+ ],
+ "properties": {
+ "backend": {"$ref": "#/$defs/safeId"},
+ "canonical": {"const": true},
+ "eplb": {"type": "boolean"},
+ "ep": {"type": "integer","minimum": 1},
+ "experts": {"type": "integer","minimum": 1},
+ "gpus_per_node": {"type": "integer","minimum": 1},
+ "hidden": {"type": "integer","minimum": 1},
+ "ladder": {"type": "string","pattern": "^[1-9][0-9]*( [1-9][0-9]*)*$"},
+ "nodes": {"type": "integer","minimum": 1},
+ "phase": {"enum": ["decode","prefill"]},
+ "required_publication": {"enum": ["official","comparable-experimental"]},
+ "routing": {"enum": ["uniform","zipf"]},
+ "samples_per_point": {"const": 512},
+ "scale_up_domain": {"type": "integer","minimum": 1},
+ "suite": {"$ref": "#/$defs/safeId"},
+ "timing": {"const": "8:64:32"},
+ "topk": {"type": "integer","minimum": 1},
+ "warmup_semantics": {"const": "full-roundtrip-before-each-component-trial-point-v1"},
+ "workload": {"$ref": "#/$defs/safeId"}
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 0000000000..3752db6b9d
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Render a small native-v1 shard summary and gate on a successful case."""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import contracts
+
+
+def load_results(directory: str, runner: str | None, timestamp: str | None) -> list[dict]:
+    """Collect the validated raw-attempt and terminal-outcome documents in ``directory``."""
+    loaded: list[dict] = []
+    for path in sorted(Path(directory).glob("*.json")):
+        other_runner = bool(runner) and not path.name.startswith(f"{runner}_")
+        if other_runner or (timestamp and timestamp not in path.name):
+            continue
+        try:  # unreadable or contract-violating files are skipped, not fatal
+            document = contracts.strict_load(path)
+            if document.get("format") == contracts.RAW_FORMAT:
+                loaded.append(contracts.load_raw_attempt(path))
+            elif document.get("format") == contracts.TERMINAL_FORMAT:
+                loaded.append(contracts.validate_terminal_document(document))
+        except (contracts.ContractError, OSError):
+            continue
+    return loaded
+
+
+def _identity(document: dict) -> tuple[str, str, str, str, bool, str, int]:
+    """Build the tuple used to sort and label one outcome document.
+
+    Fields, in order: sku, suite, routing, phase, eplb, required publication, EP size.
+    """
+    case = document["case"]
+    is_raw = document["format"] == contracts.RAW_FORMAT
+    # Raw attempts nest these two values; other documents store them flat on the case.
+    routing = case["shape"]["routing"] if is_raw else case["routing"]
+    eplb = case["eplb"]["enabled"] if is_raw else case["eplb"]
+    sku = document["identity"]["case_factors"]["sku"]
+    ep = case.get("ep_size", case.get("ep", 0))
+    return sku, case["suite"], routing, case["phase"], eplb, case["required_publication"], ep
+
+
+def _headline(document: dict) -> tuple[int | str, float | str, float | str]:
+    if document["format"] != contracts.RAW_FORMAT:
+        return "-", "-", "-"  # non-raw documents get placeholder dashes instead of numbers
+    rows = document["measurement"]["rows"]  # NOTE(review): assumes rows is non-empty; the fallback below indexes it eagerly
+    row = next((item for item in rows if item["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    latency = row["components"]["roundtrip"]["percentiles_us"]  # row is the tokens_per_rank == 64 entry if present, else the middle one
+    return row["tokens_per_rank"], latency["p50"], latency["p99"]
+
+
+def render(documents: list[dict], markdown: bool) -> str:
+    documents = sorted(documents, key=_identity)  # order rows by the _identity tuple
+    if markdown:  # Markdown table output
+        lines = [
+            "## CollectiveX EP results", "",
+            "| sku | backend | suite | phase | routing | tier | ep | outcome | T* | p50 us | p99 us |",
+            "|---|---|---|---|---|---|--:|---|--:|--:|--:|",
+        ]
+        for document in documents:
+            sku, suite, routing, phase, eplb, tier, ep = _identity(document)
+            backend = document["case"]["backend"]
+            token, p50, p99 = _headline(document)
+            lines.append(
+                f"| {sku} | `{backend}` | {suite} | {phase} | "
+                f"{routing}{'+eplb' if eplb else ''} | {tier} | {ep} | "
+                f"{document['outcome']['status']} | {token} | {p50} | {p99} |"
+            )
+        if not documents:  # keep the table header and append a note about the empty result set
+            lines.append("\n> No valid native v1 outcome documents found.")
+        return "\n".join(lines)
+    lines = ["CollectiveX EP results", "======================"]  # plain-text report header
+    for document in documents:
+        sku, suite, routing, phase, eplb, tier, ep = _identity(document)
+        backend = document["case"]["backend"]
+        token, _, p99 = _headline(document)  # the text report omits p50
+        lines.append(
+            f"  {sku:<10} {backend:<16} {suite:<13} {phase:<7} "
+            f"{routing}{'+eplb' if eplb else ''} {tier} ep{ep} "
+            f"{document['outcome']['status']} T={token} roundtrip_p99_us={p99}"
+        )
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """Print the summary and return the exit status (text mode gates on one success)."""
+    cli = argparse.ArgumentParser(description="Summarize CollectiveX native v1 outcomes")
+    cli.add_argument("--results-dir", default="results")
+    cli.add_argument("--runner")
+    cli.add_argument("--ts")
+    cli.add_argument("--markdown", action="store_true")
+    options = cli.parse_args()
+    documents = load_results(options.results_dir, options.runner, options.ts)
+    print(render(documents, options.markdown))
+    # Markdown rendering always exits 0; text mode needs a successful raw attempt.
+    return 0 if options.markdown or any(
+        document["format"] == contracts.RAW_FORMAT
+        and document["outcome"]["status"] == "success"
+        for document in documents
+    ) else 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py
new file mode 100644
index 0000000000..17aa80c94a
--- /dev/null
+++ b/experimental/CollectiveX/sweep_matrix.py
@@ -0,0 +1,974 @@
+#!/usr/bin/env python3
+"""Resolve CollectiveX v1 suites and extract validated execution shards.
+
+The promoted v1 profile is intentionally narrow: normal-mode BF16,
+layout-and-dispatch-v1, tuned resources, and unquantized BF16 combine. Those
+constants are runtime defaults, not matrix axes. The matrix contains only the
+dimensions that vary between scheduled cases.
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import hashlib
+import itertools
+import json
+import os
+from pathlib import Path
+import re
+import sys
+from typing import Any
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+sys.path.insert(0, str(HERE / "tests"))
+
+try: # Shard extraction on GPU runners is intentionally stdlib-only.
+ import yaml # type: ignore
+except ModuleNotFoundError: # pragma: no cover - exercised by the workflow environment
+ yaml = None
+
+import capability as cap # noqa: E402
+import contracts # noqa: E402
+import ep_harness # noqa: E402
+import identity # noqa: E402
+
+
+EP_TIMING_PROFILE = (  # "<timed iters per trial>:<trials per point>:<warmup iters per trial>"
+    f"{ep_harness.TIMED_ITERS_PER_TRIAL}:{ep_harness.TRIALS_PER_POINT}:"
+    f"{ep_harness.WARMUP_ITERS_PER_TRIAL}"
+)
+V1_PROFILE = dict(identity.V1_CASE_PROFILE)  # module-local copy of the identity module's case profile
+V1_WORKLOAD = ("deepseek-v3-v1", 7168, 8, 256)  # (workload, hidden, topk, experts), as compared in _v1_requested_ladder
+V1_SUITE_CONTRACTS = {  # suite -> publication tier, allowed (phase, routing, eplb) coordinates, per-phase ladders
+    "ep-core-v1": {
+        "publication": "official",
+        "coordinates": {("decode", "uniform", False), ("prefill", "uniform", False)},
+        "ladders": {
+            "decode": tuple(ep_harness.DECODE_LADDER),
+            "prefill": (256, 512),
+        },
+    },
+    "ep-routing-v1": {
+        "publication": "comparable-experimental",
+        "coordinates": {
+            ("decode", "zipf", False), ("decode", "zipf", True),
+            ("prefill", "zipf", False), ("prefill", "zipf", True),
+        },
+        "ladders": {"decode": (128,), "prefill": (512,)},
+    },
+}
+IDENTIFIER = re.compile(r"[a-z0-9][a-z0-9.-]*")  # applied with fullmatch to workload and suite names
+SUITE_FIELDS = {  # every key a suite entry may contain
+    "ep_degrees", "eplb", "phases", "platforms", "required_publication", "routings", "token_points",
+    "token_points_decode", "token_points_prefill", "workloads",
+}
+SUITE_REQUIRED = {  # keys every suite entry must contain
+    "phases", "platforms", "required_publication", "routings", "workloads",
+}
+
+
+class MatrixError(ValueError):
+    """Raised when a matrix or shard-control document violates the execution contract."""
+
+
+if yaml is not None:  # these helpers exist only when the PyYAML import above succeeded
+    class _UniqueKeyLoader(yaml.SafeLoader):  # SafeLoader subclass that gets the constructor registered below
+        pass
+
+    def _unique_mapping(loader: Any, node: Any, deep: bool = False) -> dict[Any, Any]:
+        result: dict[Any, Any] = {}  # NOTE(review): merge keys ("<<") are not flattened here - confirm configs never use them
+        for key_node, value_node in node.value:  # one (key node, value node) pair per mapping entry
+            key = loader.construct_object(key_node, deep=deep)
+            if key in result:  # a repeated key aborts the load instead of overwriting the first value
+                raise SystemExit(f"duplicate YAML key {key!r} at line {key_node.start_mark.line + 1}")
+            result[key] = loader.construct_object(value_node, deep=deep)
+        return result
+
+    _UniqueKeyLoader.add_constructor(  # build default-tagged mappings with _unique_mapping
+        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _unique_mapping
+    )
+
+
+def _load(name: str) -> dict[str, Any]:  # read configs/<name> next to this module as a YAML mapping
+    if yaml is None:  # PyYAML could not be imported at module load
+        raise SystemExit("matrix generation requires PyYAML; shard extraction does not")
+    try:
+        with (HERE / "configs" / name).open() as fh:
+            document = yaml.load(fh, Loader=_UniqueKeyLoader)  # SafeLoader subclass that rejects duplicate keys
+    except yaml.YAMLError as exc:  # NOTE(review): a missing or unreadable file raises OSError, which is not handled here
+        raise SystemExit(f"configs/{name} is not valid YAML: {exc}") from exc
+    if not isinstance(document, dict):  # the top level must be a mapping
+        raise SystemExit(f"configs/{name} must contain a YAML object")
+    return document
+
+
+def _workload_registry(workloads: dict[str, Any]) -> dict[str, dict[str, Any]]:
+    """Flatten the ``synthetic`` and ``model_derived`` sections into one name -> config map."""
+    merged: dict[str, dict[str, Any]] = {}
+    for section in ("synthetic", "model_derived"):  # later sections win on a name clash
+        merged.update((workloads.get(section) or {}).items())
+    return merged
+
+
+def _fields(value: Any, path: str, allowed: set[str], required: set[str]) -> dict[str, Any]:
+    if not isinstance(value, dict):  # only a mapping can carry named fields
+        raise SystemExit(f"{path} must be an object")
+    if any(not isinstance(key, str) for key in value):
+        raise SystemExit(f"{path} field names must be strings")
+    unknown, missing = set(value) - allowed, required - set(value)  # keys outside allowed / required keys absent
+    if unknown or missing:  # both problems are reported in a single message
+        raise SystemExit(f"{path} fields: unknown={sorted(unknown)}, missing={sorted(missing)}")
+    return value  # the same object, returned once its key set is accepted
+
+
+def _list(value: Any, path: str, item_type: type, allowed: set[Any] | None = None) -> list[Any]:
+    if (not isinstance(value, list) or not value  # must be a non-empty list
+            or any(type(item) is not item_type for item in value)  # exact type match, so a bool never passes as an int
+            or len(value) != len(set(value))  # no repeated items
+            or (allowed is not None and any(item not in allowed for item in value))):  # optional whitelist
+        raise SystemExit(f"{path} must be a non-empty unique list of valid {item_type.__name__}s")
+    return value  # the validated list itself, not a copy
+
+
+def validate_config_documents(
+    suites_document: dict[str, Any], workloads: dict[str, Any]
+) -> None:
+    """Reject configuration that is ambiguous, unused, or outside the v1 grid."""
+    _fields(
+        suites_document, "configs/suites.yaml",
+        {"schema_version", "suites"}, {"schema_version", "suites"},
+    )
+    _fields(
+        workloads, "configs/workloads.yaml",
+        {"schema_version", "synthetic", "model_derived"}, {"schema_version"},  # both sections are optional keys
+    )
+    if type(suites_document["schema_version"]) is not int or suites_document["schema_version"] != 1:
+        raise SystemExit("configs/suites.yaml schema_version must be integer 1")
+    if type(workloads["schema_version"]) is not int or workloads["schema_version"] != 1:
+        raise SystemExit("configs/workloads.yaml schema_version must be integer 1")
+    registry: dict[str, dict[str, Any]] = {}  # workload name -> config, across both sections
+    for section, expert_field in (  # each section names its expert-count field differently
+        ("synthetic", "experts"),
+        ("model_derived", "routed_experts"),
+    ):
+        entries = workloads.get(section, {})
+        if not isinstance(entries, dict):  # an explicit null section is rejected as well
+            raise SystemExit(f"workloads.{section} must be an object")
+        for name, value in entries.items():
+            if not isinstance(name, str) or not IDENTIFIER.fullmatch(name) or name in registry:
+                raise SystemExit(f"workloads.{section} has invalid or duplicate name {name!r}")
+            fields = {"hidden", "topk", expert_field, "verified_against"}  # verified_against is the only optional one
+            config = _fields(value, f"workload {name}", fields, fields - {"verified_against"})
+            dimensions = [config[key] for key in ("hidden", "topk", expert_field)]
+            if any(type(item) is not int or item <= 0 for item in dimensions):
+                raise SystemExit(f"workload {name} dimensions must be positive integers")
+            if dimensions[1] > dimensions[2]:  # topk larger than the expert count
+                raise SystemExit(f"workload {name}.topk exceeds its expert count")
+            source = config.get("verified_against")
+            if source is not None and (not isinstance(source, str) or not source.strip()):
+                raise SystemExit(f"workload {name}.verified_against must be a non-empty string")
+            registry[name] = config
+    if not registry:
+        raise SystemExit("configs/workloads.yaml must define at least one workload")
+
+    suites = suites_document["suites"]
+    if not isinstance(suites, dict) or not suites:
+        raise SystemExit("configs/suites.yaml suites must be a non-empty object")
+    referenced: set[str] = set()  # workloads used by at least one suite
+    for name, value in suites.items():
+        if not isinstance(name, str) or not IDENTIFIER.fullmatch(name):
+            raise SystemExit(f"invalid suite name {name!r}")
+        suite = _fields(value, f"suite {name}", SUITE_FIELDS, SUITE_REQUIRED)
+        suite_workloads = _list(suite["workloads"], f"suite {name}.workloads", str)
+        unknown = sorted(set(suite_workloads) - set(registry))
+        if unknown:
+            raise SystemExit(f"suite {name}: unknown workloads {unknown}")
+        referenced.update(suite_workloads)
+        platforms = _list(
+            suite["platforms"], f"suite {name}.platforms", str, set(cap.PLATFORMS)
+        )
+        phases = _list(suite["phases"], f"suite {name}.phases", str, {"decode", "prefill"})
+        routings = _list(suite["routings"], f"suite {name}.routings", str, {"uniform", "zipf"})
+        eplb = _list(suite.get("eplb", [False]), f"suite {name}.eplb", bool)
+        if True in eplb and routings != ["zipf"]:  # EPLB needs routings to be exactly ["zipf"]
+            raise SystemExit(f"suite {name}: EPLB is only valid for Zipf routing")
+        if suite["required_publication"] not in {"official", "comparable-experimental"}:
+            raise SystemExit(f"suite {name}.required_publication is invalid")
+        if suite["required_publication"] == "official":
+            unverified = [item for item in suite_workloads if not registry[item].get("verified_against")]
+            if unverified:
+                raise SystemExit(f"suite {name}: official workloads need verified_against: {unverified}")
+        if "ep_degrees" in suite:
+            degrees = _list(suite["ep_degrees"], f"suite {name}.ep_degrees", int)
+            if any(degree <= 0 for degree in degrees):
+                raise SystemExit(f"suite {name}.ep_degrees must be positive")
+            for platform in platforms:
+                if not set(degrees).issubset(cap.PLATFORMS[platform]["ep_degrees"]):
+                    raise SystemExit(f"suite {name}: invalid EP degree for {platform}")
+        for phase in {"decode", "prefill"} - set(phases):  # phases this suite does not run
+            if f"token_points_{phase}" in suite:
+                raise SystemExit(f"suite {name}.token_points_{phase} is unreachable")
+        if "token_points" in suite and all(  # every phase has its own ladder, so the shared one is never read
+            f"token_points_{phase}" in suite for phase in phases
+        ):
+            raise SystemExit(f"suite {name}.token_points is unreachable")
+        for phase in phases:
+            _ladder(suite, phase)  # called for its validation; the returned string is discarded
+    unused = sorted(set(registry) - referenced)
+    if unused:
+        raise SystemExit(f"unreferenced workloads: {unused}")
+
+
+def _dims(workloads: dict[str, Any], name: str) -> tuple[int, int, int]:
+    """Look up the ``(hidden, topk, experts)`` dimensions of workload ``name``.
+
+    ``experts`` falls back to ``routed_experts`` when the ``experts`` key is absent.
+    """
+    entry = _workload_registry(workloads)[name]
+    experts = entry.get("experts", entry.get("routed_experts"))
+    return entry.get("hidden"), entry.get("topk"), experts  # type: ignore[return-value]
+
+
+def _ladder(suite: dict[str, Any], phase: str) -> str:  # space-separated token ladder for one phase
+    points = suite.get(f"token_points_{phase}", suite.get("token_points"))  # the phase-specific key wins
+    if points is None:  # neither key set (or set to null): use the harness default for the phase
+        points = ep_harness.DECODE_LADDER if phase == "decode" else ep_harness.PREFILL_LADDER
+    if (not isinstance(points, list) or not points
+            or any(isinstance(point, bool) or not isinstance(point, int) or point <= 0
+                   for point in points)  # positive ints only; bools are rejected explicitly
+            or points != sorted(set(points))):  # must be strictly increasing
+        raise SystemExit(f"invalid {phase} token ladder: {points!r}")
+    return " ".join(map(str, points))
+
+
+def _v1_requested_ladder(case: dict[str, Any]) -> str:
+    """Return the frozen v1 ladder for ``case``; raise if the case is outside the catalog."""
+    suite = V1_SUITE_CONTRACTS.get(case.get("suite"))
+    coordinate = (case.get("phase"), case.get("routing"), case.get("eplb"))
+    workload = tuple(case.get(key) for key in ("workload", "hidden", "topk", "experts"))
+    in_catalog = (
+        suite is not None
+        and coordinate in suite["coordinates"]
+        and case.get("required_publication") == suite["publication"]
+        and workload == V1_WORKLOAD
+    )
+    if not in_catalog:
+        raise MatrixError("case differs from the frozen v1 suite/workload catalog")
+    return " ".join(map(str, suite["ladders"][case["phase"]]))
+
+
+def _expected_disposition(sku: str, case: dict[str, Any]) -> tuple[str, str | None, str | None]:
+    """Return ``(disposition, reason, detail)`` for a case checked against the v1 catalog."""
+    frozen_ladder = _v1_requested_ladder(case)
+    supported, detail = cap.resolve(
+        sku, case["backend"], nodes=case["nodes"], routing=case["routing"], eplb=case["eplb"]
+    )
+    # The ladder must match the catalog either way; only the error wording differs.
+    if case["ladder"] != frozen_ladder:
+        raise MatrixError(
+            "case ladder differs from the frozen v1 suite catalog" if supported
+            else "unsupported case ladder differs from the frozen v1 suite catalog"
+        )
+    if supported:
+        return "runnable", None, None
+    return "unsupported", "backend-platform-unsupported", detail
+
+
+def _case_id(sku: str, case: dict[str, Any]) -> str:
+    return identity.case_id(sku=sku, profile=V1_PROFILE, case=case)  # delegates to identity.case_id with the module-level V1_PROFILE
+
+
+def _semantic_points(sku: str, case: dict[str, Any]) -> list[str]:
+    """Return one canonical JSON signature per token point on the case's ladder.
+
+    Each signature holds ``sku``, ``tokens_per_rank`` and every case key not in ``skipped``.
+    """
+    skipped = {"canonical", "case_id", "ladder", "required_publication", "suite", "workload"}
+    execution = {key: value for key, value in case.items() if key not in skipped}
+    signatures: list[str] = []
+    for point in case["ladder"].split():
+        payload = {"sku": sku, "tokens_per_rank": int(point), **execution}
+        # Sorted keys and compact separators make equal payloads serialise identically.
+        signatures.append(json.dumps(payload, sort_keys=True, separators=(",", ":")))
+    return signatures
+
+
+def _select_backends(backend: str, backends: str) -> list[str]:
+    """Resolve --backend/--backends to a non-empty list of unique, known backends."""
+    available = list(cap.SWEEP_BACKENDS)
+    if backend and backends:
+        raise SystemExit("--backend and --backends are mutually exclusive")
+    requested = [value.strip() for value in (backends or "").split(",") if value.strip()]
+    names = available if backends == "all" else requested if backends else [backend or "deepep"]
+    # A selection such as "," has no names; fail instead of emitting an empty matrix.
+    if not names:
+        raise SystemExit("backend selection must be non-empty")
+    unknown = sorted(set(names) - set(available))
+    if unknown:
+        raise SystemExit(f"unknown backend values {unknown}; have {available}")
+    if len(names) != len(set(names)):
+        raise SystemExit("backend selection contains duplicates")
+    return names
+
+
+def resolve_matrix(
+    suites: str = "all",
+    backend: str = "",
+    backends: str = "",
+    only_sku: str = "",
+    min_nodes: int = 0,
+    max_nodes: int = 0,
+    max_cases: int = 128,
+) -> dict[str, Any]:
+    """Resolve suite configuration into allocation-sized workflow shards."""
+    if max_cases <= 0:
+        raise SystemExit("--max-cases must be positive")
+    if min_nodes < 0 or max_nodes < 0 or (min_nodes and max_nodes and min_nodes > max_nodes):
+        raise SystemExit("invalid node bounds")
+    if only_sku and only_sku not in cap.PLATFORMS:
+        raise SystemExit(f"unknown --only-sku {only_sku!r}; have {sorted(cap.PLATFORMS)}")
+
+    workloads = _load("workloads.yaml")
+    suites_document = _load("suites.yaml")
+    validate_config_documents(suites_document, workloads)
+    registry = suites_document["suites"]
+    names = list(registry) if suites == "all" else [  # "all" expands to every configured suite
+        value.strip() for value in suites.split(",") if value.strip()
+    ]
+    if not names or len(names) != len(set(names)):
+        raise SystemExit("suite selection must be non-empty and unique")
+    unknown = sorted(set(names) - set(registry))
+    if unknown:
+        raise SystemExit(f"unknown suites {unknown}; have {sorted(registry)}")
+    targets = _select_backends(backend, backends)
+
+    shards: dict[tuple[str, str, int], list[dict[str, Any]]] = {}  # (sku, backend, nodes) -> runnable cases
+    requested_cases: list[dict[str, Any]] = []  # every case, runnable or unsupported
+    scheduled: set[str] = set()  # semantic point signatures seen so far
+    for suite_name in names:
+        suite = registry[suite_name]
+        phases = suite["phases"]
+        routings = suite["routings"]
+        eplb_values = suite.get("eplb", [False])
+        for platform_name in suite["platforms"]:
+            platform = cap.PLATFORMS[platform_name]
+            if only_sku and platform_name != only_sku:
+                continue
+            gpus_per_node = int(platform["gpus_per_node"])
+            scale_up_domain = int(platform["scale_up_domain"])
+            ep_degrees = suite.get("ep_degrees") or platform["ep_degrees"]  # a suite override wins over the platform list
+            for workload, ep, phase, routing, eplb, target in itertools.product(
+                suite["workloads"], ep_degrees, phases, routings, eplb_values, targets
+            ):
+                if ep not in platform["ep_degrees"]:
+                    raise SystemExit(
+                        f"suite {suite_name}: {platform_name} EP{ep} is not registered"
+                    )
+                nodes_int = (ep + gpus_per_node - 1) // gpus_per_node  # ceil(ep / gpus_per_node)
+                if min_nodes and nodes_int < min_nodes:  # a bound of 0 disables that side of the filter
+                    continue
+                if max_nodes and nodes_int > max_nodes:
+                    continue
+                ok, capability_detail = cap.resolve(
+                    platform_name,
+                    target,
+                    nodes=nodes_int,
+                    routing=routing,
+                    eplb=bool(eplb),
+                )
+                hidden, topk, experts = _dims(workloads, workload)
+                nodes = nodes_int
+
+                def add_case(  # closure over the current loop variables; called right away below
+                    case_ladder: str,
+                    disposition: str,
+                    reason: str | None,
+                    detail: str | None,
+                ) -> None:
+                    case: dict[str, Any] = {
+                        "suite": suite_name,
+                        "workload": workload,
+                        "required_publication": suite["required_publication"],
+                        "backend": target,
+                        "routing": routing,
+                        "phase": phase,
+                        "ep": ep,
+                        "eplb": eplb,
+                        "hidden": hidden,
+                        "topk": topk,
+                        "experts": experts,
+                        "samples_per_point": ep_harness.TIMED_SAMPLES_PER_POINT,
+                        "warmup_semantics": ep_harness.WARMUP_SEMANTICS,
+                        "ladder": case_ladder,
+                        "timing": EP_TIMING_PROFILE,
+                        "canonical": True,
+                        "nodes": nodes,
+                        "gpus_per_node": gpus_per_node,
+                        "scale_up_domain": scale_up_domain,
+                    }
+                    for signature in _semantic_points(platform_name, case):
+                        if signature in scheduled:  # the same point was already produced by an earlier case
+                            raise SystemExit(
+                                f"suite {suite_name}: duplicate semantic point for {platform_name}"
+                            )
+                        scheduled.add(signature)
+                    case["case_id"] = _case_id(platform_name, case)  # added after the signatures were computed
+                    requested_cases.append(
+                        {
+                            "sku": platform_name,
+                            "case": case,
+                            "disposition": disposition,
+                            "reason": reason,
+                            "detail": detail,
+                        }
+                    )
+                    if disposition == "runnable":  # unsupported cases are recorded but never sharded
+                        shards.setdefault((platform_name, target, nodes), []).append(case)
+
+                requested_ladder = _ladder(suite, phase)
+                if not ok:
+                    add_case(
+                        requested_ladder,
+                        "unsupported",
+                        "backend-platform-unsupported",
+                        capability_detail,
+                    )
+                    continue
+                add_case(requested_ladder, "runnable", None, None)
+
+    shards_by_sku: dict[str, list[dict[str, Any]]] = {}
+    for (sku, target, nodes), cases in sorted(shards.items()):
+        chunk_size = max_cases  # at most max_cases cases per shard
+        for offset in range(0, len(cases), chunk_size):
+            chunk = cases[offset:offset + chunk_size]
+            part = offset // chunk_size
+            shard_id = f"{sku}-{target}-n{nodes}"
+            if len(cases) > chunk_size:  # the -p<part> suffix appears only when the group is split
+                shard_id += f"-p{part}"
+            shards_by_sku.setdefault(sku, []).append({
+                "id": shard_id,
+                "sku": sku,
+                "backend": target,
+                "launcher": cap.PLATFORMS[sku]["launcher"],
+                "gpus_per_node": cap.PLATFORMS[sku]["gpus_per_node"],
+                "scale_up_domain": cap.PLATFORMS[sku]["scale_up_domain"],
+                "nodes": nodes,
+                "n": len(chunk),
+                "case_ids": [case["case_id"] for case in chunk],
+            })
+    include = [  # round-robin across SKUs: the first shard of each SKU, then the second, and so on
+        shards_by_sku[sku][round_index]
+        for round_index in range(max(map(len, shards_by_sku.values()), default=0))
+        for sku in sorted(shards_by_sku)
+        if round_index < len(shards_by_sku[sku])
+    ]
+    return {
+        "format": "collectivex.matrix.v1",
+        "schema_version": 1,
+        "requested_cases": requested_cases,
+        "include": include,
+    }
+
+
+def _strict_json_load(path: Path) -> Any:
+ def reject_constant(value: str) -> None:
+ raise MatrixError(f"non-finite JSON number {value}")
+
+ def reject_duplicates(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
+ result: dict[str, Any] = {}
+ for key, value in pairs:
+ if key in result:
+ raise MatrixError(f"duplicate JSON key {key!r}")
+ result[key] = value
+ return result
+
+ if not path.is_file():
+ raise MatrixError(f"matrix does not exist: {path}")
+ if path.stat().st_size == 0:
+ raise MatrixError(f"matrix is empty: {path}")
+ try:
+ with path.open() as fh:
+ return json.load(
+ fh, parse_constant=reject_constant, object_pairs_hook=reject_duplicates
+ )
+ except (OSError, json.JSONDecodeError) as exc:
+ raise MatrixError(f"matrix is not valid JSON: {exc}") from exc
+
+
+def _positive_int(value: Any, field: str) -> int:
+ if type(value) is not int:
+ raise MatrixError(f"{field} must be a positive integer")
+ if value <= 0:
+ raise MatrixError(f"{field} must be a positive integer")
+ return value
+
+
+def validate_shard_control(
+    shard: dict[str, Any],
+    *,
+    sku: str,
+    backend: str,
+    nodes: int,
+    require_runnable: bool = True,
+) -> None:
+    """Validate one shard against the workflow cell that requested it.
+
+    Args:
+        shard: Decoded shard control document (``schema_version``, ``id``,
+            ``sku``, ``backend``, ``nodes``, ``n``, ``cases``).
+        sku: Platform the calling cell expects; must be a key of ``cap.PLATFORMS``.
+        backend: Backend the calling cell expects; must be in ``cap.SWEEP_BACKENDS``.
+        nodes: Node count the calling cell expects.
+        require_runnable: When true, every case must resolve to the ``runnable``
+            disposition; otherwise only ``_v1_requested_ladder`` is applied to it.
+
+    Raises:
+        MatrixError: On the first violation found. Checks run in a fixed order,
+            so the order determines which message a malformed shard produces.
+    """
+    if not isinstance(shard, dict):
+        raise MatrixError("shard must be a JSON object")
+    if sku not in cap.PLATFORMS or backend not in cap.SWEEP_BACKENDS:
+        raise MatrixError("shard platform/backend is not registered")
+    # The top-level key set must match exactly: no missing and no extra fields.
+    top_fields = {"schema_version", "id", "sku", "backend", "nodes", "n", "cases"}
+    if (
+        set(shard) != top_fields
+        or type(shard.get("schema_version")) is not int
+        or shard["schema_version"] != 1
+    ):
+        raise MatrixError("shard fields or schema version differ from v1 contract")
+    if not isinstance(shard.get("id"), str) or not IDENTIFIER.fullmatch(shard["id"]):
+        raise MatrixError("shard has invalid id")
+    # The shard must describe the same cell the caller asked for.
+    for field, expected in (("sku", sku), ("backend", backend)):
+        if shard.get(field) != expected:
+            raise MatrixError(
+                f"shard {field} mismatch: expected {expected!r}, got {shard.get(field)!r}"
+            )
+    if _positive_int(shard.get("nodes"), "shard.nodes") != nodes:
+        raise MatrixError(
+            f"shard nodes mismatch: expected {nodes}, got {shard.get('nodes')!r}"
+        )
+    cases = shard.get("cases")
+    if not isinstance(cases, list) or not cases:
+        raise MatrixError("shard must contain at least one case")
+    if _positive_int(shard.get("n"), "shard.n") != len(cases):
+        raise MatrixError("shard.n does not match the number of cases")
+    # case_ids already seen in this shard, used to reject duplicates.
+    seen: set[str] = set()
+    # Exact key set every case must carry.
+    required = {
+        "case_id", "suite", "workload", "required_publication", "backend", "routing",
+        "phase", "ep", "eplb", "hidden", "topk", "experts", "samples_per_point",
+        "warmup_semantics", "ladder", "timing", "canonical", "nodes",
+        "gpus_per_node", "scale_up_domain",
+    }
+    for index, case in enumerate(cases):
+        if not isinstance(case, dict):
+            raise MatrixError(f"case {index} must be a JSON object")
+        fields = set(case)
+        if fields != required:
+            raise MatrixError(
+                f"case {index} fields differ from v1 contract: "
+                f"missing={sorted(required - fields)}, extra={sorted(fields - required)}"
+            )
+        case_id = case["case_id"]
+        if not identity.is_typed_id(case_id, "case"):
+            raise MatrixError(f"case {index} has invalid case_id")
+        if case_id in seen:
+            raise MatrixError(f"duplicate case_id {case_id}")
+        seen.add(case_id)
+        # String-typed fields must be non-empty strings ...
+        for field in ("suite", "workload", "required_publication", "backend", "routing", "phase",
+                      "warmup_semantics", "ladder", "timing"):
+            if not isinstance(case[field], str) or not case[field]:
+                raise MatrixError(f"case {index}.{field} must be a non-empty string")
+        # ... and this subset must additionally match the IDENTIFIER pattern.
+        for field in ("suite", "workload", "required_publication", "backend", "routing", "phase"):
+            if not IDENTIFIER.fullmatch(case[field]):
+                raise MatrixError(f"case {index}.{field} is not a safe identifier")
+        if case["required_publication"] not in {"official", "comparable-experimental"}:
+            raise MatrixError(f"case {index} has invalid publication requirement")
+        # Recompute the id from every field except case_id itself and compare.
+        case_identity = {key: value for key, value in case.items() if key != "case_id"}
+        if case_id != _case_id(sku, case_identity):
+            raise MatrixError(f"case {index} case_id does not match its contents")
+        if case["backend"] != backend:
+            raise MatrixError(f"case {index} backend does not match shard")
+        if _positive_int(case["nodes"], f"case {index}.nodes") != nodes:
+            raise MatrixError(f"case {index} nodes does not match shard")
+        ep = _positive_int(case["ep"], f"case {index}.ep")
+        gpus_per_node = _positive_int(
+            case["gpus_per_node"], f"case {index}.gpus_per_node"
+        )
+        # Geometry must agree with the registered platform entry for this sku.
+        platform = cap.PLATFORMS[sku]
+        if (
+            gpus_per_node != platform["gpus_per_node"]
+            or case["scale_up_domain"] != platform["scale_up_domain"]
+            or ep not in platform["ep_degrees"]
+        ):
+            raise MatrixError(f"case {index} differs from the platform registry")
+        if ep != nodes * gpus_per_node:
+            raise MatrixError(f"case {index} ep does not equal nodes * gpus_per_node")
+        # Measurement settings are pinned to the harness/module constants.
+        if case["samples_per_point"] != ep_harness.TIMED_SAMPLES_PER_POINT:
+            raise MatrixError(f"case {index} violates fixed-512-v1")
+        if case["timing"] != EP_TIMING_PROFILE:
+            raise MatrixError(f"case {index} has invalid timing profile")
+        if case["warmup_semantics"] != ep_harness.WARMUP_SEMANTICS:
+            raise MatrixError(f"case {index} has invalid warmup semantics")
+        if case["phase"] not in {"decode", "prefill"}:
+            raise MatrixError(f"case {index} has invalid phase")
+        if case["routing"] not in {"uniform", "zipf"}:
+            raise MatrixError(f"case {index} has invalid routing")
+        # eplb must be a bool and may only be true together with zipf routing.
+        if not isinstance(case["eplb"], bool) or (case["eplb"] and case["routing"] != "zipf"):
+            raise MatrixError(f"case {index} has invalid EPLB setting")
+        if not isinstance(case["canonical"], bool) or not case["canonical"]:
+            raise MatrixError(f"case {index} must use a canonical workload")
+        # Numeric fields: reject bool/float first, then require a positive value.
+        for field in ("ep", "nodes", "gpus_per_node", "hidden", "topk", "experts",
+                      "samples_per_point", "scale_up_domain"):
+            if isinstance(case[field], bool) or not isinstance(case[field], int):
+                raise MatrixError(f"case {index}.{field} must be an integer")
+            _positive_int(case[field], f"case {index}.{field}")
+        if ep > _positive_int(case["scale_up_domain"], f"case {index}.scale_up_domain"):
+            raise MatrixError(f"case {index} exceeds its scale-up domain")
+        try:
+            ladder = [int(value) for value in case["ladder"].split()]
+        except (AttributeError, ValueError) as exc:
+            raise MatrixError(f"case {index} has invalid token ladder") from exc
+        # The ladder must be non-empty, positive, strictly increasing without
+        # repeats, and already in canonical single-space decimal form.
+        if (not ladder or any(value <= 0 for value in ladder)
+                or ladder != sorted(set(ladder))
+                or case["ladder"] != " ".join(map(str, ladder))):
+            raise MatrixError(f"case {index} has invalid token ladder")
+        if require_runnable:
+            disposition, reason, _ = _expected_disposition(sku, case)
+            if disposition != "runnable":
+                raise MatrixError(f"case {index} violates capability registry: {reason}")
+        else:
+            # NOTE(review): the return value is ignored, so this presumably
+            # validates the ladder by raising — confirm against the helper.
+            _v1_requested_ladder(case)
+
+
+def validate_matrix_document(document: Any) -> dict[str, Any]:
+ """Validate the complete requested grid and its runnable shard partition."""
+ if not isinstance(document, dict) or set(document) != {
+ "format", "schema_version", "requested_cases", "include"
+ }:
+ raise MatrixError("matrix fields differ from the v1 contract")
+ if (
+ document["format"] != "collectivex.matrix.v1"
+ or type(document["schema_version"]) is not int
+ or document["schema_version"] != 1
+ ):
+ raise MatrixError("matrix format/schema differs from v1")
+ requested = document["requested_cases"]
+ include = document["include"]
+ if not isinstance(requested, list) or not requested:
+ raise MatrixError("matrix.requested_cases must be non-empty")
+ if not isinstance(include, list):
+ raise MatrixError("matrix.include must be an array")
+
+ cases_by_id: dict[str, dict[str, Any]] = {}
+ runnable_ids: set[str] = set()
+ semantic_points: set[str] = set()
+ for index, value in enumerate(requested):
+ path = f"matrix.requested_cases[{index}]"
+ if not isinstance(value, dict) or set(value) != {
+ "sku", "case", "disposition", "reason", "detail"
+ }:
+ raise MatrixError(f"{path} fields differ from the v1 contract")
+ sku = value["sku"]
+ case = value["case"]
+ disposition = value["disposition"]
+ if sku not in cap.PLATFORMS:
+ raise MatrixError(f"{path}.sku is unknown")
+ if disposition not in {"runnable", "unsupported"}:
+ raise MatrixError(f"{path}.disposition is invalid")
+ if disposition == "runnable":
+ if value["reason"] is not None or value["detail"] is not None:
+ raise MatrixError(f"{path} runnable cases cannot have a reason")
+ else:
+ if (
+ not isinstance(value["reason"], str)
+ or not IDENTIFIER.fullmatch(value["reason"])
+ or not isinstance(value["detail"], str)
+ or not value["detail"]
+ ):
+ raise MatrixError(f"{path} unsupported cases need a public reason and detail")
+ if not isinstance(case, dict):
+ raise MatrixError(f"{path}.case must be an object")
+ backend = case.get("backend")
+ nodes = case.get("nodes")
+ if not isinstance(backend, str) or type(nodes) is not int:
+ raise MatrixError(f"{path}.case backend/nodes are invalid")
+ validate_shard_control(
+ {
+ "schema_version": 1,
+ "id": "requested-case",
+ "sku": sku,
+ "backend": backend,
+ "nodes": nodes,
+ "n": 1,
+ "cases": [case],
+ },
+ sku=sku,
+ backend=backend,
+ nodes=nodes,
+ require_runnable=disposition == "runnable",
+ )
+ case_id = case["case_id"]
+ if case_id in cases_by_id:
+ raise MatrixError(f"duplicate requested case_id {case_id}")
+ for signature in _semantic_points(sku, case):
+ if signature in semantic_points:
+ raise MatrixError(f"{path} duplicates a semantic token point")
+ semantic_points.add(signature)
+ cases_by_id[case_id] = value
+ expected = _expected_disposition(sku, case)
+ if (disposition, value["reason"], value["detail"]) != expected:
+ raise MatrixError(f"{path} disposition differs from the frozen v1 catalog")
+ if disposition == "runnable":
+ runnable_ids.add(case_id)
+
+ shard_ids: set[str] = set()
+ assigned: list[str] = []
+ for index, shard in enumerate(include):
+ path = f"matrix.include[{index}]"
+ expected = {
+ "id", "sku", "backend", "launcher", "gpus_per_node", "scale_up_domain",
+ "nodes", "n", "case_ids",
+ }
+ if not isinstance(shard, dict) or set(shard) != expected:
+ raise MatrixError(f"{path} fields differ from the v1 contract")
+ shard_id = shard["id"]
+ if not isinstance(shard_id, str) or not IDENTIFIER.fullmatch(shard_id):
+ raise MatrixError(f"{path}.id is invalid")
+ if shard_id in shard_ids:
+ raise MatrixError(f"duplicate shard id {shard_id}")
+ shard_ids.add(shard_id)
+ sku = shard["sku"]
+ if sku not in cap.PLATFORMS:
+ raise MatrixError(f"{path}.sku is unknown")
+ platform = cap.PLATFORMS[sku]
+ for field in ("launcher", "gpus_per_node", "scale_up_domain"):
+ if shard[field] != platform[field]:
+ raise MatrixError(f"{path}.{field} differs from the platform registry")
+ case_ids = shard["case_ids"]
+ if not isinstance(case_ids, list) or not case_ids or len(case_ids) != len(set(case_ids)):
+ raise MatrixError(f"{path}.case_ids must be a non-empty unique array")
+ if _positive_int(shard["n"], f"{path}.n") != len(case_ids):
+ raise MatrixError(f"{path}.n differs from case_ids")
+ nodes = _positive_int(shard["nodes"], f"{path}.nodes")
+ for case_id in case_ids:
+ wrapper = cases_by_id.get(case_id)
+ if wrapper is None or wrapper["disposition"] != "runnable":
+ raise MatrixError(f"{path} references a missing or unsupported case")
+ case = wrapper["case"]
+ if (
+ wrapper["sku"] != sku
+ or case["backend"] != shard["backend"]
+ or case["nodes"] != nodes
+ ):
+ raise MatrixError(f"{path} case does not match shard coordinates")
+ assigned.append(case_id)
+ if len(assigned) != len(set(assigned)):
+ raise MatrixError("a runnable case is assigned to more than one shard")
+ if set(assigned) != runnable_ids:
+ raise MatrixError("runnable requested cases and shard assignments differ")
+ return document
+
+
+def extract_shard(
+ matrix_path: str | os.PathLike[str],
+ shard_id: str,
+ output_path: str | os.PathLike[str],
+ *,
+ sku: str,
+ backend: str,
+ nodes: int,
+) -> dict[str, Any]:
+ """Extract one strictly matched shard control file, writing it atomically."""
+ document = validate_matrix_document(_strict_json_load(Path(matrix_path)))
+ include = document["include"]
+ matches = [item for item in include if isinstance(item, dict) and item.get("id") == shard_id]
+ if len(matches) != 1:
+ raise MatrixError(f"expected exactly one shard {shard_id!r}, found {len(matches)}")
+ source = matches[0]
+ requested = {
+ item["case"]["case_id"]: item
+ for item in document["requested_cases"]
+ }
+ cases = [requested[case_id]["case"] for case_id in source["case_ids"]]
+ control = {
+ "schema_version": 1,
+ "id": source.get("id"),
+ "sku": source.get("sku"),
+ "backend": source.get("backend"),
+ "nodes": source.get("nodes"),
+ "n": source.get("n"),
+ "cases": cases,
+ }
+ validate_shard_control(control, sku=sku, backend=backend, nodes=nodes)
+ output = Path(output_path)
+ output.parent.mkdir(parents=True, exist_ok=True)
+ temporary = output.with_name(f".{output.name}.tmp-{os.getpid()}")
+ try:
+ with temporary.open("w") as fh:
+ json.dump(control, fh, sort_keys=True, separators=(",", ":"))
+ fh.write("\n")
+ os.replace(temporary, output)
+ finally:
+ temporary.unlink(missing_ok=True)
+ return control
+
+
+def emit_unsupported(
+ matrix_path: str | os.PathLike[str], output_dir: str | os.PathLike[str]
+) -> list[Path]:
+ """Materialize one strict terminal outcome for each unsupported requested case."""
+ source = Path(matrix_path)
+ document = validate_matrix_document(_strict_json_load(source))
+ control_sha256 = hashlib.sha256(source.read_bytes()).hexdigest()
+ generated_at = dt.datetime.now(dt.timezone.utc).isoformat()
+ git_run = {
+ "run_id": os.environ.get("GITHUB_RUN_ID"),
+ "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+ "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+ "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
+ "repo": os.environ.get("GITHUB_REPOSITORY"),
+ "job": os.environ.get("GITHUB_JOB"),
+ "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
+ }
+ allocation_factors = {
+ "artifact": git_run["artifact"],
+ "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID"),
+ "job": git_run["job"],
+ "repo": git_run["repo"],
+ "run_attempt": git_run["run_attempt"],
+ "run_id": git_run["run_id"],
+ "runner": "capability-resolver",
+ "source_sha": git_run["source_sha"],
+ }
+ destination = Path(output_dir)
+ destination.mkdir(parents=True, exist_ok=True)
+ written: list[Path] = []
+ for wrapper in document["requested_cases"]:
+ if wrapper["disposition"] != "unsupported":
+ continue
+ scheduled = wrapper["case"]
+ case = {key: value for key, value in scheduled.items() if key != "case_id"}
+ case_factors = {"case": case, "profile": V1_PROFILE, "sku": wrapper["sku"]}
+ case_id = identity.digest("case", case_factors)
+ if case_id != scheduled["case_id"]:
+ raise MatrixError(f"unsupported case identity differs for {scheduled['case_id']}")
+ attempt_ordinal = 1
+ record = contracts.make_terminal_document(
+ allocation_factors=allocation_factors,
+ attempt_ordinal=attempt_ordinal,
+ case=case,
+ case_factors=case_factors,
+ control_sha256=control_sha256,
+ failure_mode="capability",
+ generated_at=generated_at,
+ git_run=git_run,
+ reason=wrapper["reason"],
+ return_code=5,
+ source="matrix-capability-resolver",
+ status="unsupported",
+ expected_case_id=case_id,
+ )
+ path = destination / f"unsupported_{case_id}.json"
+ temporary = path.with_name(f".{path.name}.tmp-{os.getpid()}")
+ try:
+ with temporary.open("x") as handle:
+ json.dump(record, handle, allow_nan=False, sort_keys=True, separators=(",", ":"))
+ handle.write("\n")
+ handle.flush()
+ os.fsync(handle.fileno())
+ os.replace(temporary, path)
+ finally:
+ temporary.unlink(missing_ok=True)
+ written.append(path)
+ return written
+
+
+def main() -> int:
+    """Command-line entry point for the matrix resolver.
+
+    Exactly one mode runs, chosen in this order of precedence:
+
+    1. ``--emit-unsupported-from``: write unsupported terminal outcomes to ``--out-dir``.
+    2. ``--validate-control``: validate a shard control file against ``--expect-*``.
+    3. ``--extract-from``: extract ``--shard-id`` into ``--out`` and print it.
+    4. Otherwise: resolve a new matrix, validate it, optionally write it to
+       ``--out``, and print it to stdout.
+
+    Progress summaries go to stderr. A ``MatrixError`` is reported through
+    ``parser.error``, which prints the message and exits the process.
+
+    Returns:
+        ``0`` when the selected mode completes.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX v1 matrix resolver")
+    parser.add_argument("--suites", default="all", help="'all' or comma-list of suites")
+    parser.add_argument("--backend", default="", help="select one EP backend")
+    parser.add_argument("--backends", default="", help="'all' or comma-list of EP backends")
+    parser.add_argument("--only-sku", default="")
+    parser.add_argument("--min-nodes", type=int, default=0)
+    parser.add_argument("--max-nodes", type=int, default=0)
+    parser.add_argument("--max-cases", type=int, default=128)
+    parser.add_argument("--extract-from", default="", metavar="MATRIX")
+    parser.add_argument("--validate-control", default="", metavar="SHARD")
+    parser.add_argument("--emit-unsupported-from", default="", metavar="MATRIX")
+    parser.add_argument("--out-dir", default="")
+    parser.add_argument("--shard-id", default="")
+    parser.add_argument("--expect-sku", default="")
+    parser.add_argument("--expect-backend", default="")
+    parser.add_argument("--expect-nodes", type=int, default=0)
+    parser.add_argument("--out", default="")
+    args = parser.parse_args()
+
+    # Mode 1: materialize terminal outcomes for unsupported cases.
+    if args.emit_unsupported_from:
+        if not args.out_dir:
+            parser.error("unsupported outcome emission requires --out-dir")
+        try:
+            written = emit_unsupported(args.emit_unsupported_from, args.out_dir)
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"emitted {len(written)} unsupported terminal outcomes", file=sys.stderr)
+        return 0
+
+    # Mode 2: validate an existing shard control file.
+    if args.validate_control:
+        # The defaults ("" and 0) are falsy, so all() detects any omitted flag.
+        if not all((args.expect_sku, args.expect_backend, args.expect_nodes)):
+            parser.error(
+                "control validation requires --expect-sku, --expect-backend, and --expect-nodes"
+            )
+        try:
+            control = _strict_json_load(Path(args.validate_control))
+            validate_shard_control(
+                control,
+                sku=args.expect_sku,
+                backend=args.expect_backend,
+                nodes=args.expect_nodes,
+            )
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"validated {control.get('id')}: {control['n']} cases", file=sys.stderr)
+        return 0
+
+    # Mode 3: extract one shard from a matrix and echo it on stdout.
+    if args.extract_from:
+        if not all((args.shard_id, args.expect_sku, args.expect_backend, args.expect_nodes, args.out)):
+            parser.error(
+                "shard extraction requires --shard-id, --expect-sku, --expect-backend, "
+                "--expect-nodes, and --out"
+            )
+        try:
+            control = extract_shard(
+                args.extract_from,
+                args.shard_id,
+                args.out,
+                sku=args.expect_sku,
+                backend=args.expect_backend,
+                nodes=args.expect_nodes,
+            )
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"extracted {control['id']}: {control['n']} cases", file=sys.stderr)
+        print(json.dumps(control, separators=(",", ":")))
+        return 0
+
+    # Mode 4 (default): resolve a fresh matrix from the selection flags.
+    matrix = resolve_matrix(
+        suites=args.suites,
+        backend=args.backend,
+        backends=args.backends,
+        only_sku=args.only_sku,
+        min_nodes=args.min_nodes,
+        max_nodes=args.max_nodes,
+        max_cases=args.max_cases,
+    )
+    # Self-check: the resolver's output must pass the same validator consumers use.
+    try:
+        validate_matrix_document(matrix)
+    except MatrixError as exc:
+        parser.error(str(exc))
+    if args.out:
+        with open(args.out, "w") as fh:
+            json.dump(matrix, fh, sort_keys=True, separators=(",", ":"))
+            fh.write("\n")
+    # Booleans sum as 0/1, so this counts the runnable entries.
+    runnable = sum(
+        item["disposition"] == "runnable" for item in matrix["requested_cases"]
+    )
+    unsupported = len(matrix["requested_cases"]) - runnable
+    print(
+        f"resolved {len(matrix['include'])} shard-cells, "
+        f"{runnable} runnable and {unsupported} unsupported cases",
+        file=sys.stderr,
+    )
+    print(json.dumps(matrix))
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 0000000000..3109e7c771
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+"""CollectiveX DeepEP adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import inspect
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import deep_ep
+ from deep_ep import Buffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: deep_ep import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+def _deepep_version() -> str:
+ try:
+ import importlib.metadata as metadata
+
+ return metadata.version("deep_ep")
+ except Exception:
+ return getattr(deep_ep, "__version__", "unknown")
+
+
+def _mnnvl_buffer_configuration() -> tuple[dict[str, bool], str]:
+ """Resolve the explicit DeepEP MNNVL API contract."""
+ requested_value = os.environ.get("CX_ALLOW_MNNVL")
+ if requested_value not in {None, "", "0", "1"}:
+ raise RuntimeError("CX_ALLOW_MNNVL must be unset, 0, or 1")
+ requested = requested_value == "1"
+ if not requested:
+ return contracts.resolve_deepep_mnnvl(
+ requested=False, signature_parameters=(),
+ deepep_commit=os.environ.get("DEEPEP_COMMIT"),
+ )
+ try:
+ parameters = inspect.signature(Buffer.__init__).parameters
+ except (TypeError, ValueError) as exc:
+ raise RuntimeError("cannot inspect DeepEP Buffer MNNVL API") from exc
+ try:
+ return contracts.resolve_deepep_mnnvl(
+ requested=True, signature_parameters=parameters,
+ deepep_commit=os.environ.get("DEEPEP_COMMIT"),
+ )
+ except contracts.ContractError as exc:
+ raise RuntimeError(str(exc)) from exc
+
+
+class DeepEPBackend:
+    """Adapter exposing a DeepEP ``Buffer`` through the CollectiveX backend hooks.
+
+    Fixed to normal mode with BF16 dispatch/combine (see ``backend_provenance``).
+    """
+
+    # Backend identifier.
+    name = "deepep"
+    # NOTE(review): presumably tells the harness that combine can reuse the
+    # dispatch handle without dispatching again — confirm against the harness.
+    combine_needs_redispatch = False
+    # DeepEP reduces activations and top-k weights independently. The activation
+    # tensor must therefore carry the complete local weighted expert sum.
+    combine_weight_semantics = "unweighted-rank-sum"
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Create the DeepEP buffer, pin the SM count and record provenance.
+
+        Args:
+            args: Parsed harness arguments; ``num_sms`` and ``experts`` are read.
+            rank: Rank of this process.
+            world_size: Number of participating ranks.
+            local_rank: Accepted for interface compatibility; unused here.
+            device: CUDA device queried for its SM count and used for tensors.
+
+        Raises:
+            RuntimeError: If the MNNVL configuration is invalid, or DeepEP
+                rejects or does not apply the requested ``num_sms``.
+        """
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+
+        self.group = dist.group.WORLD
+        device_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        # 4 GiB.
+        num_nvl_bytes = 4 * 1024 * 1024 * 1024
+        mnnvl_kwargs, mnnvl_comm = _mnnvl_buffer_configuration()
+        # NOTE(review): the third positional argument (0) is presumably the RDMA
+        # buffer size — confirm against the pinned DeepEP Buffer signature.
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0, **mnnvl_kwargs)
+
+        # Prefer Buffer.num_sms when the class exposes it; else use args.num_sms.
+        num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            raise RuntimeError(
+                f"DeepEP did not apply requested num_sms={num_sms}: {exc!r}"
+            ) from exc
+        # Read the value back so a silently ignored setting is detected.
+        applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+        if applied_num_sms != num_sms:
+            raise RuntimeError(
+                f"DeepEP num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
+            )
+
+        version = _deepep_version()
+        # Configuration snapshot describing how this backend instance was set up.
+        self.backend_provenance = {
+            "deepep_version": version,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{version}",
+            "backend_lineage": "deepep-v1",
+            "mode": "normal",
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "resource_mode": "tuned",
+            "requested_num_sms": num_sms,
+            "num_sms": applied_num_sms,
+            "device_sms": device_sms,
+            "sm_fraction": applied_num_sms / device_sms,
+            "tuned_source": "deepep-default-num_sms",
+            "num_nvl_bytes": num_nvl_bytes,
+            "allow_mnnvl": bool(mnnvl_kwargs),
+            "mnnvl_comm": mnnvl_comm,
+        }
+
+    def buffer_cap(self, args):
+        """Return ``None``; this backend reports no buffer cap."""
+        return None
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one problem, casting ``idx`` to int64 and ``weights`` to float32."""
+        return types.SimpleNamespace(
+            T=T,
+            x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        """Compute the dispatch layout, run ``Buffer.dispatch`` and return its outputs.
+
+        The returned namespace carries the received payload, received top-k
+        indices and weights, the receive counts and the handle used by combine.
+        """
+        # The second and fifth layout results are not needed here.
+        (
+            num_tokens_per_rank,
+            _,
+            num_tokens_per_expert,
+            is_token_in_rank,
+            _,
+        ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        # The sixth dispatch result is discarded.
+        recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
+            p.x,
+            topk_idx=p.topk_idx,
+            topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+        return types.SimpleNamespace(
+            recv_x=recv_x,
+            recv_topk_idx=recv_topk_idx,
+            recv_topk_weights=recv_topk_weights,
+            recv_counts=recv_counts,
+            handle=handle,
+        )
+
+    def stage(self, p, h):
+        """Use the received payload unchanged as the combine input."""
+        h.combine_input = h.recv_x
+
+    def combine(self, p, h):
+        """Combine ``h.combine_input`` with the dispatch handle; return the tensor."""
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle)
+        return combined_x
+
+    def inspect_dispatch(self, p, h):
+        """Expose the dispatch result in a backend-neutral namespace.
+
+        Received indices that are ``>= 0`` are offset by
+        ``rank * (experts // world_size)``; negative entries are kept as they
+        are and their weights are zeroed.
+        """
+        valid = h.recv_topk_idx >= 0
+        # NOTE(review): the offset presumably converts rank-local expert indices
+        # to global ids — confirm against the DeepEP dispatch contract.
+        expert_ids = torch.where(
+            valid,
+            h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
+            h.recv_topk_idx,
+        )
+        return types.SimpleNamespace(
+            payload=h.recv_x,
+            expert_ids=expert_ids,
+            weights=h.recv_topk_weights.masked_fill(~valid, 0),
+            local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
+            ordering_contract="source-rank-major-stable-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine ``transformed`` after casting it to the received payload dtype."""
+        combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle)
+        return combined
+
+    def recv_tokens(self, h):
+        """Return the number of received rows (first dimension of ``recv_x``)."""
+        return int(h.recv_x.shape[0])
+
+    def finalize(self, rc):
+        """Tear down the process group on a best-effort basis and return ``rc``."""
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            # Teardown failures are deliberately ignored so ``rc`` is preserved.
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_hybrid.py b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
new file mode 100644
index 0000000000..6514e93c51
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP `hybrid-ep` branch (NVIDIA TMA-based HybridEPBuffer).
+
+The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA +
+warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer`
+(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72
+rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter drives the single-
+NVLink-domain path (`num_of_hybrid_ep_ranks_per_nvlink_domain == world_size`, <=8 ranks). That domain
+is ONE node on x86, while GB200/GB300 MNNVL can expose multiple trays as one NVLink domain. The v1
+matrix therefore exercises the same path at EP8 across two GB trays, subject to the normal three-run
+qualification gate. The container build is done by runtime/run_in_container.sh
+`cx_build_deepep_hybrid` (CUDA-13 CCCL include path, without the V2 NVSHMEM overlay).
+
+API (pinned on B300, branch e0a5b1d):
+ HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...)
+ .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle)
+ .combine(hidden, handle=) -> [T, hidden]
+
+CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is
+reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8
+round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses
+the same per-rank-sum combine contract (no gate re-weight). BF16 tolerance is 5e-2.
+
+STATUS: bf16 / normal / layout-and-dispatch-v1. The v1 scope is one detected NVLink domain at up to
+eight ranks; fp8 and the cross-RACK (>1 NVL72, IBGDA/RDMA) path remain out of scope.
+"""
+from __future__ import annotations
+
+import hashlib
+import importlib
+import json
+import os
+from pathlib import Path
+import re
+import shutil
+import sys
+import tempfile
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import deep_ep
+ HybridEPBuffer = deep_ep.HybridEPBuffer
+except Exception as exc: # pragma: no cover - needs the hybrid-ep build
+ print("ERROR: deep_ep.HybridEPBuffer import failed — the hybrid-ep branch must be built at job "
+ "setup (cx_build_deepep_hybrid). "
+ f"{exc!r}", file=sys.stderr)
+ raise
+
+
+def _deepep_hybrid_version() -> str:
+    return os.environ.get("DEEPEP_COMMIT") or getattr(deep_ep, "__version__", "hybrid-ep")  # "" = unset
+
+
+def _hybrid_build_evidence() -> list[dict[str, str]]:
+    """Build one content-manifest record per required DeepEP extension module.
+
+    Raises RuntimeError when an imported module reports no ``__file__``.
+    """
+    expected = {"deep_ep_cpp": "deepep-extension", "hybrid_ep_cpp": "deepep-hybrid-extension"}
+    evidence = []
+    for module_name, role in expected.items():
+        location = getattr(importlib.import_module(module_name), "__file__", None)
+        if not location:
+            raise RuntimeError(f"{module_name} has no loaded extension path")
+        manifest_files = [(os.path.basename(location), location)]
+        evidence.append(contracts.content_manifest_evidence(
+            role=role, name=module_name, files=manifest_files))
+    evidence.sort(key=lambda item: (item["role"], item["name"]))
+    return evidence
+
+
+HYBRID_CONFIG_FIELDS = (  # attributes _hybrid_realized_config() reads, in this order
+    "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+    "num_of_ranks_per_node", "num_of_nodes", "pad_multiple",
+    "num_of_tokens_per_chunk_preprocessing_api",
+    "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api",
+    "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type",
+    "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api",
+    "num_of_in_flight_s2g_dispatch_api",
+    "num_of_in_flight_s2g_permute_block_dispatch_api",
+    "num_of_additional_in_flight_s2g_dispatch_api",
+    "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api",
+    "forward_dispatch_api", "device_side_sync_dispatch_api",
+    "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api",
+    "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api",
+    "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api",
+    "backward_combine_api", "device_side_sync_combine_api",
+)
+
+
+def _hybrid_realized_config(config) -> dict[str, str | int | bool]:
+    """Project the Python-visible, post-autotune HybridEP config to JSON scalars."""
+    token_aliases = {"uint8_t": "UINT8", "uint16_t": "UINT16"}
+    snapshot = {}
+    for field in HYBRID_CONFIG_FIELDS:
+        try:
+            raw = getattr(config, field)
+        except AttributeError as exc:
+            raise RuntimeError(f"HybridEP realized config omits {field}") from exc
+        if field == "token_data_type":
+            label = getattr(raw, "name", None)  # accepted as-is when already UINT8/UINT16
+            if label not in {"UINT8", "UINT16"}:
+                label = token_aliases.get(str(raw))
+            if label is None:
+                raise RuntimeError("HybridEP realized token_data_type is invalid")
+            snapshot[field] = label
+        elif type(raw) is bool:
+            snapshot[field] = raw  # stored as a real boolean, not coerced to 0/1
+        else:
+            try:
+                snapshot[field] = int(raw)
+            except (TypeError, ValueError) as exc:
+                raise RuntimeError(f"HybridEP realized config {field} is not integral") from exc
+    return snapshot
+
+
+def _sha256_with_size(path: Path) -> tuple[str, int]:
+    """Stream ``path`` in 1 MiB reads; return ``(hex SHA-256, total bytes read)``."""
+    hasher, total = hashlib.sha256(), 0
+    with path.open("rb") as stream:
+        while block := stream.read(1024 * 1024):
+            hasher.update(block)
+            total += len(block)
+    return hasher.hexdigest(), total
+
+
+def _hybrid_jit_evidence(root: Path) -> list[dict[str, str | int]]:
+    """Hash final JIT libraries without exposing rank-specific cache paths."""
+    if not root.is_dir():
+        raise RuntimeError("DeepEP Hybrid produced no JIT cache directory")
+    key_pattern = re.compile(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}")
+    libraries = [entry for entry in root.iterdir() if entry.suffix == ".so"]
+    libraries.sort(key=lambda entry: entry.name)
+    evidence = []
+    for library in libraries:
+        # Symlinks and non-files are rejected so only real library bytes get hashed.
+        if library.is_symlink() or not library.is_file():
+            raise RuntimeError("DeepEP Hybrid JIT artifact is not a regular file")
+        if key_pattern.fullmatch(library.stem) is None:
+            raise RuntimeError("DeepEP Hybrid JIT kernel key is invalid")
+        digest, size = _sha256_with_size(library)
+        if size <= 0:
+            raise RuntimeError("DeepEP Hybrid JIT artifact is empty")
+        evidence.append(
+            {"bytes": size, "kernel_key": library.stem, "sha256": digest}
+        )
+    # Anything other than exactly three libraries is treated as a failure.
+    if len(evidence) != 3:
+        raise RuntimeError(
+            f"DeepEP Hybrid expected 3 final JIT libraries, found {len(evidence)}"
+        )
+    return evidence
+
+
+def _require_cross_rank_equal(value, label: str) -> None:
+    """All-gather ``value`` and raise RuntimeError unless every rank sent the same JSON."""
+    per_rank = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(per_rank, value)
+    if len({json.dumps(item, sort_keys=True, separators=(",", ":")) for item in per_rank}) != 1:
+        raise RuntimeError(f"DeepEP Hybrid {label} differs across ranks")
+
+
+class DeepEPHybridBackend:
+    """EP harness adapter around ``deep_ep.HybridEPBuffer`` (bf16, single NVLink domain)."""
+
+    name = "deepep-hybrid"
+    # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed
+    # before a timed combine); the harness times dispatch and combine separately (like ep_deepep).
+    combine_needs_redispatch = False
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Build the buffer inside a private JIT cache; undo the cache setup if setup aborts."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+        self.group = dist.group.WORLD
+        self.tolerance = 5e-2
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        self.hidden = int(args.hidden)
+        self.local_experts = max(1, self.num_experts // world_size)
+        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
+        self.max_tokens = 4096
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_hybrid_version()
+        loaded_libraries = _hybrid_build_evidence()
+        _require_cross_rank_equal(loaded_libraries, "loaded extension identities")
+
+        # HybridEP's compiler uses a process-specific child of HYBRID_EP_CACHE_DIR. Give every
+        # rank a fresh private base so stale kernels cannot enter this attempt's evidence.
+        self._previous_jit_cache_dir = os.environ.get("HYBRID_EP_CACHE_DIR")
+        self._jit_cache_dir = tempfile.mkdtemp(prefix=f"collectivex-hybrid-r{rank}-")
+        os.environ["HYBRID_EP_CACHE_DIR"] = self._jit_cache_dir
+        jit_base = Path(self._jit_cache_dir) / ".deepep" / "hybrid_ep" / "jit"
+        self._jit_root = jit_base / f"proc-{os.getpid()}"
+        self._realized_config = None
+        self._deferred_semantic_snapshot = None
+        self._deferred_jit_diagnostics = None
+
+        # Construct the HybridEPBuffer treating all ranks as ONE NVLink domain (default
+        # num_of_hybrid_ep_ranks_per_nvlink_domain == world_size). On x86 that domain is one node; on a
+        # GB200/GB300 NVL72 the MNNVL fabric makes 2 trays one NVLink domain, so EP8 (8 ranks) is covered
+        # by this same path (validated transport=mnnvl). SM counts default.
+        try:
+            self.buffer = HybridEPBuffer(
+                self.group, hidden_dim=self.hidden,
+                max_num_of_tokens_per_rank=self.max_tokens,
+                num_local_experts=self.local_experts, use_fp8=False)
+        except Exception as exc:
+            self._release_jit_cache()
+            raise RuntimeError(
+                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
+                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
+        update_template_config = self.buffer.update_template_config
+
+        def tracked_update_template_config(*call_args, **call_kwargs):
+            config = update_template_config(*call_args, **call_kwargs)
+            realized = _hybrid_realized_config(config)
+            if self._realized_config is not None and realized != self._realized_config:
+                raise RuntimeError("DeepEP Hybrid realized autotune config changed within one case")
+            self._realized_config = realized
+            return config
+
+        self.buffer.update_template_config = tracked_update_template_config
+        self.domain_rank = int(self.buffer.local_rank)
+        if self.domain_rank != rank:
+            self._release_jit_cache()  # setup is aborted here too, so undo the cache override
+            raise RuntimeError(
+                "HybridEPBuffer rank within the single NVLink domain differs from global rank: "
+                f"domain={self.domain_rank} global={rank}"
+            )
+        if rank == 0:
+            print(f"[deepep-hybrid] HybridEPBuffer constructed (single NVLink domain, world={world_size}, "
+                  f"local_experts={self.local_experts}, hidden={self.hidden})", file=sys.stderr)
+
+        self.backend_provenance = {
+            "deepep_commit": ver, "branch": "hybrid-ep",
+            "deepep_tree": os.environ.get("DEEPEP_TREE"),
+            "backend_lineage": "deepep-hybrid",
+            "loaded_libraries": loaded_libraries,
+            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
+            "mode": "normal", "transport": "nvlink-domain",  # one node (x86) or one NVL72 MNNVL domain (gb300 EP8)
+            "resource_mode": "tuned",
+            "num_sms": None, "device_sms": dev_sms,
+            "tuned_source": "deepep-hybrid-configurer-autotune-v1",
+            "realized_config": None, "jit_kernel_keys": [], "jit_shared_objects": [],
+            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts, "local_experts": self.local_experts,
+            "routing_factor": "ranks",
+        }
+
+    def _release_jit_cache(self):
+        """Delete this rank's private JIT directory and restore HYBRID_EP_CACHE_DIR."""
+        shutil.rmtree(self._jit_cache_dir, ignore_errors=True)
+        if self._previous_jit_cache_dir is None:
+            os.environ.pop("HYBRID_EP_CACHE_DIR", None)
+        else:
+            os.environ["HYBRID_EP_CACHE_DIR"] = self._previous_jit_cache_dir
+
+    def buffer_cap(self, args):
+        """Return the per-rank token cap the buffer was allocated with."""
+        return self.max_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        """Wrap one rank's inputs with int64 expert ids and float32 gate weights."""
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        """Run one HybridEP dispatch; the third result (bound to ``_scales``) is unused."""
+        recv, recv_probs, _scales, handle = self.buffer.dispatch(
+            p.x,
+            topk_idx=p.topk_idx,
+            topk_weights=p.topk_weights,
+            num_of_experts=self.num_experts,
+        )
+        return types.SimpleNamespace(
+            recv=recv, recv_payload=recv, recv_probs=recv_probs,
+            handle=handle, combine_input=None,
+        )
+
+    def stage(self, p, h):
+        # Identity expert: the recv hidden IS the "expert output". combine reduces it per source token.
+        h.combine_input = h.recv_payload
+
+    def combine(self, p, h):
+        # combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks").
+        comb = self.buffer.combine(h.combine_input, handle=h.handle)
+        return comb[0] if isinstance(comb, (tuple, list)) else comb
+
+    def capture_deferred_provenance(self):
+        """Record the realized config and JIT evidence once all ranks agree on them."""
+        torch.cuda.synchronize()
+        dist.barrier()
+        if self._realized_config is None:
+            raise RuntimeError("DeepEP Hybrid autotune config was not materialized")
+        local_artifacts = _hybrid_jit_evidence(self._jit_root)
+        semantic = {
+            "jit_kernel_keys": [item["kernel_key"] for item in local_artifacts],
+            "realized_config": dict(self._realized_config),
+        }
+        # NVCC may embed each rank's timestamped source basename in its ELF, so raw .so hashes are
+        # diagnostics rather than a cross-rank identity. Stable kernel keys encode every codegen
+        # input, including HybridEpConfigInstance fields that the Python binding does not expose.
+        _require_cross_rank_equal(semantic, "realized config/JIT kernel keys")
+        gathered_artifacts = [None] * dist.get_world_size()
+        dist.all_gather_object(gathered_artifacts, local_artifacts)
+        diagnostics = []
+        for artifact_index, kernel_key in enumerate(semantic["jit_kernel_keys"]):
+            diagnostics.append({
+                "kernel_key": kernel_key,
+                "rank_artifacts": [
+                    {
+                        "bytes": rank_artifacts[artifact_index]["bytes"],
+                        "rank": artifact_rank,
+                        "sha256": rank_artifacts[artifact_index]["sha256"],
+                    }
+                    for artifact_rank, rank_artifacts in enumerate(gathered_artifacts)
+                ],
+            })
+        if self._deferred_semantic_snapshot is not None and semantic != self._deferred_semantic_snapshot:
+            raise RuntimeError("DeepEP Hybrid config/JIT kernel set changed after measurement")
+        if self._deferred_jit_diagnostics is not None and diagnostics != self._deferred_jit_diagnostics:
+            raise RuntimeError("DeepEP Hybrid rank-local JIT artifacts changed after measurement")
+        self._deferred_semantic_snapshot = semantic
+        self._deferred_jit_diagnostics = diagnostics
+        self.backend_provenance.update(semantic)
+        self.backend_provenance["jit_shared_objects"] = diagnostics
+
+    def inspect_dispatch(self, p, h):
+        """Rebuild per-token global expert ids and weights from the handle's routing map."""
+        count = self.recv_tokens(h)
+        routing_map = h.handle[4][:count]
+        rows, local_expert_ids = routing_map.nonzero(as_tuple=True)
+        positions = routing_map.to(torch.int64).cumsum(dim=1)[rows, local_expert_ids] - 1
+        probability_columns = self.domain_rank * self.local_experts + local_expert_ids
+        if h.recv_probs.shape[1] < (self.domain_rank + 1) * self.local_experts:
+            raise RuntimeError("HybridEPBuffer probability tensor omits this NVLink-domain rank")
+        expert_ids = torch.full((count, self.top_k), -1, dtype=torch.int64, device=self.device)
+        weights = torch.zeros((count, self.top_k), dtype=torch.float32, device=self.device)
+        expert_ids[rows, positions] = local_expert_ids + self.rank * self.local_experts
+        weights[rows, positions] = h.recv_probs[:count][rows, probability_columns]
+        return types.SimpleNamespace(
+            payload=h.recv_payload[:count],
+            expert_ids=expert_ids,
+            weights=weights,
+            local_expert_counts=routing_map.sum(dim=0, dtype=torch.int64),
+            ordering_contract="global-source-filter-stable-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine a caller-modified payload after casting it to the received dtype."""
+        combined = self.buffer.combine(transformed.to(h.recv_payload.dtype), handle=h.handle)
+        return combined[0] if isinstance(combined, (tuple, list)) else combined
+
+    def recv_tokens(self, h):
+        """Token count this rank received, read from entry 3 of the dispatch handle."""
+        return int(h.handle[3].item())
+
+    def finalize(self, rc):
+        """Best-effort teardown: log a failure, release the JIT cache, return ``rc`` unchanged."""
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:
+            print(f"[deepep-hybrid] teardown failed: {exc!r}", file=sys.stderr)
+        self._release_jit_cache()
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_v2.py b/experimental/CollectiveX/tests/ep_deepep_v2.py
new file mode 100644
index 0000000000..a11185effb
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_v2.py
@@ -0,0 +1,507 @@
+#!/usr/bin/env python3
+"""DeepEP PR #605 adapter with PR #630's pure scale-up initialization fix."""
+
+from __future__ import annotations
+
+import ctypes
+import hashlib
+import importlib.metadata
+import inspect
+import json
+import os
+import re
+import sys
+import types
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import contracts
+import ep_harness
+
+try:
+ import deep_ep
+ from deep_ep import ElasticBuffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: DeepEP V2 import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+DEEPEP_V2_PR = 605  # must equal the DEEPEP_V2_PR environment variable (_require_runtime)
+DEEPEP_V2_FIX_PR = 630
+DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+DEEPEP_V2_TREE = "29809e75c5874e6609dac4804e7b651d5226959f"
+DEEPEP_V2_FMT_COMMIT = "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+DEEPEP_V2_VERSION = "2.0.0"  # compared with deep_ep.__version__
+DEEPEP_V2_DISTRIBUTION = "2.0.0+fa8a9b1"  # compared with the installed deep_ep distribution
+DEEPEP_V2_JIT_RANDOM_SEED = "collectivex-deepep-v2-fa8a9b1"
+TORCH_VERSION = "2.10.0+cu130"
+NCCL_VERSION = "2.30.4"  # checked against the nvidia-nccl-cu13 package and the loaded library
+NVSHMEM_VERSION = "3.3.9"  # NOTE(review): checked via the *cu12* package while torch/NCCL are cu13 - confirm
+DEEPEP_V2_JIT_KERNELS = contracts.DEEPEP_V2_JIT_KERNELS
+
+
+def _sha256(path: str) -> str:
+    hasher = hashlib.sha256()  # fed in 1 MiB reads, so the file is never held in memory whole
+    with open(path, "rb") as stream:
+        while block := stream.read(1024 * 1024):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _api_sha256() -> str:
+    """Fingerprint the ElasticBuffer constructor, dispatch and combine signatures."""
+    signatures = {
+        f"ElasticBuffer.{method}": str(inspect.signature(getattr(ElasticBuffer, method)))
+        for method in ("__init__", "dispatch", "combine")
+    }
+    # Canonical JSON (sorted keys, no whitespace) is what gets hashed.
+    encoded = json.dumps(signatures, sort_keys=True, separators=(",", ":")).encode()
+    return hashlib.sha256(encoded).hexdigest()
+
+
+def _loaded_library_paths() -> set[str]:
+    """Return real paths of the DeepEP extension and the NCCL/NVSHMEM libraries mapped here."""
+    extension = getattr(getattr(deep_ep, "_C", None), "__file__", None)
+    if not extension or not os.path.isfile(extension):
+        raise RuntimeError("DeepEP V2 extension library is not loaded")
+    paths = {os.path.realpath(extension)}
+    try:
+        with open("/proc/self/maps", encoding="utf-8") as handle:
+            rows = [line.split(None, 5) for line in handle]  # cap: a pathname may hold spaces
+    except OSError as exc:  # pragma: no cover - benchmark runtime is Linux
+        raise RuntimeError("cannot inspect loaded communication libraries") from exc
+    for path in (row[5].rstrip("\n") for row in rows if len(row) == 6):
+        if re.search(r"libnccl\.so|libnvshmem_host\.so", os.path.basename(path)) and os.path.isfile(path):
+            paths.add(os.path.realpath(path))
+    return paths
+
+
+def _loaded_nccl_version() -> str:
+    """Ask the single NCCL library mapped into this process for its version."""
+    candidates = [p for p in _loaded_library_paths() if "libnccl.so" in os.path.basename(p)]
+    if len(candidates) != 1:
+        raise RuntimeError("expected exactly one loaded NCCL library")
+    raw_version = ctypes.c_int()
+    # ncclGetVersion fills the integer; a non-zero return code means the query failed.
+    status = ctypes.CDLL(candidates[0]).ncclGetVersion(ctypes.byref(raw_version))
+    if status != 0:
+        raise RuntimeError("loaded NCCL version query failed")
+    return ep_harness.format_collective_version(raw_version.value)
+
+
+def _loaded_library_evidence() -> list[dict[str, str]]:
+    """Return content identities, never private library paths."""
+    markers = {"nccl": "libnccl.so", "nvshmem": "libnvshmem_host.so"}
+
+    def role_of(path: str) -> str:
+        # The first marker found in the file name decides; no marker means the extension.
+        base = os.path.basename(path)
+        return next((role for role, mark in markers.items() if mark in base), "deepep-extension")
+
+    paths = _loaded_library_paths()
+    # Each communication dependency must be mapped exactly once.
+    mismatches = []
+    for dependency, mark in markers.items():
+        found = sum(1 for path in paths if mark in os.path.basename(path))
+        if found != 1:
+            mismatches.append(f"{dependency}={found}")
+    if mismatches:
+        raise RuntimeError("expected one loaded library for each dependency: " + ", ".join(mismatches))
+    records = []
+    for path in paths:
+        role = role_of(path)
+        # Only a label and a content hash are recorded for each library.
+        name = "deep_ep._C" if role == "deepep-extension" else os.path.basename(path)
+        records.append({"role": role, "name": name, "sha256": _sha256(path)})
+    records.sort(key=lambda item: (item["role"], item["name"], item["sha256"]))
+    return records
+
+
+def _jit_artifact_evidence() -> list[dict[str, str]]:
+    """Hash the source, SASS and CUBIN of every kernel under ``$EP_JIT_CACHE_DIR/cache``.
+
+    Raises RuntimeError when the cache directory is missing, an entry is not a
+    ``kernel.<name>.<32 hex digits>`` directory holding exactly three non-empty
+    regular files, or the kernel names differ from ``DEEPEP_V2_JIT_KERNELS``.
+    """
+    cache_root = Path(os.environ["EP_JIT_CACHE_DIR"]) / "cache"
+    if cache_root.is_symlink() or not cache_root.is_dir():
+        raise RuntimeError("DeepEP V2 produced no JIT cache evidence")
+    entry_pattern = re.compile(r"kernel\.([A-Za-z0-9_+-]+)\.([0-9a-f]{32})")
+    expected_files = ("kernel.cu", "kernel.cubin", "kernel.sass")
+    records, seen_kernels = [], set()
+    for entry in sorted(cache_root.iterdir(), key=lambda item: item.name):
+        parsed = entry_pattern.fullmatch(entry.name)
+        if entry.is_symlink() or not entry.is_dir() or parsed is None:
+            raise RuntimeError("DeepEP V2 JIT cache contains an invalid entry")
+        if {child.name for child in entry.iterdir()} != set(expected_files):
+            raise RuntimeError("DeepEP V2 JIT kernel evidence is incomplete")
+        source, cubin, sass = trio = tuple(entry / name for name in expected_files)
+        if any(item.is_symlink() or not item.is_file() for item in trio):
+            raise RuntimeError("DeepEP V2 JIT evidence is not a regular file")
+        if any(item.stat().st_size <= 0 for item in trio):
+            raise RuntimeError("DeepEP V2 JIT evidence is empty")
+        seen_kernels.add(parsed.group(1))
+        records.append({
+            "cache_key": entry.name,
+            "source_sha256": _sha256(str(source)),
+            "sass_sha256": _sha256(str(sass)),
+            "cubin_sha256": _sha256(str(cubin)),
+        })
+    if len(records) != len(DEEPEP_V2_JIT_KERNELS) or seen_kernels != DEEPEP_V2_JIT_KERNELS:
+        raise RuntimeError("DeepEP V2 JIT kernel set differs from the v1 contract")
+    return sorted(records, key=lambda item: item["cache_key"])
+
+
+def _jit_cache_key(
+    args,
+    world_size: int,
+    max_tokens: int,
+    allow_hybrid_mode: bool,
+    realized: dict[str, int | bool],
+) -> str:
+    """Key generated kernels by codegen inputs, not routing data or case identity.
+
+    Returns ``"jitcfg-v3-"`` followed by the SHA-256 of the canonical JSON payload.
+    """
+    case_inputs = {
+        "runner": args.runner,
+        "world_size": world_size,
+        "hidden": args.hidden,
+        "topk": args.topk,
+        "physical_experts": args.experts,
+        "tuning_experts": getattr(args, "num_logical_experts", args.experts),
+        "max_tokens": max_tokens,
+        "allow_hybrid_mode": allow_hybrid_mode,
+    }
+    # Values that are fixed for every key this function produces.
+    constants = {
+        "contract": "deepep-v2-jit-config-v3",
+        "dispatch_dtype": "bf16", "combine_dtype": "bf16", "input_layout": "bf16-no-sf",
+        "expert_alignment": 1, "do_cpu_sync": True, "cached_mode": False,
+        "do_expand": False, "use_expanded_layout": False,
+        "allow_multiple_reduction": True, "prefer_overlap_with_compute": True,
+        "deterministic": False,
+    }
+    # ``realized`` is merged last, so its entries win on any key collision.
+    payload = {**case_inputs, **constants, **realized}
+    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
+    return "jitcfg-v3-" + hashlib.sha256(canonical.encode()).hexdigest()
+
+
+def _require_cross_rank_equal(value, label: str) -> None:
+    """Gather ``value`` from every rank; raise RuntimeError if their canonical JSON differs."""
+    per_rank = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(per_rank, value)
+    if len({json.dumps(item, sort_keys=True, separators=(",", ":")) for item in per_rank}) != 1:
+        raise RuntimeError(f"DeepEP V2 {label} differs across ranks")
+
+
+def _configure_gin_mode(args, world_size: int) -> bool:
+    """Set or clear ``EP_DISABLE_GIN``; return whether ranks exceed the scale-up domain.
+
+    Domain size: first truthy of ``args.scale_up_domain``, ``args.gpus_per_node``, ``world_size``.
+    """
+    declared = getattr(args, "scale_up_domain", None) or getattr(args, "gpus_per_node", None)
+    spans_domains = world_size > int(declared or world_size)
+    if spans_domains:
+        os.environ.pop("EP_DISABLE_GIN", None)
+    else:
+        os.environ["EP_DISABLE_GIN"] = "1"
+    return spans_domains
+
+
+def _lsa_topology_is_valid(
+    gin_enabled: bool, world_size: int, config: dict[str, int | bool]
+) -> bool:
+    """Accept any GIN run; without GIN require one NVLink scale-up domain of ``world_size``."""
+    if gin_enabled:
+        return gin_enabled
+    expected = {"physical_rdma_ranks": 1, "physical_nvlink_ranks": world_size,
+                "logical_scaleout_ranks": 1, "logical_scaleup_ranks": world_size}
+    counts_match = all(config[key] == want for key, want in expected.items())
+    return counts_match and config["is_scaleup_nvlink"] is True
+
+
+def _require_runtime() -> tuple[str, str]:
+    """Check the pinned DeepEP V2 environment variables and library versions.
+
+    Returns ``(torch version, loaded NCCL version)``. Raises RuntimeError naming
+    every mismatch when anything differs from the pins.
+    """
+    # Environment variables and the exact values they must hold.
+    expected_env = {
+        "DEEPEP_V2_PR": str(DEEPEP_V2_PR),
+        "DEEPEP_V2_FIX_PR": str(DEEPEP_V2_FIX_PR),
+        "DEEPEP_V2_COMMIT": DEEPEP_V2_COMMIT,
+        "DEEPEP_V2_TREE": DEEPEP_V2_TREE,
+        "DEEPEP_V2_FMT_COMMIT": DEEPEP_V2_FMT_COMMIT,
+        "DEEPEP_V2_JIT_RANDOM_SEED": DEEPEP_V2_JIT_RANDOM_SEED,
+        "EP_JIT_DUMP_SASS": "1",
+    }
+    problems = []
+    for name, wanted in expected_env.items():
+        found = os.environ.get(name)
+        if found != wanted:
+            problems.append(f"{name}={found!r}, expected {wanted!r}")
+    torch_version = str(torch.__version__)
+    nccl_package_version = importlib.metadata.version("nvidia-nccl-cu13")
+    nvshmem_package_version = importlib.metadata.version("nvidia-nvshmem-cu12")
+    # (label, observed, pinned) triples; mismatches are reported in this order.
+    version_checks = (
+        ("deep_ep", str(getattr(deep_ep, "__version__", "")), DEEPEP_V2_VERSION),
+        ("deep_ep distribution", importlib.metadata.version("deep_ep"), DEEPEP_V2_DISTRIBUTION),
+        ("torch", torch_version, TORCH_VERSION),
+        ("nvidia-nccl-cu13", nccl_package_version, NCCL_VERSION),
+        ("nvidia-nvshmem-cu12", nvshmem_package_version, NVSHMEM_VERSION),
+    )
+    problems.extend(
+        f"{label}={observed!r}, expected {pinned!r}"
+        for label, observed, pinned in version_checks
+        if observed != pinned
+    )
+    if not (inspect.isclass(ElasticBuffer) and ElasticBuffer.__name__ == "ElasticBuffer"):
+        problems.append("deep_ep.ElasticBuffer is absent")
+    # Any non-empty EP_SUPPRESS_NCCL_CHECK is rejected.
+    if os.environ.get("EP_SUPPRESS_NCCL_CHECK"):
+        problems.append("EP_SUPPRESS_NCCL_CHECK must be unset")
+    # The package metadata was checked above; this asks the mapped library itself.
+    nccl_runtime_version = _loaded_nccl_version()
+    if nccl_runtime_version != NCCL_VERSION:
+        problems.append(f"loaded NCCL={nccl_runtime_version!r}, expected {NCCL_VERSION!r}")
+    if problems:
+        raise RuntimeError("invalid DeepEP V2 runtime: " + "; ".join(problems))
+    return torch_version, nccl_runtime_version
+
+
+class DeepEPV2Backend:
+    """EP harness adapter for DeepEP V2's ``ElasticBuffer`` (bf16 dispatch and combine)."""
+
+    name = "deepep-v2"
+    combine_needs_redispatch = False
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Validate the runtime, build the buffer and record cross-rank-checked provenance."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+        self.group = dist.group.WORLD
+        torch_version, nccl_runtime_version = _require_runtime()
+        ladder, _ = ep_harness.token_ladder(args.tokens_ladder, args.phase, None)
+        conditioning = ep_harness.CONDITIONING_LADDERS[args.phase]
+        self.max_tokens = max([*ladder, *conditioning])
+        jit_root = Path(os.environ["EP_JIT_CACHE_DIR"])
+        allow_hybrid_mode = _configure_gin_mode(args, world_size)
+        gin_enabled = allow_hybrid_mode
+        communication_backend = "nccl-gin" if gin_enabled else "nccl-device-lsa"
+        self._deferred_jit_snapshot = None
+        # One value feeds both the buffer and the JIT-key cycle count so they cannot drift apart.
+        gpu_timeout_secs = 100
+        self.buffer = ElasticBuffer(
+            self.group,
+            num_max_tokens_per_rank=self.max_tokens,
+            hidden=args.hidden,
+            num_topk=args.topk,
+            use_fp8_dispatch=False,
+            deterministic=False,
+            allow_hybrid_mode=allow_hybrid_mode,
+            allow_multiple_reduction=True,
+            prefer_overlap_with_compute=True,
+            num_gpu_timeout_secs=gpu_timeout_secs,
+            explicitly_destroy=True,
+        )
+        tuning_num_experts = int(getattr(args, "num_logical_experts", args.experts))
+        self.num_sms = int(self.buffer.get_theoretical_num_sms(tuning_num_experts, args.topk))
+        self.num_qps = int(self.buffer.get_theoretical_num_qps(self.num_sms))
+        properties = torch.cuda.get_device_properties(device)
+        device_sms = int(properties.multi_processor_count)
+        jit_config = {
+            "num_sms": self.num_sms,
+            "num_qps": self.num_qps,
+            "allocated_qps": int(self.buffer.num_allocated_qps),
+            "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
+            "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
+            "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
+            "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
+            "is_scaleup_nvlink": self.buffer.num_scaleup_ranks == self.buffer.num_nvlink_ranks,
+            "device_arch_major": int(properties.major),
+            "device_arch_minor": int(properties.minor),
+            "device_sms": device_sms,
+            "device_smem_bytes": int(properties.shared_memory_per_block_optin),
+            "gpu_timeout_cycles": gpu_timeout_secs * int(properties.clock_rate) * 1000,
+        }
+        _require_cross_rank_equal(jit_config, "JIT configuration")
+        if not _lsa_topology_is_valid(gin_enabled, world_size, jit_config):
+            raise RuntimeError("DeepEP V2 no-GIN run is outside one realized LSA domain")
+        self.jit_cache_key = _jit_cache_key(
+            args, world_size, self.max_tokens, allow_hybrid_mode, jit_config
+        )
+        os.environ["EP_JIT_CACHE_DIR"] = str(jit_root / self.jit_cache_key)
+        realized_config = {
+            "jit_cache_key": self.jit_cache_key,
+            "num_max_tokens_per_rank": self.max_tokens,
+            **jit_config,
+        }
+        _require_cross_rank_equal(realized_config, "realized tuning/topology")
+        comm = getattr(self.buffer, "nccl_comm_handle", None)
+        communicator = "deepep-managed" if getattr(comm, "managed", True) else "pytorch-reused"
+
+        loaded_libraries = _loaded_library_evidence()
+        _require_cross_rank_equal(loaded_libraries, "loaded libraries")
+        self.backend_provenance = {
+            "deepep_version": DEEPEP_V2_VERSION,
+            "deepep_distribution_version": importlib.metadata.version("deep_ep"),
+            "deepep_commit": DEEPEP_V2_COMMIT,
+            "deepep_tree": DEEPEP_V2_TREE,
+            "deepep_pr": DEEPEP_V2_PR,
+            "deepep_fix_pr": DEEPEP_V2_FIX_PR,
+            "fmt_commit": DEEPEP_V2_FMT_COMMIT,
+            "api": "deep_ep.ElasticBuffer",
+            "api_signature_sha256": _api_sha256(),
+            "communication_backend": communication_backend,
+            "gin_enabled": gin_enabled,
+            "nccl_communicator": communicator,
+            "torch_version": torch_version,
+            "torch_git_version": str(torch.version.git_version),
+            "cuda_version": str(torch.version.cuda),
+            "nccl_package_version": importlib.metadata.version("nvidia-nccl-cu13"),
+            "nccl_version": nccl_runtime_version,
+            "nvshmem_package_version": importlib.metadata.version("nvidia-nvshmem-cu12"),
+            "loaded_libraries": loaded_libraries,
+            "jit_cache_key": self.jit_cache_key,
+            "jit_cubins": [],
+            "jit_random_seed": DEEPEP_V2_JIT_RANDOM_SEED,
+            "num_experts": int(args.experts),
+            "mode": "normal",
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "deterministic": False,
+            "resource_mode": "tuned",
+            "requested_num_sms": self.num_sms,
+            "tuning_num_experts": tuning_num_experts,
+            "num_sms": self.num_sms,
+            "num_qps": self.num_qps,
+            "allocated_qps": int(self.buffer.num_allocated_qps),
+            "device_sms": device_sms,
+            "sm_fraction": self.num_sms / device_sms,
+            "tuned_source": "deepep-v2-analytical-sm-qp-logical-experts-v1",
+            "num_max_tokens_per_rank": self.max_tokens,
+            "allow_hybrid_mode": bool(self.buffer.allow_hybrid_mode),
+            "allow_multiple_reduction": bool(self.buffer.allow_multiple_reduction),
+            "prefer_overlap_with_compute": bool(self.buffer.prefer_overlap_with_compute),
+            "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
+            "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
+            "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
+            "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
+        }
+
+    def buffer_cap(self, args):
+        """Return the per-rank token cap the buffer was allocated with."""
+        return self.max_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        """Wrap one rank's inputs, casting ids to ``deep_ep.topk_idx_t`` and weights to float32."""
+        return types.SimpleNamespace(
+            T=T, x=x,
+            topk_idx=idx.to(deep_ep.topk_idx_t),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        """Run one ElasticBuffer dispatch; the fifth returned value is discarded."""
+        recv_x, recv_topk_idx, recv_topk_weights, handle, _ = self.buffer.dispatch(
+            p.x,
+            topk_idx=p.topk_idx,
+            topk_weights=p.topk_weights,
+            num_experts=self.args.experts,
+            num_max_tokens_per_rank=self.max_tokens,
+            expert_alignment=1,
+            num_sms=self.num_sms,
+            num_qps=self.num_qps,
+            async_with_compute_stream=False,
+            do_handle_copy=True,
+            do_cpu_sync=True,
+            do_expand=False,
+        )
+        return types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_idx=recv_topk_idx,
+            recv_topk_weights=recv_topk_weights, handle=handle,
+        )
+
+    def stage(self, p, h):
+        """Use the received payload unchanged as the combine input."""
+        h.combine_input = h.recv_x
+
+    def combine(self, p, h):
+        """Combine the staged payload and return the first of the three results."""
+        combined_x, _, _ = self.buffer.combine(
+            h.combine_input,
+            handle=h.handle,
+            num_sms=self.num_sms,
+            num_qps=self.num_qps,
+            async_with_compute_stream=False,
+        )
+        return combined_x
+
+    def capture_deferred_provenance(self):
+        """Hash the JIT cache and store it once every rank reports identical evidence."""
+        # destroy() uses this same barrier. Materialize its JIT kernel before hashing the
+        # implementation so the first and later routing cases see identical evidence.
+        self.buffer.barrier(use_comm_stream=True, with_cpu_sync=True)
+        torch.cuda.synchronize()
+        jit_cubins = _jit_artifact_evidence()
+        _require_cross_rank_equal(jit_cubins, "JIT CUBINs")
+        if self._deferred_jit_snapshot is not None and jit_cubins != self._deferred_jit_snapshot:
+            raise RuntimeError("DeepEP V2 JIT CUBIN set changed after measurement")
+        self._deferred_jit_snapshot = jit_cubins
+        self.backend_provenance["jit_cubins"] = jit_cubins
+
+    def inspect_dispatch(self, p, h):
+        """Expose the received rows with global expert ids, masked weights and local counts."""
+        count = self.recv_tokens(h)
+        local_idx = h.recv_topk_idx[:count]
+        valid = local_idx >= 0
+        experts_per_rank = self.args.experts // self.world_size
+        expert_ids = torch.where(valid, local_idx + self.rank * experts_per_rank, local_idx)
+        local = local_idx[valid].to(torch.int64)
+        return types.SimpleNamespace(
+            payload=h.recv_x[:count],
+            expert_ids=expert_ids,
+            weights=h.recv_topk_weights[:count].masked_fill(~valid, 0),
+            local_expert_counts=torch.bincount(local, minlength=experts_per_rank),
+            ordering_contract="elastic-source-metadata-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine a caller-modified payload placed into a buffer shaped like ``h.recv_x``."""
+        # Zero-filled copy of the receive buffer; only the leading rows are overwritten.
+        combine_input = torch.zeros_like(h.recv_x)
+        combine_input[: transformed.shape[0]].copy_(transformed.to(combine_input.dtype))
+        combined, _, _ = self.buffer.combine(
+            combine_input,
+            handle=h.handle,
+            num_sms=self.num_sms,
+            num_qps=self.num_qps,
+            async_with_compute_stream=False,
+        )
+        return combined
+
+    def recv_tokens(self, h):
+        """Received token count: last entry of the handle's per-scale-up-rank prefix sum."""
+        return int(h.handle.psum_num_recv_tokens_per_scaleup_rank[-1].item())
+
+    def finalize(self, rc):
+        """Destroy the buffer and process group; a teardown failure is logged and returns 1."""
+        try:
+            dist.barrier()
+            self.buffer.destroy()
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:
+            print(f"[deepep-v2] teardown failed: {exc!r}", file=sys.stderr)
+            return 1
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 0000000000..ca9dee8fcf
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,1362 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`)
+implement a small duck-typed protocol; this module owns the source-tokens-per-rank
+sweep, the timing, the correctness gate, and the provenance-tagged JSON doc.
+
+Fair-comparison contract (see docs/methodology.md):
+ * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs +
+ gate weights are generated once from a fixed seed over the *global* batch and are
+ identical on every SKU; each rank materializes its slice. So every platform runs
+ the *same* problem (no per-rank/per-platform RNG in the adapters).
+ * **Explicit measurement contract**: layout-and-dispatch-v1 includes routing-layout
+ generation in dispatch timing. Combine excludes staging.
+ Isolated sum is derived independently at each percentile and is not a measured chained op.
+ * **Correct collective percentile**: each iteration's latency is reduced MAX across
+ ranks first (a collective finishes with its slowest rank), THEN percentiled —
+ `median_i(max_r)`, not `max_r(median_i)`.
+ * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and
+ `global_tokens = T * ep_size` are recorded as explicit chart coordinates.
+
+stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported
+lazily inside run_sweep) so this file `py_compile`s without torch.
+
+Backend protocol:
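+  name, mode, combine_needs_redispatch, backend_provenance(dict)
+  combine_weight_semantics                       # must be "unweighted-rank-sum" for v1
+  optional: roundtrip_only, kernel_generation, tolerance, capture_deferred_provenance()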
+ buffer_cap(args) -> int|None
+ make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice
+ dispatch(problem) -> handle # pure dispatch comm (timed)
+ stage(problem, handle) # untimed expert-output placement
+ combine(problem, handle) -> tensor # pure combine comm (timed)
+ inspect_dispatch(problem, handle) -> view # normalized payload/expert/weight metadata
+ combine_transformed(problem, handle, tensor) -> tensor
+ recv_tokens(handle) -> int # realized tokens received this rank
+ finalize(rc) -> int|NoReturn
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import math
+import os
+
+import contracts
+import identity
+import workload as workload_contract
+
+# Raw v1 result emitted by one benchmark case. Publication uses a separate contract.
+SCHEMA_VERSION = 1
+
+# Every comparison-grade EP point uses the same literal timing profile on every SKU/backend.
+# Eight timed iterations keep each MoRI burst well below its sustained-iteration wedge, 64 trials
+# provide 512 observations per operation, and 32 warmups meet Blackwell's measured clock-ramp floor.
+SAMPLING_CONTRACT = identity.V1_CASE_PROFILE["sampling_contract"]
+# Equals TIMED_ITERS_PER_TRIAL * TRIALS_PER_POINT (8 * 64).
+TIMED_SAMPLES_PER_POINT = 512
+# The three values below are the only iters/trials/warmup combination that
+# sampling_contract_error() accepts.
+TIMED_ITERS_PER_TRIAL = 8
+TRIALS_PER_POINT = 64
+WARMUP_ITERS_PER_TRIAL = 32
+WARMUP_SEMANTICS = "full-roundtrip-before-each-component-trial-point-v1"
+# Default for --seed.
+ROUTING_SEED = 67
+ROUTING_GENERATOR = workload_contract.GENERATOR_VERSION
+ACTIVATION_PROFILE = "canonical-counter-source-v3"
+ACTIVATION_GENERATOR = workload_contract.ACTIVATION_GENERATOR
+PLACEMENT = "packed"
+COMPONENT_ORDER_CONTRACT = "roundtrip-dispatch-activation-only-combine-v2"
+
+# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal
+# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a
+# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128]
+PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096]
+# Per-phase untimed conditioning shapes, copied into fresh lists so the shared
+# contract tables are not aliased.
+CONDITIONING_LADDERS = {
+    phase: list(ladder) for phase, ladder in contracts.V1_CONDITIONING_LADDERS.items()
+}
+CONDITIONING_ROUNDS_PER_SHAPE = contracts.V1_CONDITIONING_ROUNDS_PER_SHAPE
+CONDITIONING_CONTRACT = identity.V1_CASE_PROFILE["conditioning_contract"]
+ORACLE_CONTRACT = identity.V1_CASE_PROFILE["oracle_contract"]
+# Tolerances handed to torch.allclose when the expert oracle compares combined values.
+ORACLE_RTOL = 5e-2
+ORACLE_ATOL = 2e-2
+
+# Bytes per bf16 element.
+BF16_BYTES = 2
+EPLB_REDUNDANT_EXPERTS = 32
+# Tokens per rank of the reference trace used to estimate load for the EPLB plan.
+EPLB_REFERENCE_TOKENS_PER_RANK = 2048
+EPLB_PLANNER = "greedy-rank-major-v1"
+# Fixed v1 profile values. add_common_args() installs them as argparse defaults
+# (set_defaults) instead of exposing them as CLI flags.
+V1_PROFILE = {
+    "dispatch_dtype": "bf16",
+    "combine_dtype": "bf16",
+    "combine_quant_mode": "none",
+    "mode": "normal",
+    "measurement_contract": "layout-and-dispatch-v1",
+    "resource_mode": "tuned",
+    "placement": PLACEMENT,
+    "activation_profile": ACTIVATION_PROFILE,
+    "activation_generator": ACTIVATION_GENERATOR,
+    "routing_generator": ROUTING_GENERATOR,
+    "component_order_contract": COMPONENT_ORDER_CONTRACT,
+    "conditioning_contract": CONDITIONING_CONTRACT,
+    "eplb_reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK,
+    "eplb_redundant_experts": EPLB_REDUNDANT_EXPERTS,
+    "eplb_planner": EPLB_PLANNER,
+    # DeepEP/UCCL use this only as the fallback when their tuned default is not exported.
+    "num_sms": 24,
+}
+
+def format_collective_version(raw) -> str:
+    """Render an NCCL/RCCL version as a dotted string.
+
+    Integers are treated as packed version codes, tuples/lists are joined with
+    dots, ``None`` and ``""`` become ``"unknown"``, anything else is stringified.
+    """
+    if isinstance(raw, int):
+        # Codes below 10000 carry the major in the thousands place (single-digit minor);
+        # larger codes carry it in the ten-thousands place (two-digit minor).
+        major_unit = 1000 if raw < 10_000 else 10_000
+        major, remainder = divmod(raw, major_unit)
+        minor, patch = divmod(remainder, 100)
+        return f"{major}.{minor}.{patch}"
+    if isinstance(raw, (tuple, list)):
+        return ".".join(str(part) for part in raw)
+    if raw in (None, ""):
+        return "unknown"
+    return str(raw)
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """Add the varying v1 inputs; fixed profile values are not CLI axes.
+
+    Every key of V1_PROFILE is installed as a parser default via ``set_defaults``
+    (so it appears on the parsed namespace without a flag), then the per-case
+    flags are registered: workload shape, routing/EPLB, canonical workload
+    location, case identity, the sampling profile, and provenance/output.
+    ``--runner``, ``--topology-class`` and ``--out`` are required.
+    """
+    ap.set_defaults(**V1_PROFILE)
+    ap.add_argument("--phase", default="decode", choices=["decode", "prefill"],
+                    help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder")
+    ap.add_argument("--tokens-ladder", default="",
+                    help="space/comma-separated source-tokens-per-rank sweep; blank = phase default")
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)")
+    ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"])
+    # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical
+    # slots + balanced-place so per-rank load equalizes. A pure routing-trace transform
+    # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew.
+    ap.add_argument("--eplb", action="store_true",
+                    help="apply EPLB expert replication/placement to the routing trace")
+    # Canonical workloads consume pre-generated trace bytes instead of the
+    # seeded runtime generator, so a result is provably the SAME workload as another machine's
+    # (checksum match). Points at a dir of .npz/.manifest.json (make_workloads.py).
+    ap.add_argument("--workload-dir", default="",
+                    help="dir of canonical workload traces; empty = seeded runtime generation (dev)")
+    ap.add_argument("--case-id", default="")
+    ap.add_argument("--suite", default="")
+    ap.add_argument("--workload-name", default="")
+    ap.add_argument("--required-publication", default="")
+    ap.add_argument("--seed", type=int, default=ROUTING_SEED)
+    # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks +
+    # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us
+    # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within
+    # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless.
+    ap.add_argument("--warmup", type=int, default=WARMUP_ITERS_PER_TRIAL,
+                    help=f"untimed full roundtrips before each trial/point; fixed by "
+                         f"{SAMPLING_CONTRACT} to {WARMUP_ITERS_PER_TRIAL}")
+    ap.add_argument("--iters", type=int, default=TIMED_ITERS_PER_TRIAL,
+                    help=f"timed iterations per trial; fixed by {SAMPLING_CONTRACT} to "
+                         f"{TIMED_ITERS_PER_TRIAL}")
+    ap.add_argument("--trials", type=int, default=TRIALS_PER_POINT,
+                    help=f"timed trials; fixed by {SAMPLING_CONTRACT} to {TRIALS_PER_POINT}")
+    # provenance / output
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    # gpus-per-node=0 means one node containing the whole EP group.
+    ap.add_argument("--gpus-per-node", type=int, default=0)
+    ap.add_argument("--scale-up-domain", type=int, default=0, help="0 = gpus_per_node*ep (one domain)")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Resolve the tokens-per-rank sweep and split it at the backend buffer cap.
+
+    A non-blank ``spec`` (space- and/or comma-separated integers) wins over the
+    phase default. Values are de-duplicated, sorted ascending, and non-positive
+    entries are discarded. Returns ``(kept, dropped)`` where ``dropped`` holds the
+    points above ``cap`` so the caller can report them; with ``cap=None`` nothing
+    is dropped.
+    """
+    if spec and spec.strip():
+        requested = [int(token) for token in spec.replace(",", " ").split() if token]
+    elif phase == "decode":
+        requested = DECODE_LADDER
+    else:
+        requested = PREFILL_LADDER
+    points = sorted({t for t in requested if t > 0})
+    if cap is None:
+        return points, []
+    kept: list[int] = []
+    dropped: list[int] = []
+    for t in points:
+        (kept if t <= cap else dropped).append(t)
+    return kept, dropped
+
+
+def sampling_contract_error(iters: int, trials: int, warmup: int) -> str | None:
+    """Check the timing profile against the fixed sampling contract.
+
+    Returns ``None`` when ``(iters, trials, warmup)`` equals the module's
+    ``(TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL)``;
+    otherwise a message naming the required and the observed values.
+    """
+    want_iters = TIMED_ITERS_PER_TRIAL
+    want_trials = TRIALS_PER_POINT
+    want_warmup = WARMUP_ITERS_PER_TRIAL
+    if (iters, trials, warmup) == (want_iters, want_trials, want_warmup):
+        return None
+    # A non-positive iters or trials has no meaningful sample count.
+    sample_count = iters * trials if iters > 0 and trials > 0 else 'invalid'
+    return (
+        f"{SAMPLING_CONTRACT} requires exactly iters:trials:warmup="
+        f"{want_iters}:{want_trials}:{want_warmup} on every SKU/backend; got "
+        f"{iters}:{trials}:{warmup} "
+        f"({sample_count} timed samples)"
+    )
+
+
+def _stats_vec(xs: list[int]) -> dict:
+    """Summarize a per-rank count vector without dumping the full vector.
+
+    Returns ``min``, ``mean`` (3 decimals), ``max``, ``cv`` (population standard
+    deviation divided by the mean, 4 decimals; 0.0 when the mean is not positive),
+    ``empty_ranks`` (entries equal to 0), ``total`` and ``ranks`` (``len(xs)``).
+
+    An empty vector yields all-zero statistics with ``ranks == 0``; the divide-by-zero
+    guard is kept internal instead of being reported as a phantom rank.
+    """
+    ranks = len(xs)
+    if ranks == 0:
+        return {"min": 0, "mean": 0.0, "max": 0, "cv": 0.0,
+                "empty_ranks": 0, "total": 0, "ranks": 0}
+    total = sum(xs)
+    mean = total / ranks
+    variance = sum((x - mean) ** 2 for x in xs) / ranks
+    cv = (variance ** 0.5 / mean) if mean > 0 else 0.0
+    return {"min": min(xs), "mean": round(mean, 3),
+            "max": max(xs), "cv": round(cv, 4),
+            "empty_ranks": sum(1 for x in xs if x == 0), "total": total, "ranks": ranks}
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """Nearest-rank percentile of ``xs`` at ``q`` (0-100).
+
+    Returns the ``ceil(q/100 * n)``-th smallest sample, with the rank clamped to
+    ``[1, n]``; an empty input returns NaN.
+    """
+    if not xs:
+        return float("nan")
+    ordered = sorted(xs)
+    # Multiply before dividing: q / 100.0 * n can land just above an integer
+    # (0.07 * 100 == 7.000000000000001), which made ceil() select the next rank.
+    rank = math.ceil(q * len(ordered) / 100.0)
+    return ordered[max(0, min(len(ordered) - 1, rank - 1))]
+
+
+def _sha256_json(value) -> str:
+    """Hex SHA-256 of a canonical JSON encoding of ``value``.
+
+    The encoding sorts keys, uses compact separators, keeps non-ASCII characters
+    as-is and rejects NaN/Infinity (``json.dumps`` raises ``ValueError``).
+    """
+    canonical = json.dumps(
+        value, sort_keys=True, separators=(",", ":"), ensure_ascii=False, allow_nan=False
+    )
+    hasher = hashlib.sha256()
+    hasher.update(canonical.encode())
+    return hasher.hexdigest()
+
+
+def _series_provenance(provenance: dict) -> dict:
+    """Retain stable semantic build identity while keeping raw binaries diagnostic.
+
+    Thin delegate: the result is whatever ``contracts.series_provenance`` returns
+    for ``provenance``.
+    """
+    return contracts.series_provenance(provenance)
+
+
+def _write_bytes_atomic(path: str, payload: bytes) -> tuple[str, int]:
+    """Write ``payload`` to ``path`` via a temp file plus ``os.replace``.
+
+    The parent directory is created if needed. The bytes go to
+    ``<path>.tmp-<pid>``, are flushed and fsynced, and the temp file is then
+    renamed over ``path``. The temp file is removed on every exit path.
+
+    Returns ``(sha256 hex digest of payload, len(payload))``.
+    """
+    parent = os.path.dirname(os.path.abspath(path))
+    os.makedirs(parent, exist_ok=True)
+    scratch = f"{path}.tmp-{os.getpid()}"
+    try:
+        with open(scratch, "wb") as out:
+            out.write(payload)
+            out.flush()
+            os.fsync(out.fileno())
+        os.replace(scratch, path)
+    finally:
+        # After a successful replace the scratch name is already gone.
+        try:
+            os.unlink(scratch)
+        except FileNotFoundError:
+            pass
+    digest = hashlib.sha256(payload).hexdigest()
+    return digest, len(payload)
+
+
+def _write_json_atomic(path: str, value) -> tuple[str, int]:
+    """Serialize ``value`` as 2-space-indented JSON plus a trailing newline and write it atomically.
+
+    NaN/Infinity are rejected by ``json.dumps``. Returns what ``_write_bytes_atomic``
+    returns for the encoded bytes.
+    """
+    text = json.dumps(value, indent=2, ensure_ascii=False, allow_nan=False)
+    encoded = f"{text}\n".encode()
+    return _write_bytes_atomic(path, encoded)
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """Per-iteration CUDA-event latencies (µs) for THIS rank.
+
+    Without `pre`: times `fn()`. With `pre`: runs `pre()` UNTIMED each iteration (sync
+    before the start event so its GPU work can't bleed in), then times `fn(pre_result)`
+    — how combine is isolated when it consumes the dispatch state and needs a fresh
+    untimed dispatch+stage before every sample. Returns the raw per-iteration series;
+    the caller reduces across ranks per iteration before percentiling.
+
+    Args:
+        torch: the torch module, passed in by the caller.
+        fn: callable to time; called as `fn()` or, when `pre` is given, `fn(pre())`.
+        warmup: number of untimed iterations run first (negative values act as 0).
+        iters: number of timed samples to take.
+        pre: optional untimed setup callable whose return value is handed to `fn`.
+
+    Returns:
+        A list of `iters` latencies in microseconds.
+    """
+    def sample():
+        # Untimed setup first; the sync keeps its GPU work out of the timed window.
+        arg = pre() if pre is not None else None
+        if pre is not None:
+            torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True)
+        e = torch.cuda.Event(enable_timing=True)
+        s.record()
+        fn(arg) if pre is not None else fn()
+        e.record()
+        # Wait for the end event before reading the elapsed time.
+        torch.cuda.synchronize()
+        return s.elapsed_time(e) * 1000.0  # ms -> us
+
+    for _ in range(max(0, warmup)):
+        if pre is not None:
+            a = pre()
+            torch.cuda.synchronize()
+            fn(a)
+        else:
+            fn()
+        # sync EACH warmup iteration, not just once after the loop: the measured-roundtrip fn
+        # interleaves dispatch+combine on a backend's persistent comm buffer, so back-to-back
+        # un-synced warmup iterations let iter N+1's dispatch race iter N's combine (CUDA abort
+        # on a rank -> NCCL-watchdog SIGABRT). Cheap (warmup is small); timed samples already sync.
+        torch.cuda.synchronize()
+    return [sample() for _ in range(iters)]
+
+
+def kernel_generation(backend) -> str:
+    """Name the kernel family behind ``backend``.
+
+    A truthy ``backend.kernel_generation`` attribute wins; otherwise the family
+    is looked up from ``backend.name``, falling back to ``"n-a"``.
+    """
+    explicit = getattr(backend, "kernel_generation", None)
+    if not explicit:
+        families = {
+            "deepep": "v1",
+            "deepep-v2": "v2-elastic-buffer",
+            "deepep-hybrid": "hybrid",
+        }
+        return families.get(backend.name, "n-a")
+    return explicit
+
+
+def _reduce_vec(torch, dist, device, vals, op):
+    """All-reduce ``vals`` element-wise with ``op`` and return the result as Python floats.
+
+    The values travel as a float64 tensor on ``device``.
+    """
+    buffer = torch.tensor(vals, device=device, dtype=torch.float64)
+    dist.all_reduce(buffer, op=op)
+    return list(map(float, buffer.tolist()))
+
+
+def _reduce_int(torch, dist, device, v: int, op) -> int:
+    """All-reduce one integer with ``op`` and return the result.
+
+    ``v`` is coerced with ``int()`` and carried as a one-element int64 tensor on ``device``.
+    """
+    cell = torch.tensor([int(v)], device=device, dtype=torch.int64)
+    dist.all_reduce(cell, op=op)
+    reduced = cell.item()
+    return int(reduced)
+
+
+def _same_hash_across_ranks(torch, dist, device, digest: str) -> bool:
+    """Report whether every rank holds the same 64-hex-character digest.
+
+    The digest is split into eight 32-bit words; the words are MIN- and
+    MAX-reduced across ranks and the two results compared.
+    """
+    words = []
+    for start in range(0, 64, 8):
+        words.append(int(digest[start:start + 8], 16))
+    floor = torch.tensor(words, device=device, dtype=torch.int64)
+    ceiling = floor.clone()
+    dist.all_reduce(floor, op=dist.ReduceOp.MIN)
+    dist.all_reduce(ceiling, op=dist.ReduceOp.MAX)
+    # Equal MIN and MAX in every word means no rank disagreed.
+    return bool(torch.equal(floor, ceiling))
+
+
+def _tensor_sha256(*tensors) -> str:
+    """Hex SHA-256 over the raw bytes of ``tensors``, in argument order.
+
+    Each tensor is detached, made contiguous, moved to the CPU and converted to
+    numpy before its bytes are hashed.
+    """
+    hasher = hashlib.sha256()
+    for item in tensors:
+        host_copy = item.detach().contiguous().cpu()
+        hasher.update(host_copy.numpy().tobytes())
+    return hasher.hexdigest()
+
+
+def _normalized_expert_metadata(torch, expert_ids, weights):
+    """Put each row's expert slots into a canonical order.
+
+    Slots are stably sorted by expert ID within a row, with negative (sentinel)
+    IDs moved to the end. Returns ``(ids, weights)``: ``ids`` as int64 with every
+    negative entry rewritten to -1, ``weights`` as float32 permuted the same way
+    with sentinel slots zeroed.
+    """
+    ids64 = expert_ids.to(torch.int64)
+    # Sentinels get a large key so they sort after every real expert ID.
+    sort_keys = torch.where(expert_ids >= 0, ids64, torch.full_like(expert_ids, 1 << 30))
+    permutation = torch.argsort(sort_keys, dim=1, stable=True)
+    ordered_ids = torch.gather(ids64, 1, permutation)
+    ordered_weights = torch.gather(weights.to(torch.float32), 1, permutation)
+    is_sentinel = ordered_ids < 0
+    normalized_ids = ordered_ids.masked_fill(is_sentinel, -1)
+    normalized_weights = ordered_weights.masked_fill(is_sentinel, 0)
+    return normalized_ids, normalized_weights
+
+
+def _expert_transform(torch, payload, expert_ids, weights, combine_weight_semantics):
+    """Apply the synthetic per-expert affine transform to received rows.
+
+    Every expert ID maps to a deterministic scale and two offsets. For each row
+    these are summed over the row's valid slots, weighted by the gate weights
+    (slots with a negative ID contribute nothing), and applied to the payload;
+    the second offset is modulated by a fixed per-column pattern. The result is
+    cast back to ``payload.dtype``.
+
+    Raises:
+        ValueError: if ``combine_weight_semantics`` is not ``"unweighted-rank-sum"``.
+    """
+    if combine_weight_semantics != "unweighted-rank-sum":
+        raise ValueError("v1 requires unweighted rank-sum combine")
+    is_valid = expert_ids >= 0
+    ids = expert_ids.clamp(min=0).to(torch.int64)
+    gates = weights.to(torch.float32).masked_fill(~is_valid, 0)
+
+    def gated_row_sum(per_expert):
+        # Gate-weighted sum over a row's slots, kept as a column for broadcasting.
+        return (gates * per_expert).sum(dim=1, keepdim=True)
+
+    scale_total = gated_row_sum(((ids * 17 + 5) % 31 + 1).to(torch.float32) / 32)
+    bias_total = gated_row_sum((((ids * 29 + 7) % 37) - 18).to(torch.float32) / 64)
+    slope_total = gated_row_sum((((ids * 43 + 11) % 41) - 20).to(torch.float32) / 128)
+    column_index = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64)
+    column_pattern = (((column_index * 13) % 17) - 8).to(torch.float32) / 8
+    result = (
+        payload.float() * scale_total + bias_total + slope_total * column_pattern.unsqueeze(0)
+    )
+    return result.to(payload.dtype)
+
+
+def _expected_transformed_combine(torch, problem):
+    """Reference combine result computed from the source-side problem alone.
+
+    For every top-k slot, the expert's scale and offsets (derived from the expert
+    ID) are applied to ``problem.x`` and the gate-weighted outputs are accumulated
+    in float32, slot by slot in ascending order.
+    """
+    source = problem.x.float()
+    ids = problem.topk_idx.to(torch.int64)
+    gates = problem.topk_weights.to(torch.float32)
+    column_index = torch.arange(problem.x.shape[1], device=problem.x.device, dtype=torch.int64)
+    column_pattern = ((((column_index * 13) % 17) - 8).to(torch.float32) / 8).unsqueeze(0)
+    total = torch.zeros_like(problem.x, dtype=torch.float32)
+    for slot in range(ids.shape[1]):
+        slot_ids = ids[:, slot]
+        slot_gate = gates[:, slot].unsqueeze(1)
+        scale = (((slot_ids * 17 + 5) % 31 + 1).to(torch.float32) / 32).unsqueeze(1)
+        bias = ((((slot_ids * 29 + 7) % 37) - 18).to(torch.float32) / 64).unsqueeze(1)
+        slope = ((((slot_ids * 43 + 11) % 41) - 20).to(torch.float32) / 128).unsqueeze(1)
+        slot_output = source * scale + bias + slope * column_pattern
+        total.add_(slot_gate * slot_output)
+    return total
+
+
+def _run_expert_oracle(
+    torch,
+    routing,
+    backend,
+    problem,
+    global_idx,
+    global_weights,
+    rank: int,
+    experts_per_rank: int,
+    seed: int,
+):
+    """Verify one real dispatch/transform/combine without entering a timed region.
+
+    Flow: dispatch ``problem``, ask the adapter for a normalized view of what this
+    rank received, decode the source-token IDs from the received payload, rebuild
+    the expected IDs/weights/payload from the global trace for the experts this
+    rank owns (``expert // experts_per_rank == rank``), then push a synthetic
+    expert transform through the adapter's combine and compare it with an
+    independently derived reference.
+
+    If inspecting or decoding the dispatch raises, a normal stage+combine is run
+    on the handle and a result with every check False is returned; if that
+    follow-up also raises, the inspection error is re-raised chained to it.
+
+    Side effect: sets ``problem.recv_tokens``.
+
+    Returns:
+        A dict carrying the oracle contract name, ``passed``, the per-check
+        booleans under ``checks``, the ordering contract, SHA-256 digests of the
+        canonical source order and dispatch metadata, the receive count, the
+        error magnitudes and the tolerances used.
+    """
+    handle = backend.dispatch(problem)
+    torch.cuda.synchronize()
+    try:
+        view = backend.inspect_dispatch(problem, handle)
+        source_ids = routing.decode_source_ids(view.payload, seed)
+    except Exception as inspection_error:
+        # Still run stage+combine on the outstanding handle before reporting.
+        # NOTE(review): presumably this leaves the backend in a consistent state
+        # for the next dispatch -- confirm with the adapters.
+        try:
+            problem.recv_tokens = backend.recv_tokens(handle)
+            backend.stage(problem, handle)
+            backend.combine(problem, handle)
+            torch.cuda.synchronize()
+        except Exception as cleanup_error:
+            raise inspection_error from cleanup_error
+        return {
+            "contract": ORACLE_CONTRACT,
+            "passed": False,
+            "ordering_contract": "adapter-inspection-failed",
+            "order_sha256": None,
+            "dispatch_sha256": None,
+            "combine_weight_semantics": getattr(
+                backend, "combine_weight_semantics", "undeclared"
+            ),
+            "receive_count": 0,
+            "atol": ORACLE_ATOL,
+            "max_absolute_error": None,
+            "max_elementwise_relative_error": None,
+            "max_relative_error": None,
+            "max_weight_error": None,
+            "rtol": ORACLE_RTOL,
+            "checks": {
+                "combine_values": False,
+                "counts": False,
+                "metadata": False,
+                "multiplicity": False,
+                "payload": False,
+                "source_set": False,
+                "weights": False,
+            },
+        }
+
+    receive_count = int(view.payload.shape[0])
+    # The view must be (rows, hidden) with one ID and one weight per top-k slot.
+    shape_ok = (
+        view.payload.ndim == 2
+        and view.expert_ids.shape == (receive_count, problem.topk_idx.shape[1])
+        and view.weights.shape == view.expert_ids.shape
+    )
+    # Every decoded source ID must index a row of the global trace.
+    source_range = bool(
+        receive_count == 0
+        or ((source_ids >= 0) & (source_ids < global_idx.shape[0])).all().item()
+    )
+    if source_range:
+        expected_idx = global_idx.to(problem.x.device).index_select(0, source_ids)
+        expected_weights = global_weights.to(problem.x.device).index_select(0, source_ids)
+        # Only slots routed to an expert owned by this rank are expected here.
+        local = (expected_idx // experts_per_rank) == rank
+        expected_ids = torch.where(local, expected_idx, torch.full_like(expected_idx, -1))
+        expected_weights = expected_weights.masked_fill(~local, 0)
+        expected_payload = routing.activations_for_source_ids(
+            source_ids, problem.x.shape[1], seed, problem.x.dtype
+        )
+    else:
+        # Out-of-range IDs cannot be looked up; use placeholders (payload_ok and
+        # source_set_ok are already forced False by source_range).
+        expected_ids = torch.full_like(view.expert_ids, -1)
+        expected_weights = torch.zeros_like(view.weights)
+        expected_payload = torch.empty_like(view.payload)
+    actual_ids, actual_weights = _normalized_expert_metadata(
+        torch, view.expert_ids, view.weights
+    )
+    expected_ids, expected_weights = _normalized_expert_metadata(
+        torch, expected_ids, expected_weights
+    )
+    # Global source tokens with at least one top-k slot on an expert of this rank.
+    expected_sources = (
+        ((global_idx // experts_per_rank) == rank).any(dim=1).nonzero(as_tuple=True)[0]
+    ).to(problem.x.device)
+    # Each expected source must arrive exactly once.
+    source_set_ok = (
+        source_range
+        and source_ids.numel() == torch.unique(source_ids).numel()
+        and torch.equal(torch.sort(source_ids).values, expected_sources)
+    )
+    payload_ok = source_range and torch.equal(view.payload, expected_payload)
+    metadata_ok = shape_ok and torch.equal(actual_ids, expected_ids)
+    max_weight_error = (
+        float((actual_weights - expected_weights).abs().max().item())
+        if actual_weights.numel()
+        else 0.0
+    )
+    # Gate weights must match exactly.
+    weights_ok = max_weight_error == 0.0
+    valid_expected = expected_ids >= 0
+    expected_local = expected_ids[valid_expected] - rank * experts_per_rank
+    expected_counts = torch.bincount(expected_local, minlength=experts_per_rank)
+    counts_ok = torch.equal(
+        view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64)
+    )
+    multiplicity_ok = torch.equal(
+        (actual_ids >= 0).sum(dim=1), (expected_ids >= 0).sum(dim=1)
+    )
+    # Receive-slot assignment may use atomics and is not a semantic EP guarantee. Compare
+    # pre/post dispatch evidence in canonical source-token order without changing the native path.
+    canonical_order = torch.argsort(source_ids.to(torch.int64), stable=True)
+    canonical_sources = source_ids.to(torch.int64).index_select(0, canonical_order)
+    canonical_ids = actual_ids.to(torch.int64).index_select(0, canonical_order)
+    canonical_weights = actual_weights.index_select(0, canonical_order)
+    ordering_contract = f"canonical-source-id-v1/{view.ordering_contract}"
+    order_sha256 = _tensor_sha256(canonical_sources)
+    dispatch_sha256 = _tensor_sha256(
+        canonical_sources, canonical_ids, canonical_weights
+    )
+
+    problem.recv_tokens = receive_count
+    combine_weight_semantics = backend.combine_weight_semantics
+    transformed = _expert_transform(
+        torch, view.payload, actual_ids, actual_weights, combine_weight_semantics
+    )
+    combined = backend.combine_transformed(problem, handle, transformed)
+    torch.cuda.synchronize()
+    expected_combined = _expected_transformed_combine(torch, problem)
+    if combined.shape == expected_combined.shape and combined.numel():
+        absolute_error = (combined.float() - expected_combined).abs()
+        max_absolute_error = float(absolute_error.max().item())
+        # Largest absolute error relative to the largest expected magnitude.
+        max_relative_error = max_absolute_error / (
+            float(expected_combined.abs().max().item()) + 1e-6
+        )
+        # Per-element ratio, with the denominator floored at ORACLE_ATOL.
+        max_elementwise_relative_error = float(
+            (absolute_error / expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item()
+        )
+        combine_values_ok = bool(torch.allclose(
+            combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL
+        ))
+    elif combined.shape == expected_combined.shape:
+        # Matching empty tensors: nothing to compare.
+        max_absolute_error = 0.0
+        max_elementwise_relative_error = 0.0
+        max_relative_error = 0.0
+        combine_values_ok = True
+    else:
+        # Shape mismatch: no error magnitude can be computed.
+        max_absolute_error = None
+        max_elementwise_relative_error = None
+        max_relative_error = None
+        combine_values_ok = False
+    # Bound on max_relative_error; an adapter may override the 5e-2 default.
+    tolerance = float(getattr(backend, "tolerance", 5e-2))
+    checks = {
+        "combine_values": combine_values_ok,
+        "counts": counts_ok,
+        "metadata": metadata_ok,
+        "multiplicity": multiplicity_ok,
+        "payload": payload_ok,
+        "source_set": source_set_ok,
+        "weights": weights_ok,
+    }
+    return {
+        "contract": ORACLE_CONTRACT,
+        "passed": bool(
+            all(checks.values())
+            and ordering_contract
+            and max_relative_error is not None
+            and max_relative_error < tolerance
+        ),
+        "atol": ORACLE_ATOL,
+        "combine_weight_semantics": combine_weight_semantics,
+        "ordering_contract": ordering_contract,
+        "order_sha256": order_sha256,
+        "dispatch_sha256": dispatch_sha256,
+        "receive_count": receive_count,
+        "max_absolute_error": max_absolute_error,
+        "max_elementwise_relative_error": max_elementwise_relative_error,
+        "max_relative_error": max_relative_error,
+        "max_weight_error": max_weight_error,
+        "rtol": ORACLE_RTOL,
+        "checks": checks,
+    }
+
+
+def _histogram(xs: list[float], nbins: int = 40) -> dict:
+    """Equal-width histogram summary of ``xs``.
+
+    Returns ``{"n": 0}`` for an empty input. When every sample is identical the
+    result has a single count and unrounded ``min``/``max``; otherwise ``counts``
+    has ``nbins`` entries spanning ``[min, max]`` (the maximum falls in the last
+    bin) and ``min``/``max`` are rounded to 3 decimals.
+    """
+    if not xs:
+        return {"n": 0}
+    total = len(xs)
+    lo = min(xs)
+    hi = max(xs)
+    if hi <= lo:
+        return {"n": total, "min": lo, "max": hi, "bins": nbins, "counts": [total]}
+    width = hi - lo
+    last_bin = nbins - 1
+    tally = [0] * nbins
+    for sample in xs:
+        slot = int((sample - lo) / width * nbins)
+        # sample == hi computes slot == nbins; fold it into the last bin.
+        tally[slot if slot < last_bin else last_bin] += 1
+    return {"n": total, "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": tally}
+
+
+def _derive_publication_status(v: dict) -> str:
+    """Classify one raw attempt as ``"failed"``, ``"invalid"`` or ``"diagnostic"``.
+
+    ``"failed"``: execution did not complete. ``"invalid"``: it completed but
+    correctness did not pass, the measurement was not conformant, or the workload
+    identity is inconsistent. Everything else is ``"diagnostic"``; this function
+    never returns a higher status.
+    """
+    if v["execution_status"] != "complete":
+        return "failed"
+    sound = (
+        v["semantic_correctness"] == "pass"
+        and v["measurement_conformance"] == "conformant"
+        and v["workload_identity"] != "inconsistent"
+    )
+    # Per-case producers cannot prove exact matrix coverage, repeat stability, or controlled
+    # cohorts, so even a sound attempt stays diagnostic here.
+    return "diagnostic" if sound else "invalid"
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+ """Drive the source-tokens-per-rank sweep for one fully-specified line."""
+ sampling_error = sampling_contract_error(args.iters, args.trials, args.warmup)
+ if sampling_error:
+ if rank == 0:
+ print(f"ERROR: {sampling_error}")
+ return 2
+ import routing # torch-based; imported lazily so the module byte-compiles without torch
+ import eplb # stdlib planner + torch remap (the EPLB transform)
+
+ ep_size = world_size
+ # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
+ # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
+ # experts then remapped to physical (build_trace), so the whole sweep runs over the
+ # balanced physical placement with no adapter change.
+ eplb_on = getattr(args, "eplb", False)
+ num_logical = getattr(args, "num_logical_experts", args.experts)
+ if args.experts % ep_size != 0:
+ if rank == 0:
+ print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})")
+ return 2
+ experts_per_rank = args.experts // ep_size
+ if getattr(backend, "combine_weight_semantics", None) != "unweighted-rank-sum":
+ if rank == 0:
+ print("ERROR: v1 requires activation-only unweighted combine")
+ return 2
+
+ cap = backend.buffer_cap(args)
+ conditioning_ladder = CONDITIONING_LADDERS[args.phase]
+ if cap is not None and cap < conditioning_ladder[-1]:
+ if rank == 0:
+ print(f"ERROR: {backend.name} buffer cap {cap} cannot run the v1 conditioning ladder")
+ return 2
+ ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+ if rank == 0 and dropped:
+ print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+ f"(hidden={args.hidden}); not silently truncated.")
+ if not ladder:
+ if rank == 0:
+ print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+ return 2
+ MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM
+
+    # EPLB plan (once): estimate logical load from a fixed reference trace of
+    # EPLB_REFERENCE_TOKENS_PER_RANK tokens/rank, then replicate+place. Held fixed across all T
+    # (as real EPLB plans from an observed load estimate). build_trace builds the LOGICAL trace
+    # and remaps to physical when the plan is present; otherwise it's the identity (logical == physical).
+ eplb_plan = None
+ if eplb_on:
+ ref_idx, _ = routing.build_global_routing(
+ EPLB_REFERENCE_TOKENS_PER_RANK * ep_size,
+ num_logical,
+ args.topk,
+ args.routing,
+ args.seed,
+ )
+ load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
+ eplb_plan = eplb.build_plan(load, args.experts, ep_size)
+ if rank == 0:
+ print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
+ f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
+ f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
+ f"replicated (hottest {eplb_plan['max_replicas']}x)")
+
+ canonical = bool(getattr(args, "workload_dir", ""))
+ loaded_workload_ids, loaded_checksums = [], {}
+ if canonical:
+ import workload as _wl
+
+ def build_trace(gt):
+ # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
+ # provably the SAME workload as any other consuming the same files. else: seeded gen.
+ if canonical:
+ wid = _wl.compute_workload_id(
+ args.routing, args.hidden, args.topk, num_logical, ep_size, gt, args.seed
+ )
+ idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
+ idx_l = torch.from_numpy(idx_np).to(torch.int64)
+ w = torch.from_numpy(w_np).to(torch.float32)
+ if wid not in loaded_workload_ids:
+ loaded_workload_ids.append(wid)
+ loaded_checksums[wid] = man.get("checksums")
+ else:
+ idx_l, w = routing.build_global_routing(
+ gt, num_logical, args.topk, args.routing, args.seed
+ )
+ return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w
+
+ # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
+ # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
+ # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
+ # and is also cold-jump-safe for MoRI.
+ def warm_roundtrips(problem, count):
+ for _ in range(count):
+ handle = backend.dispatch(problem)
+ if not hasattr(problem, "recv_tokens"):
+ # Dynamic receive cardinality is stable for this fixed routing trace. Cache it
+ # during untimed conditioning so adapters never read a device scalar in timing.
+ problem.recv_tokens = backend.recv_tokens(handle)
+ backend.stage(problem, handle)
+ backend.combine(problem, handle)
+ torch.cuda.synchronize()
+
+ for wt in conditioning_ladder:
+ # Warm-only shapes need not have canonical manifests: they are never measured or emitted.
+ wi, ww = routing.build_global_routing(
+ wt * ep_size, num_logical, args.topk, args.routing, args.seed,
+ )
+ if eplb_plan is not None:
+ wi = eplb.remap_idx(wi, eplb_plan)
+ wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
+ wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16)
+ wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
+ warm_roundtrips(wp, CONDITIONING_ROUNDS_PER_SHAPE)
+ torch.cuda.synchronize()
+ dist.barrier()
+ # Setup may materialize deferred provenance such as DeepEP V2 JIT CUBINs.
+ # Resolve it after conditioning but before correctness or timed measurements.
+ capture_deferred_provenance = getattr(backend, "capture_deferred_provenance", None)
+ if capture_deferred_provenance is not None:
+ capture_deferred_provenance()
+ provenance_issues = contracts.backend_provenance_issues(
+ backend.name, backend.backend_provenance
+ )
+ if provenance_issues:
+ if rank == 0:
+ print(
+ f"ERROR: unpinned provenance {provenance_issues} "
+ f"in {backend.backend_provenance}"
+ )
+ return 4
+ elem_dispatch = BF16_BYTES
+
+ # ---- Pass 1: build each deterministic problem and run the expert oracle. ----
+ problems, gate, gts, global_traces, input_snapshots = {}, {}, {}, {}, {}
+ routing_hashes = set()
+ for T in ladder:
+ counts = [T] * ep_size
+ gt = T * ep_size
+ gts[T] = gt
+ idx_g, w_g = build_trace(gt)
+ rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
+ gpn = args.gpus_per_node or ep_size
+ rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T),
+ gpn, args.scale_up_domain or None)
+ rstats["source_token_stats"] = _stats_vec(counts)
+ routing_hashes.add(rstats["routing_hash"])
+ my_off, my_cnt = rank * T, T
+ idx_s = idx_g[my_off:my_off + my_cnt].contiguous()
+ w_s = w_g[my_off:my_off + my_cnt].contiguous()
+ x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16)
+ problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x)
+ input_snapshots[T] = (
+ problem.x.clone(), problem.topk_idx.clone(), problem.topk_weights.clone()
+ )
+ oracle = _run_expert_oracle(
+ torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank,
+ args.seed,
+ )
+ before_x, before_idx, before_weights = input_snapshots[T]
+ pre_input_unchanged = (
+ torch.equal(problem.x, before_x)
+ and torch.equal(problem.topk_idx, before_idx)
+ and torch.equal(problem.topk_weights, before_weights)
+ )
+ problems[T] = problem
+ global_traces[T] = (idx_g, w_g)
+ gate[T] = {
+ "rstats": rstats,
+ "recv_local": oracle["receive_count"],
+ "max_rel": oracle["max_relative_error"] or 0.0,
+ "local_ok": int(oracle["passed"]),
+ "oracle_pre": oracle,
+ "pre_input_unchanged": pre_input_unchanged,
+ }
+
+ # ---- Pass 2: every backend uses the same ascending point order and conditioning ramp.
+ # Per-iteration cross-rank MAX samples are pooled across trials. ----
+ disp_pool = {T: [] for T in ladder} # pooled per-iteration cross-rank MAX (dispatch)
+ comb_pool = {T: [] for T in ladder} # ... combine
+ rt_pool = {T: [] for T in ladder} # independently measured round trip
+ disp_trials = {T: [] for T in ladder}
+ comb_trials = {T: [] for T in ladder}
+ rt_trials = {T: [] for T in ladder}
+ order = list(ladder)
+ for _trial in range(args.trials):
+ for T in order:
+ problem = problems[T]
+ # Stateful paired APIs may expose only a measured round trip.
+ # Do not synthesize component latency from that measurement.
+ roundtrip_only = getattr(backend, "roundtrip_only", False)
+
+ def rt_once(p=problem):
+ hh = backend.dispatch(p)
+ backend.stage(p, hh)
+ return backend.combine(p, hh)
+
+ # Every available component starts after the same synchronized full-roundtrip warmup.
+ # Roundtrip is first on every backend because it is the comparison headline.
+ warm_roundtrips(problem, args.warmup)
+ rt_iters = time_us(torch, lambda p=problem: rt_once(p), 0, args.iters)
+ if roundtrip_only:
+ disp_iters = comb_iters = []
+ else:
+ warm_roundtrips(problem, args.warmup)
+ disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p),
+ 0, args.iters)
+
+ def prep(p=problem):
+ hh = backend.dispatch(p)
+ backend.stage(p, hh)
+ return hh
+ warm_roundtrips(problem, args.warmup)
+ if backend.combine_needs_redispatch:
+ comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+ 0, args.iters, pre=prep)
+ else:
+ hh = prep()
+ torch.cuda.synchronize()
+ comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+ 0, args.iters)
+ # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
+ if disp_iters:
+ reduced_dispatch = _reduce_vec(torch, dist, device, disp_iters, MAX)
+ reduced_combine = _reduce_vec(torch, dist, device, comb_iters, MAX)
+ disp_trials[T].append(reduced_dispatch)
+ comb_trials[T].append(reduced_combine)
+ disp_pool[T] += reduced_dispatch
+ comb_pool[T] += reduced_combine
+ reduced_roundtrip = _reduce_vec(torch, dist, device, rt_iters, MAX)
+ rt_trials[T].append(reduced_roundtrip)
+ rt_pool[T] += reduced_roundtrip
+
+ # ---- Pass 3: prove timed inputs were immutable and repeat the full oracle. ----
+ for T in ladder:
+ problem = problems[T]
+ before_x, before_idx, before_weights = input_snapshots[T]
+ input_unchanged = gate[T]["pre_input_unchanged"] and (
+ torch.equal(problem.x, before_x)
+ and torch.equal(problem.topk_idx, before_idx)
+ and torch.equal(problem.topk_weights, before_weights)
+ )
+ idx_g, w_g = global_traces[T]
+ post = _run_expert_oracle(
+ torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank,
+ args.seed,
+ )
+ pre = gate[T]["oracle_pre"]
+ order_stable = (
+ pre["ordering_contract"] == post["ordering_contract"]
+ and pre["order_sha256"] == post["order_sha256"]
+ and pre["dispatch_sha256"] == post["dispatch_sha256"]
+ )
+ gate[T].update({
+ "input_unchanged": input_unchanged,
+ "local_ok": int(pre["passed"] and post["passed"] and input_unchanged and order_stable),
+ "max_rel": max(pre["max_relative_error"] or 0.0, post["max_relative_error"] or 0.0),
+ "oracle_post": post,
+ "order_stable": order_stable,
+ })
+
+ # ---- Pass 4: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
+ def pcts(xs):
+ return ({"p50": percentile(xs, 50), "p90": percentile(xs, 90),
+ "p95": percentile(xs, 95), "p99": percentile(xs, 99)} if xs else None)
+
+ def component(percentiles, count, *, derived=False):
+ if percentiles is None:
+ return {"availability": "unavailable", "origin": None,
+ "percentiles_us": None, "sample_count": 0}
+ return {
+ "availability": "derived" if derived else "measured",
+ "origin": "derived-percentile-sum" if derived else "measured",
+ "percentiles_us": percentiles,
+ "sample_count": 0 if derived else count,
+ }
+ rows = []
+ all_anomalies = []
+ thr_rt = 3.0
+ for T in ladder:
+ gt = gts[T]
+ g = gate[T]
+ rstats = g["rstats"]
+ d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
+ dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
+ # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
+ # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
+ # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
+ isum = {key: dp[key] + cp[key] for key in dp} if dp and cp else None
+ recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
+ recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
+ recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
+ global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
+ max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
+ point_ok = bool(global_ok) and recv_total > 0
+ rank_evidence = [None] * world_size
+ dist.all_gather_object(
+ rank_evidence,
+ {
+ "input_unchanged": g["input_unchanged"],
+ "order_stable": g["order_stable"],
+ "post_timing": g["oracle_post"],
+ "pre_timing": g["oracle_pre"],
+ "rank": rank,
+ },
+ )
+ # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
+ # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
+ # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
+ token_rank_copies = rstats["routed_copies"]
+ H = args.hidden
+ throughput = {
+ percentile_name: gt / (latency_us * 1e-6)
+ for percentile_name, latency_us in rtp.items()
+ }
+ disp_bytes_l = token_rank_copies * H * elem_dispatch
+ comb_bytes_l = token_rank_copies * H * 2
+ # Contract-level anomalies are attached to the row and rolled into validity.
+ # roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99.
+ # roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained
+ # op can't finish faster than its slowest required component (sync semantics violated).
+ row_anoms = []
+ if isum and isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]:
+ row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T,
+ "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2),
+ "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt})
+ floor = max(dp["p50"], cp["p50"]) if dp and cp else None
+ if floor and rtp["p50"] > 0 and rtp["p50"] < 0.95 * floor:
+ row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T,
+ "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)})
+ all_anomalies.extend(row_anoms)
+ rows.append({
+ "anomalies": row_anoms,
+ "components": {
+ "combine": component(cp, len(c)),
+ "dispatch": component(dp, len(d)),
+ "isolated_sum": component(isum, 0, derived=True),
+ "roundtrip": component(rtp, len(rt)),
+ },
+ "correctness": {
+ "contract": ORACLE_CONTRACT,
+ "max_relative_error": max_rel,
+ "passed": point_ok,
+ "rank_evidence": rank_evidence,
+ "scope": "dispatch-metadata-and-transformed-combine",
+ },
+ "global_tokens": gt,
+ "logical_bytes": {
+ "combine": comb_bytes_l,
+ "dispatch": disp_bytes_l,
+ "roundtrip": disp_bytes_l + comb_bytes_l,
+ },
+ "receive": {
+ "max": recv_max,
+ "mean": recv_total / world_size,
+ "min": recv_min,
+ "total": recv_total,
+ },
+ "routing": {
+ "empty_expert_count": rstats["empty_expert_count"],
+ "empty_rank_count": rstats["empty_rank_count"],
+ "expert_assignment_rank_cv": rstats["expert_assignment_rank_cv"],
+ "expert_assignments_per_rank": rstats["expert_assignments_per_rank"],
+ "expert_load_cv": rstats["expert_load_cv"],
+ "expert_load_max": rstats["expert_load_max"],
+ "expert_load_mean": rstats["expert_load_mean"],
+ "expert_load_min": rstats["expert_load_min"],
+ "fanout_histogram": rstats["fanout_hist"],
+ "fanout_max": rstats["fanout_max"],
+ "fanout_mean": rstats["fanout_mean"],
+ "fanout_min": rstats["fanout_min"],
+ "hash": rstats["routing_hash"],
+ "hotspot_ratio": rstats["hotspot_ratio"],
+ "locality": rstats.get("locality"),
+ "payload_copies_per_rank": rstats["payload_copies_per_rank"],
+ "payload_rank_cv": rstats["payload_rank_cv"],
+ "routed_copies": rstats["routed_copies"],
+ "source_token_stats": rstats.get("source_token_stats"),
+ },
+ "sample_histograms": {
+ "dispatch": _histogram(d) if d else None,
+ "combine": _histogram(c) if c else None,
+ "roundtrip": _histogram(rt),
+ },
+ "token_rate_at_latency_percentile": throughput,
+ "tokens_per_rank": T,
+ })
+ if rank == 0:
+ component_log = (f"disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} "
+ f"comb {cp['p50']:6.1f}/{cp['p99']:6.1f} " if dp and cp
+ else "components=unavailable ")
+ print(f" T={T:<5} {component_log}"
+ f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(rt)} fanout={rstats['fanout_mean']:.2f} "
+ f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
+ f"correct={point_ok}")
+
+ # Cross-rank workload-identity proof: every rank must have built the SAME global routing
+ # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and
+ # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing.
+ trace_sig = hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()
+ routing_consistent = _same_hash_across_ranks(torch, dist, device, trace_sig)
+
+ # Capture again after correctness and timing so no lazily generated kernel can escape
+ # the implementation identity recorded in the artifact.
+ if capture_deferred_provenance is not None:
+ capture_deferred_provenance()
+
+ if rank != 0:
+ return 0
+
+ # status=valid requires correctness AND a proven-identical routing trace across ranks.
+ all_ok = bool(rows) and all(r["correctness"]["passed"] for r in rows) and routing_consistent
+
+ # Adapters never self-label official; status is derived from these gates.
+ prov = backend.backend_provenance
+ provenance_complete = contracts.provenance_complete(
+ prov,
+ backend.name,
+ getattr(args, "git_run", None),
+ image_digest=getattr(args, "image_digest", None),
+ image_verified=getattr(args, "image_digest_verified", False),
+ squash_sha256=getattr(args, "squash_sha256", None),
+ )
+ resource_profile = contracts.project_resource_profile(prov)
+ resource_conformance = resource_profile["conformance_class"]
+ # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
+ if canonical and loaded_workload_ids:
+ args.workload_id = identity.workload_id(
+ {
+ "members": [
+ {"checksums": loaded_checksums[member], "workload_id": member}
+ for member in sorted(loaded_workload_ids)
+ ]
+ }
+ )
+ args.workload_members = sorted(loaded_workload_ids)
+ args.workload_checksums = loaded_checksums
+ canonical_workload = bool(getattr(args, "workload_id", None))
+ activation_identity = workload_contract.compute_activation_identity(args.seed, args.hidden)
+ # EPLB identity covers replica placement, not only counts.
+ eplb_mapping_hash = None
+ if eplb_plan is not None:
+ eplb_mapping_hash = eplb.mapping_hash(eplb_plan)
+ anomaly_free = len(all_anomalies) == 0
+ validity = {
+ "execution_status": "complete" if rows else "failed",
+ "semantic_correctness": (
+ "pass" if rows and all(r["correctness"]["passed"] for r in rows) else "fail"
+ ),
+ "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
+ "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
+ "measurement_conformance": "conformant", # run_ep gate rejects nonconformant pre-run
+ "sampling_conformance": "conformant", # fixed-512-v1 gate rejects any other profile
+ "resource_conformance": resource_conformance,
+ "provenance_complete": provenance_complete,
+ # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above).
+ "anomaly_free": anomaly_free,
+ }
+ publication_status = _derive_publication_status(validity)
+
+ shape = { # FIXED line identity (no T, no per-backend resource knobs)
+ "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+ "experts_per_rank": experts_per_rank, "dispatch_dtype": "bf16",
+ "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
+ # V2 is reserved for the PR #605 ElasticBuffer adapter; package versions never imply it.
+ "kernel_gen": kernel_generation(backend),
+ "activation_profile": ACTIVATION_PROFILE,
+ "quant": {
+ "combine_input_dtype": "bf16",
+ "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"),
+ "combine_output_dtype": "bf16", "combine_quant_mode": "none",
+ "scale_layout": None,
+ },
+ }
+ generated_at = args.timestamp or _dt.datetime.now().astimezone().isoformat()
+ realized_placement = getattr(args, "realized_placement", None)
+ nodes = (
+ realized_placement["nodes"]
+ if realized_placement is not None
+ else int(os.environ.get("SLURM_NNODES", "1"))
+ )
+ case_factors = {
+ "case": {
+ "backend": backend.name,
+ "canonical": canonical,
+ "eplb": bool(eplb_plan),
+ "ep": ep_size,
+ "experts": num_logical,
+ "gpus_per_node": args.gpus_per_node or ep_size,
+ "hidden": args.hidden,
+ "ladder": " ".join(map(str, ladder)),
+ "nodes": nodes,
+ "phase": args.phase,
+ "required_publication": args.required_publication or "diagnostic",
+ "routing": args.routing,
+ "samples_per_point": TIMED_SAMPLES_PER_POINT,
+ "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size),
+ "suite": args.suite or "manual",
+ "timing": f"{args.iters}:{args.trials}:{args.warmup}",
+ "topk": args.topk,
+ "warmup_semantics": WARMUP_SEMANTICS,
+ "workload": args.workload_name or "manual",
+ },
+ "profile": identity.V1_CASE_PROFILE,
+ "sku": args.runner,
+ }
+ computed_case_id = identity.digest("case", case_factors)
+ if args.case_id and args.case_id != computed_case_id:
+ raise ValueError(
+ f"scheduled case ID does not match realized factors: {args.case_id} != {computed_case_id}"
+ )
+ case_identifier = args.case_id or computed_case_id
+ git_run = getattr(args, "git_run", None) or {}
+ allocation_factors = {
+ "artifact": git_run.get("artifact"),
+ "execution_id": getattr(args, "allocation_execution_id", None),
+ "job": git_run.get("job"),
+ "repo": git_run.get("repo"),
+ "run_attempt": git_run.get("run_attempt"),
+ "run_id": git_run.get("run_id"),
+ "runner": args.runner,
+ "source_sha": git_run.get("source_sha"),
+ }
+ allocation_identifier = identity.allocation_id(allocation_factors)
+ try:
+ attempt_ordinal = int(os.environ.get("CX_ATTEMPT_ID", "1"))
+ except ValueError:
+ attempt_ordinal = 0
+ if attempt_ordinal <= 0:
+ raise ValueError("CX_ATTEMPT_ID must be a positive integer")
+ attempt_identifier = identity.attempt_id(
+ allocation=allocation_identifier, case=case_identifier, ordinal=attempt_ordinal
+ )
+ runtime_fingerprint = getattr(args, "runtime_fingerprint", None) or {}
+ implementation_contract = {
+ "kernel_generation": kernel_generation(backend),
+ "name": backend.name,
+ "provenance": _series_provenance(backend.backend_provenance),
+ "resource_profile": resource_profile,
+ }
+ public_config = contracts.public_series_config(
+ kernel_generation=implementation_contract["kernel_generation"],
+ provenance=backend.backend_provenance,
+ resource_profile=resource_profile,
+ resource_mode=args.resource_mode,
+ device_product=getattr(args, "runtime_device_product", None),
+ )
+ series_factors = {
+ "backend": backend.name,
+ "implementation_contract_sha256": _sha256_json(implementation_contract),
+ "public_config_sha256": contracts.public_series_config_sha256(public_config),
+ "routing_control_sha256": contracts.routing_implementation_control_sha256(
+ implementation_contract
+ ),
+ "case_id": case_identifier,
+ "image_digest": getattr(args, "image_digest", None),
+ "runtime_fingerprint_sha256": _sha256_json(runtime_fingerprint),
+ "source_sha": git_run.get("source_sha"),
+ "squash_sha256": getattr(args, "squash_sha256", None),
+ "workload_id": getattr(args, "workload_id", None) or trace_sig,
+ }
+ series_identifier = identity.series_id(series_factors)
+
+ sample_points = []
+ for row in rows:
+ token_count = row["tokens_per_rank"]
+
+ def sampled_component(trials):
+ return {
+ "availability": "measured" if trials else "unavailable",
+ "sample_count": sum(len(trial) for trial in trials),
+ "trials": trials if trials else None,
+ }
+
+ sample_point = {
+ "components": {
+ "combine": sampled_component(comb_trials[token_count]),
+ "dispatch": sampled_component(disp_trials[token_count]),
+ "roundtrip": sampled_component(rt_trials[token_count]),
+ },
+ "tokens_per_rank": token_count,
+ }
+ sample_sha256 = _sha256_json(sample_point)
+ point_identifier = identity.point_id(
+ series=series_identifier, tokens_per_rank=token_count
+ )
+ evidence_identifier = identity.evidence_id(
+ point=point_identifier,
+ allocation=allocation_identifier,
+ attempt=attempt_identifier,
+ sample_sha256=sample_sha256,
+ )
+ sample_point.update(
+ {
+ "evidence_id": evidence_identifier,
+ "point_id": point_identifier,
+ "sample_sha256": sample_sha256,
+ }
+ )
+ sample_points.append(sample_point)
+ row.update({
+ "evidence_id": evidence_identifier,
+ "point_id": point_identifier,
+ "sample_sha256": sample_sha256,
+ })
+
+ samples_path = args.out[:-5] + ".samples.json" if args.out.endswith(".json") else args.out + ".samples.json"
+ samples_document = {
+ "allocation_id": allocation_identifier,
+ "attempt_id": attempt_identifier,
+ "case_id": case_identifier,
+ "format": "collectivex.samples.v1",
+ "points": sample_points,
+ "sampling": {
+ "iterations_per_trial": args.iters,
+ "reduction": identity.V1_CASE_PROFILE["rank_reduction"],
+ "trials": args.trials,
+ },
+ "schema_version": 1,
+ "series_id": series_identifier,
+ }
+ samples_payload = contracts.canonical_json_bytes(samples_document)
+ samples_sha256 = hashlib.sha256(samples_payload).hexdigest()
+ samples_bytes = len(samples_payload)
+ sample_artifact = {
+ "bytes": samples_bytes,
+ "format": "collectivex.samples.v1",
+ "path": os.path.basename(samples_path),
+ "sha256": samples_sha256,
+ }
+ headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+ eplb_record = (
+ {
+ "enabled": True,
+ "imbalance_after": eplb_plan["imbalance_after"],
+ "imbalance_before": eplb_plan["imbalance_before"],
+ "mapping_hash": eplb_mapping_hash,
+ "max_replicas": eplb_plan["max_replicas"],
+ "num_logical_experts": num_logical,
+ "num_physical_experts": args.experts,
+ "num_redundant": args.experts - num_logical,
+ "planner": EPLB_PLANNER,
+ "reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK,
+ "replicated_experts": eplb_plan["replicated_experts"],
+ }
+ if eplb_plan
+ else {
+ "enabled": False,
+ "imbalance_after": None,
+ "imbalance_before": None,
+ "mapping_hash": None,
+ "max_replicas": None,
+ "num_logical_experts": num_logical,
+ "num_physical_experts": args.experts,
+ "num_redundant": 0,
+ "planner": None,
+ "reference_tokens_per_rank": None,
+ "replicated_experts": 0,
+ }
+ )
+ doc = {
+ "format": "collectivex.ep.v1",
+ "schema_version": SCHEMA_VERSION,
+ "record_type": "case-attempt",
+ "generated_at": generated_at,
+ "identity": {
+ "allocation_factors": allocation_factors,
+ "allocation_id": allocation_identifier,
+ "attempt_id": attempt_identifier,
+ "attempt_ordinal": attempt_ordinal,
+ "case_factors": case_factors,
+ "case_id": case_identifier,
+ "series_factors": series_factors,
+ "series_id": series_identifier,
+ },
+ "case": {
+ "attempt_ordinal": attempt_ordinal,
+ "backend": backend.name,
+ "eplb": eplb_record,
+ "ep_size": ep_size,
+ "mode": "normal",
+ "phase": args.phase,
+ "required_publication": args.required_publication or "diagnostic",
+ "resource_mode": "tuned",
+ "runner": args.runner,
+ "shape": shape,
+ "suite": args.suite or "manual",
+ "workload_name": args.workload_name or "manual",
+ },
+ "workload": {
+ "activation_generator": ACTIVATION_GENERATOR,
+ "activation_identity": activation_identity,
+ "activation_profile": ACTIVATION_PROFILE,
+ "cross_rank_consistent": routing_consistent,
+ "manifest_checksums": getattr(args, "workload_checksums", None),
+ "members": getattr(args, "workload_members", None),
+ "routing_generator": ROUTING_GENERATOR,
+ "source": validity["workload_source"],
+ "trace_hashes": sorted(routing_hashes),
+ "trace_signature": trace_sig,
+ "workload_id": getattr(args, "workload_id", None),
+ },
+ "measurement": {
+ "component_order_contract": COMPONENT_ORDER_CONTRACT,
+ "conditioning": {
+ "contract": CONDITIONING_CONTRACT,
+ "ladder": conditioning_ladder,
+ "roundtrips_per_shape": CONDITIONING_ROUNDS_PER_SHAPE,
+ },
+ "contract": "layout-and-dispatch-v1",
+ "rows": rows,
+ "sampling": {
+ "contract": SAMPLING_CONTRACT,
+ "iterations_per_trial": args.iters,
+ "percentile_method": identity.V1_CASE_PROFILE["percentile_method"],
+ "reduction": identity.V1_CASE_PROFILE["rank_reduction"],
+ "samples_per_component": TIMED_SAMPLES_PER_POINT,
+ "trials": args.trials,
+ "warmup_iterations": args.warmup,
+ "warmup_semantics": WARMUP_SEMANTICS,
+ },
+ "source_allocation": "even",
+ },
+ "implementation": {
+ "kernel_generation": kernel_generation(backend),
+ "name": backend.name,
+ "provenance": backend.backend_provenance,
+ "resource_profile": resource_profile,
+ },
+ "topology": {
+ "device_count": getattr(args, "runtime_device_count", None),
+ "device_product": getattr(args, "runtime_device_product", None),
+ "gpus_per_node": args.gpus_per_node or ep_size,
+ "nodes": nodes,
+ "placement": "packed",
+ "realized_placement": realized_placement,
+ "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size),
+ "topology_class": args.topology_class,
+ "transport": args.transport,
+ "world_size": world_size,
+ },
+ "runtime_fingerprint": runtime_fingerprint,
+ "provenance": {
+ "command": getattr(args, "reproduction_command", ""),
+ "distributed_launcher": getattr(args, "distributed_launcher", None),
+ "git_run": getattr(args, "git_run", None),
+ "image": {
+ "arch": getattr(args, "image_arch", None),
+ "digest": getattr(args, "image_digest", "") or None,
+ "digest_verified": getattr(args, "image_digest_verified", False),
+ "reference": getattr(args, "image", "") or None,
+ "squash_sha256": getattr(args, "squash_sha256", None),
+ },
+ "redaction": "sanitized-v1",
+ },
+ "sample_artifact": sample_artifact,
+ "outcome": {
+ "publication_status": publication_status,
+ "reasons": [] if all_ok else ["semantic correctness or routing identity failed"],
+ "status": "success" if all_ok else "invalid",
+ "validity": validity,
+ },
+ }
+ contracts.validate_raw_document(doc, samples_document)
+ _write_bytes_atomic(samples_path, samples_payload)
+ _write_json_atomic(args.out, doc)
+ dispatch_percentiles = headline["components"]["dispatch"]["percentiles_us"]
+ dispatch_p99 = dispatch_percentiles["p99"] if dispatch_percentiles else None
+ component_summary = (f"disp_p99={dispatch_p99:.1f}us "
+ if dispatch_p99 is not None
+ else "components=unavailable ")
+ print(f"{backend.name} ep-dispatch-combine [{args.phase}/normal/layout-and-dispatch-v1]: "
+ f"status={doc['outcome']['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
+ f"headline T={headline['tokens_per_rank']} {component_summary}"
+ f"-> {args.out}")
+ # A complete invalid document is still a successfully captured terminal outcome. Launchers
+ # inspect its status to fail the case without conflating it with an execution failure.
+ return 0
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 0000000000..7f99990253
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""CollectiveX MoRI adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import os
+from pathlib import Path
+import re
+import sys
+import types
+
+# MoRI registers the whole symmetric heap at import time.
+os.environ["MORI_SHMEM_HEAP_SIZE"] = "2G"
+
+import torch
+import torch.distributed as dist
+
+try:
+ import mori # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: mori import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+def _project_local_metadata(torch_module, raw_expert_ids, raw_weights, rank, experts_per_rank):
+    """Mask ids/weights of experts not owned by ``rank``; also return owned ids rebased to 0."""
+    first = rank * experts_per_rank
+    end = first + experts_per_rank
+    owned = (raw_expert_ids >= first) & (raw_expert_ids < end)
+    # Slots routed to other ranks' experts become -1 (ids) and 0 (weights); shapes are kept.
+    sentinel = torch_module.full_like(raw_expert_ids, -1)
+    masked_ids = torch_module.where(owned, raw_expert_ids, sentinel)
+    masked_weights = torch_module.where(owned, raw_weights, torch_module.zeros_like(raw_weights))
+    return masked_ids, masked_weights, raw_expert_ids[owned] - first
+
+
+def _mori_source_commit() -> str:
+    """Return the 40-hex commit stored in the nearest ancestor ``.git/HEAD`` of the mori package."""
+    for head_file in (p / ".git" / "HEAD" for p in Path(mori.__file__).resolve().parents):
+        if head_file.is_symlink() or not head_file.is_file() or head_file.stat().st_size > 128:
+            continue  # no small, regular HEAD file at this level; keep walking upward
+        commit = head_file.read_text(encoding="ascii").strip()
+        if re.fullmatch(r"[0-9a-f]{40}", commit) is None:
+            raise RuntimeError("MoRI image source is not pinned to a detached commit")
+        return commit
+    raise RuntimeError("MoRI image source revision is unavailable")
+
+
+class MoRIBackend:  # CollectiveX backend adapter over mori.ops.EpDispatchCombineOp (BF16, mode "normal")
+    name = "mori"
+    combine_needs_redispatch = True  # the timing loop runs a fresh dispatch+stage before each timed combine
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):  # local_rank is accepted but unused here
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+
+        self.ep_size = world_size  # the EP group spans every rank in the world
+        self.experts_per_rank = args.experts // self.ep_size  # floor division; a remainder would be dropped
+        device_cus = torch.cuda.get_device_properties(device).multi_processor_count  # used for provenance/sm_fraction
+        self.block_num = self._block_target = 80  # intranode launch defaults; the AsyncLL branch replaces them
+        self._block_floored = False  # never changed in this class; reported verbatim in provenance
+        self._tuned_source = "default-80"
+        self.dispatch_warps = 16
+        self.combine_warps = 8
+
+        # MI355X uses the direct intranode kernel. MI325X uses MoRI's split
+        # AsyncLL send/receive kernel as its normal-mode XGMI transport.
+        kernel_request = os.environ.get("CX_MORI_KERNEL_TYPE", "intranode").strip().lower()  # unset means intranode
+        self._kernel_type = None  # stays None unless AsyncLL is requested
+        self._kernel_type_label = "IntraNode"
+        self._async_ll = False
+        if kernel_request in ("asyncll", "async_ll", "async-ll"):
+            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
+            if kernel_enum is None or not hasattr(kernel_enum, "AsyncLL"):  # installed MoRI lacks the AsyncLL kernel
+                raise RuntimeError(
+                    "CX_MORI_KERNEL_TYPE=asyncll requires "
+                    "EpDispatchCombineKernelType.AsyncLL"
+                )
+            self._kernel_type = kernel_enum.AsyncLL
+            self._kernel_type_label = "AsyncLL"
+            self._async_ll = True
+            self.block_num = self._block_target = 64
+            self.dispatch_warps = self.combine_warps = 8
+            self._tuned_source = "upstream-asyncll-64x8-external-input"
+        elif kernel_request not in ("intranode", "intra_node", "intra-node", ""):  # empty string also selects intranode
+            raise RuntimeError(
+                f"unknown CX_MORI_KERNEL_TYPE={kernel_request!r} (expected intranode|asyncll)"
+            )
+        self.kernel_generation = "async-ll" if self._async_ll else "intranode"
+
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)  # private torch API; name reused below
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        self._cap = self.buffer_cap(args)
+        config_kwargs = {
+            "data_type": torch.bfloat16,
+            "rank": rank,
+            "world_size": world_size,
+            "hidden_dim": args.hidden,
+            "scale_dim": 0,
+            "scale_type_size": 1,
+            "max_token_type_size": torch.tensor([], dtype=torch.float32).element_size(),  # float32 element size in bytes
+            "max_num_inp_token_per_rank": max(512, self._cap),  # never below 512
+            "num_experts_per_rank": self.experts_per_rank,
+            "num_experts_per_token": args.topk,
+            "use_external_inp_buf": self._async_ll,  # external input buffer only on the AsyncLL path
+            "quant_type": "none",
+        }
+        if self._async_ll:  # kernel type and launch shape go into the config only for AsyncLL
+            config_kwargs["kernel_type"] = self._kernel_type
+            config_kwargs["max_total_recv_tokens"] = 0
+            config_kwargs["block_num"] = self.block_num
+            config_kwargs["warp_num_per_block"] = self.dispatch_warps
+        self.config = mori.ops.EpDispatchCombineConfig(**config_kwargs)
+        if self._async_ll and (  # verify the config object reports the launch shape that was requested
+            self.config.block_num != self.block_num
+            or self.config.warp_num_per_block != self.dispatch_warps
+        ):
+            raise RuntimeError("MoRI AsyncLL launch configuration was not realized")
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+
+        expected_mori_commit = os.environ.get("MORI_COMMIT")  # optional pin; compared only when set and non-empty
+        mori_commit = _mori_source_commit()
+        if expected_mori_commit and mori_commit != expected_mori_commit:
+            raise RuntimeError("MoRI image source revision differs from canonical provenance")
+        self.backend_provenance = {  # plain dict describing the realized implementation and its tuning
+            "mori_commit": mori_commit,
+            "api": (
+                "mori.ops.EpDispatchCombineOp/external-input"
+                if self._async_ll
+                else "mori.ops.EpDispatchCombineOp/registered-input"
+            ),
+            "mode": "normal",
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "kernel_type": self._kernel_type_label,
+            "enable_sdma": os.environ.get("MORI_ENABLE_SDMA"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "max_total_recv_tokens": config_kwargs.get("max_total_recv_tokens"),  # None on the intranode path
+            "num_qps": 1,  # constant recorded here; not part of config_kwargs above
+            "resource_mode": "tuned",
+            "block_num": self.block_num,
+            "block_num_target": self._block_target,
+            "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps,
+            "combine_warps": self.combine_warps,
+            "device_cus": device_cus,
+            "sm_fraction": None if self._async_ll else self.block_num / device_cus,  # block_num over CU count
+            "tuned_source": self._tuned_source,
+        }
+
+    def buffer_cap(self, args):  # fixed capacity of 512; args is ignored
+        return 512
+
+    def make_problem(self, T, idx, weights, x):  # bundle one rank's inputs: indices -> int32, weights -> float32
+        indices = idx.to(torch.int32)
+        gate_weights = weights.to(torch.float32)
+        return types.SimpleNamespace(
+            T=T,
+            x=x,
+            topk_idx=indices,
+            topk_weights=gate_weights,
+            indices=indices,  # same tensor object as topk_idx
+            weights=gate_weights,  # same tensor object as topk_weights
+            scales=torch.empty((T, 0), dtype=torch.uint8, device=self.device),  # zero-width; config scale_dim is 0
+        )
+
+    def dispatch(self, p):  # run MoRI dispatch; returns the handle used by stage/combine/inspect_dispatch
+        dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num = (
+            self.op.dispatch(
+                p.x,
+                p.weights,
+                p.scales,
+                p.indices,
+                block_num=self.block_num,
+                warp_per_block=self.dispatch_warps,
+            )
+        )
+        if self._async_ll:  # AsyncLL is a split send/receive kernel: run the receive half as well
+            self.op.dispatch_recv(warp_per_block=self.dispatch_warps)
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output,
+            dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices,
+            recv_num=recv_num[0],  # kept as a tensor element; recv_tokens() converts it with .item()
+            combine_input=dispatch_output.to(torch.bfloat16),
+        )
+
+    def stage(self, p, h):  # validate p.recv_tokens, then (intranode only) copy rows into the registered buffer
+        rows = getattr(p, "recv_tokens", None)  # must be set on p by the caller; make_problem does not set it
+        if not isinstance(rows, int) or rows < 0 or rows > h.combine_input.size(0):
+            raise RuntimeError("MoRI receive count was not validated before staging")
+        if self._async_ll:  # AsyncLL path: combine_input is used as-is, no registered-buffer copy
+            return None
+        buffer = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1)
+        )
+        buffer[:rows, :].copy_(h.combine_input[:rows, :])  # only the first `rows` rows are copied
+        h.combine_input = buffer  # combine() will now read from the registered buffer
+
+    def combine(self, p, h):  # run MoRI combine on h.combine_input and return the first p.T rows
+        combine_indices = p.indices if self._async_ll else h.dispatch_indices  # AsyncLL uses the problem's own indices
+        combined, _weights = self.op.combine(
+            h.combine_input,
+            None,
+            combine_indices,
+            block_num=self.block_num,
+            warp_per_block=self.combine_warps,
+        )
+        if self._async_ll:  # split kernel: run the receive half of combine as well
+            self.op.combine_recv(warp_per_block=self.combine_warps)
+        return combined[:p.T]
+
+    def inspect_dispatch(self, p, h):  # expose received payload plus metadata masked to this rank's experts
+        count = self.recv_tokens(h)
+        if h.dispatch_weights is None:
+            raise RuntimeError("MoRI dispatch did not expose gate weights")
+        if count < 0 or any(  # the count must fit inside every tensor that is sliced with it below
+            tensor.ndim == 0 or count > tensor.size(0)
+            for tensor in (h.dispatch_output, h.dispatch_indices, h.dispatch_weights)
+        ):
+            raise RuntimeError("MoRI receive count exceeds dispatch metadata")
+        raw_expert_ids = h.dispatch_indices[:count].to(torch.int64)
+        expert_ids, weights, local_expert_ids = _project_local_metadata(
+            torch,
+            raw_expert_ids,
+            h.dispatch_weights[:count].to(torch.float32),
+            self.rank,
+            self.experts_per_rank,
+        )
+        return types.SimpleNamespace(
+            payload=h.dispatch_output[:count],
+            expert_ids=expert_ids,
+            weights=weights,
+            local_expert_counts=torch.bincount(  # per-local-expert count of routed slots on this rank
+                local_expert_ids, minlength=self.experts_per_rank
+            ),
+            ordering_contract="mori-global-topk-masked-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):  # stage a caller-supplied tensor (cast to bf16), then combine
+        h.combine_input = transformed.to(torch.bfloat16)
+        self.stage(p, h)
+        return self.combine(p, h)
+
+    def recv_tokens(self, h):  # received-token count from the dispatch handle, as a Python int
+        return int(h.recv_num.item())
+
+    def finalize(self, rc):  # best-effort barrier, flush stdio, then hard-exit the process; never returns
+        try:
+            dist.barrier()
+        except Exception:  # a failed barrier is deliberately ignored so the process still exits
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(rc if 0 <= rc <= 255 else 1)  # exit codes outside 0..255 are mapped to 1
diff --git a/experimental/CollectiveX/tests/ep_nccl.py b/experimental/CollectiveX/tests/ep_nccl.py
new file mode 100644
index 0000000000..327a4063f8
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_nccl.py
@@ -0,0 +1,177 @@
+"""CollectiveX NCCL all-to-all expert-parallel reference backend.
+
+The canonical "token-shuffle" EP built on torch.distributed's NCCL ``all_to_all_single``. Like the
+DeepEP-family APIs, dispatch sends one hidden-state copy to each distinct destination rank, even when
+multiple selected experts live on that rank. Combine reverses the shuffle and sums those rank copies.
+
+Why this exists alongside DeepEP/UCCL/MoRI: it is the portable collective reference baseline for the
+same rank-deduplicated payload and routing metadata. It keeps the library comparison anchored to the
+platform collective stack without claiming the custom fused kernels use the same transport algorithm.
+
+Scope: BF16, normal mode, layout-and-dispatch-v1. The timed dispatch includes layout, count exchange,
+payload, rank-masked expert indices, gate weights, and source-token metadata; combine returns only
+the activation payload. RCCL exposes the same API. The v1 AMD matrix uses this backend at EP8.
+"""
+
+import re
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+
+def _runtime_collective(args, torch_module) -> tuple[str, str]:
+ expected = "rccl" if torch_module.version.hip else "nccl"
+ fingerprint = getattr(args, "runtime_fingerprint", None)
+ collective = fingerprint.get("collective_library") if isinstance(fingerprint, dict) else None
+ if (
+ not isinstance(collective, dict)
+ or collective.get("kind") != expected
+ or not isinstance(collective.get("version"), str)
+ or not re.fullmatch(r"[0-9]+\.[0-9]+\.[0-9]+", collective["version"])
+ ):
+ raise RuntimeError("loaded collective runtime identity is unavailable")
+ return expected, collective["version"]
+
+
+class NCCLBackend:
+ name = "nccl-ep"
+ combine_needs_redispatch = False # dispatch saves the permutation + splits
+ combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):  # local_rank is unused here
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.experts = args.experts
+        if args.experts % world_size:  # experts are split into equal contiguous blocks per rank
+            raise ValueError(f"experts({args.experts}) must be divisible by world_size({world_size})")
+        self.experts_per_rank = args.experts // world_size
+        self.tolerance = 5e-2  # bf16 round-trip
+        _library, _version = _runtime_collective(args, torch)  # raises if identity is unavailable
+        self.kernel_generation = contracts.collective_kernel_generation(_library)
+        self.backend_provenance = {
+            "backend": f"{_library}-all2all",
+            "backend_lineage": _library,
+            "collective_library": _library,
+            "nccl_version": _version,
+            "transport": f"{_library}-all_to_all_single",
+            "resource_mode": "tuned",
+            "num_sms": None,
+            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
+            "tuned_source": "nccl-collective",
+            "reference_semantics": "rank-deduplicated-payload-plus-routing-metadata-v2",
+            "routing_metadata": "expert-index-gate-weight-source-token",
+        }
+
+ def buffer_cap(self, args):
+ return None # no fixed pre-allocated buffer; all-to-all sizes itself per step
+
+ def make_problem(self, T, idx, weights, x):
+ # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared routing-trace slice.
+ return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+ topk_weights=weights.to(torch.float32), layout=None)
+
+ def dispatch(self, p):
+ ws = self.world_size
+ x = p.x # [T, H] bf16
+ idx = p.topk_idx # [T, topk]
+ T, H = int(x.shape[0]), int(x.shape[1])
+ dev = x.device
+ # DeepEP dispatches one token per destination rank, not one copy per expert. Build the same
+ # rank-deduplicated routing map so NCCL traffic and combine semantics are comparable.
+ destinations = (idx // self.experts_per_rank).clamp_(0, ws - 1)
+ present = torch.zeros((T, ws), dtype=torch.bool, device=dev)
+ present.scatter_(1, destinations, True)
+ flat_token, flat_dest = present.nonzero(as_tuple=True)
+ # Group rank copies by destination (stable -> deterministic, invertible permutation).
+ order = torch.argsort(flat_dest, stable=True)
+ ordered_token = flat_token.index_select(0, order)
+ ordered_dest = flat_dest.index_select(0, order)
+ send_counts = torch.bincount(flat_dest, minlength=ws) # [ws]
+ send_x = x.index_select(0, ordered_token).contiguous()
+ send_topk_idx = idx.index_select(0, ordered_token).contiguous()
+ expert_start = ordered_dest.unsqueeze(1) * self.experts_per_rank
+ local_mask = ((send_topk_idx >= expert_start)
+ & (send_topk_idx < expert_start + self.experts_per_rank))
+ send_topk_idx = torch.where(
+ local_mask, send_topk_idx - expert_start, torch.full_like(send_topk_idx, -1)
+ )
+ send_topk_weights = p.topk_weights.index_select(0, ordered_token).contiguous()
+ send_topk_weights.masked_fill_(~local_mask, 0)
+ send_src_metadata = (ordered_token.to(torch.int64) | (self.rank << 32)).contiguous()
+ # Exchange per-rank counts so every rank can size its receive buffer.
+ recv_counts = torch.empty_like(send_counts)
+ dist.all_to_all_single(recv_counts, send_counts)
+ sc = send_counts.tolist()
+ rc = recv_counts.tolist()
+ total_recv = int(sum(rc))
+ recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
+ recv_topk_idx = torch.empty((total_recv, int(idx.shape[1])), dtype=idx.dtype, device=dev)
+ recv_topk_weights = torch.empty((total_recv, int(idx.shape[1])),
+ dtype=p.topk_weights.dtype, device=dev)
+ recv_src_metadata = torch.empty((total_recv,), dtype=torch.int64, device=dev)
+ # Dispatch the uneven per-rank splits over the configured collective transport.
+ dist.all_to_all_single(recv_x, send_x, rc, sc)
+ dist.all_to_all_single(recv_topk_idx, send_topk_idx, rc, sc)
+ dist.all_to_all_single(recv_topk_weights, send_topk_weights, rc, sc)
+ dist.all_to_all_single(recv_src_metadata, send_src_metadata, rc, sc)
+ return types.SimpleNamespace(
+ recv_x=recv_x, combine_input=None, order=order, flat_token=flat_token,
+ recv_topk_idx=recv_topk_idx,
+ recv_topk_weights=recv_topk_weights, recv_src_rank=recv_src_metadata >> 32,
+ recv_src_token=recv_src_metadata & ((1 << 32) - 1), send_counts=sc, recv_counts=rc,
+ T=T, H=H, topk=int(idx.shape[1]), total_recv=total_recv)
+
+ def stage(self, p, h):
+ # No expert compute: the expert "output" is the received tokens as-is (the round-trip identity).
+ h.combine_input = h.recv_x
+ return None
+
+ def combine(self, p, h):
+ # Reverse all-to-all: ship expert outputs back to their origin ranks (swap the split lists).
+ send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=h.combine_input.dtype,
+ device=h.combine_input.device)
+ dist.all_to_all_single(send_back, h.combine_input.contiguous(),
+ h.send_counts, h.recv_counts)
+ # send_back is in send (sorted) order; invert the argsort to token-copy order.
+ copies = torch.empty_like(send_back)
+ copies[h.order] = send_back
+ # Sum one copy per destination rank under this reference's explicit unweighted contract.
+ out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
+ out.index_add_(0, h.flat_token, copies.float())
+ return out.to(p.x.dtype)
+
+ def inspect_dispatch(self, p, h):
+ valid = h.recv_topk_idx >= 0
+ expert_ids = torch.where(
+ valid,
+ h.recv_topk_idx + self.rank * self.experts_per_rank,
+ h.recv_topk_idx,
+ )
+ return types.SimpleNamespace(
+ payload=h.recv_x,
+ expert_ids=expert_ids,
+ weights=h.recv_topk_weights.masked_fill(~valid, 0),
+ local_expert_counts=torch.bincount(
+ h.recv_topk_idx[valid], minlength=self.experts_per_rank
+ ),
+ ordering_contract="source-rank-major-stable-v1",
+ )
+
+ def combine_transformed(self, p, h, transformed):
+ h.combine_input = transformed.to(h.recv_x.dtype)
+ return self.combine(p, h)
+
+ def recv_tokens(self, h):
+ return int(h.total_recv)
+
+ def finalize(self, rc):
+ try:
+ dist.barrier()
+ dist.destroy_process_group()
+ except Exception:
+ pass
+ return rc
diff --git a/experimental/CollectiveX/tests/ep_uccl.py b/experimental/CollectiveX/tests/ep_uccl.py
new file mode 100644
index 0000000000..c962b4ce13
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_uccl.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""CollectiveX UCCL adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import importlib.metadata as metadata
+import json
+import os
+from pathlib import Path
+from pathlib import PurePosixPath
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import uccl
+ import uccl_deepep
+ from uccl_deepep import Buffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: uccl.ep import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+def _uccl_version() -> str:
+ try:
+ return metadata.version("uccl")
+ except Exception:
+ return getattr(uccl, "__version__", "unknown")
+
+
+def _uccl_dependency_versions() -> dict[str, str]:
+ versions = {
+ package: metadata.version(package)
+ for package in contracts.UCCL_DEPENDENCY_VERSIONS
+ }
+ if versions != contracts.UCCL_DEPENDENCY_VERSIONS:
+ raise RuntimeError(
+ "UCCL runtime dependency versions differ from the v1 contract"
+ )
+ return versions
+
+
+def _is_uccl_runtime_payload(name: str) -> bool:
+ path = PurePosixPath(name)
+ return (
+ bool(path.parts)
+ and path.parts[0] in {"uccl", "uccl.libs"}
+ and "__pycache__" not in path.parts
+ and path.suffix != ".pyc"
+ )
+
+
+def _python_dependency_evidence(package: str, version: str) -> dict[str, str]:
+ distribution = metadata.distribution(package)
+ runtime_files = []
+ for entry in distribution.files or ():
+ logical = PurePosixPath(entry.as_posix())
+ path = Path(distribution.locate_file(entry))
+ if (
+ logical.parts
+ and logical.parts[0] == package
+ and "__pycache__" not in logical.parts
+ and logical.suffix != ".pyc"
+ and path.is_file()
+ ):
+ runtime_files.append((entry.as_posix(), path))
+ return contracts.content_manifest_evidence(
+ role=f"{package}-distribution",
+ name=f"{package}-{version}",
+ files=runtime_files,
+ )
+
+
+def _loaded_libcudart_evidence(
+ version: str, maps_path: Path = Path("/proc/self/maps")
+) -> dict[str, str]:
+ distribution = metadata.distribution("nvidia-cuda-runtime-cu12")
+ candidates = {
+ Path(distribution.locate_file(entry)).resolve()
+ for entry in distribution.files or ()
+ if PurePosixPath(entry.as_posix()).name.startswith("libcudart.so")
+ and Path(distribution.locate_file(entry)).is_file()
+ }
+ candidate_names = {path.name for path in candidates}
+ if not candidates or not candidate_names:
+ raise RuntimeError("pinned CUDA runtime distribution has no libcudart payload")
+
+ loaded: set[Path] = set()
+ try:
+ mappings = maps_path.read_text().splitlines()
+ except OSError as exc:
+ raise RuntimeError("cannot inspect mapped UCCL runtime libraries") from exc
+ for mapping in mappings:
+ columns = mapping.split(maxsplit=5)
+ if len(columns) != 6:
+ continue
+ raw_path = columns[5]
+ deleted = raw_path.endswith(" (deleted)")
+ if deleted:
+ raw_path = raw_path.removesuffix(" (deleted)")
+ mapped = Path(raw_path)
+ if mapped.name not in candidate_names:
+ continue
+ if deleted or not mapped.is_file():
+ raise RuntimeError(
+ "mapped libcudart is unavailable for content verification"
+ )
+ resolved = mapped.resolve()
+ if resolved not in candidates:
+ raise RuntimeError(
+ "mapped libcudart is not owned by the pinned CUDA runtime package"
+ )
+ loaded.add(resolved)
+ if len(loaded) != 1:
+ raise RuntimeError(
+ "expected exactly one mapped libcudart from the pinned CUDA runtime"
+ )
+ return contracts.content_manifest_evidence(
+ role="cuda-runtime",
+ name=f"nvidia-cuda-runtime-cu12-{version}",
+ files=[("libcudart.so", loaded.pop())],
+ )
+
+
+def _uccl_build_evidence(
+ version: str, dependency_versions: dict[str, str]
+) -> list[dict[str, str]]:
+ distribution = metadata.distribution("uccl")
+ distribution_files = [
+ (entry.as_posix(), distribution.locate_file(entry))
+ for entry in distribution.files or ()
+ if _is_uccl_runtime_payload(entry.as_posix())
+ and Path(distribution.locate_file(entry)).is_file()
+ ]
+ wrapper_root = Path(uccl_deepep.__file__).resolve().parent
+ wrapper_files = [
+ (path.relative_to(wrapper_root).as_posix(), path)
+ for path in wrapper_root.rglob("*.py")
+ if path.is_file()
+ ]
+ return [
+ contracts.content_manifest_evidence(
+ role="uccl-distribution",
+ name=f"uccl-{version}",
+ files=distribution_files,
+ ),
+ contracts.content_manifest_evidence(
+ role="uccl-wrapper",
+ name="uccl-deepep-wrapper",
+ files=wrapper_files,
+ ),
+ _python_dependency_evidence("intervaltree", dependency_versions["intervaltree"]),
+ _python_dependency_evidence(
+ "sortedcontainers", dependency_versions["sortedcontainers"]
+ ),
+ _loaded_libcudart_evidence(dependency_versions["nvidia-cuda-runtime-cu12"]),
+ ]
+
+
+def _require_cross_rank_equal(value, label: str) -> None:
+ gathered = [None] * dist.get_world_size()
+ dist.all_gather_object(gathered, value)
+ canonical = {json.dumps(item, sort_keys=True, separators=(",", ":")) for item in gathered}
+ if len(canonical) != 1:
+ raise RuntimeError(f"UCCL {label} differs across ranks")
+
+
+class UCCLBackend:
+ name = "uccl"
+ combine_needs_redispatch = False
+ combine_weight_semantics = "unweighted-rank-sum"
+ def __init__(self, args, rank, world_size, local_rank, device):
+ self.args = args
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.mode = "normal"
+
+ self.group = dist.group.WORLD
+ device_sms = torch.cuda.get_device_properties(device).multi_processor_count
+ num_nvl_bytes = 4 * 1024 * 1024 * 1024
+ self.buffer = Buffer(self.group, num_nvl_bytes, 0)
+
+ num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+ try:
+ Buffer.set_num_sms(num_sms)
+ except Exception as exc: # pragma: no cover - version dependent
+ raise RuntimeError(
+ f"UCCL did not apply requested num_sms={num_sms}: {exc!r}"
+ ) from exc
+ applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+ if applied_num_sms != num_sms:
+ raise RuntimeError(
+ f"UCCL num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
+ )
+
+ version = _uccl_version()
+ dependency_versions = _uccl_dependency_versions()
+ loaded_libraries = _uccl_build_evidence(version, dependency_versions)
+ _require_cross_rank_equal(loaded_libraries, "installed content identities")
+ self.backend_provenance = {
+ "uccl_version": version,
+ "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{version}",
+ "uccl_wrapper_commit": os.environ.get("UCCL_WRAPPER_COMMIT"),
+ "backend_lineage": "uccl",
+ "uccl_dependency_versions": dependency_versions,
+ "loaded_libraries": loaded_libraries,
+ "mode": "normal",
+ "dispatch_dtype": "bf16",
+ "combine_dtype": "bf16",
+ "resource_mode": "tuned",
+ "requested_num_sms": num_sms,
+ "num_sms": applied_num_sms,
+ "device_sms": device_sms,
+ "sm_fraction": applied_num_sms / device_sms,
+ "tuned_source": "uccl-default-num_sms",
+ "num_nvl_bytes": num_nvl_bytes,
+ }
+
+ def buffer_cap(self, args):
+ return None
+
+ def make_problem(self, T, idx, weights, x):
+ return types.SimpleNamespace(
+ T=T,
+ x=x,
+ topk_idx=idx.to(torch.int64),
+ topk_weights=weights.to(torch.float32),
+ )
+
+ def dispatch(self, p):
+ (
+ num_tokens_per_rank,
+ _,
+ num_tokens_per_expert,
+ is_token_in_rank,
+ _,
+ ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+ recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
+ p.x,
+ topk_idx=p.topk_idx,
+ topk_weights=p.topk_weights,
+ num_tokens_per_rank=num_tokens_per_rank,
+ is_token_in_rank=is_token_in_rank,
+ num_tokens_per_expert=num_tokens_per_expert,
+ )
+ return types.SimpleNamespace(
+ recv_x=recv_x,
+ recv_topk_idx=recv_topk_idx,
+ recv_topk_weights=recv_topk_weights,
+ recv_counts=recv_counts,
+ handle=handle,
+ )
+
+ def stage(self, p, h):
+ h.combine_input = h.recv_x
+
+ def combine(self, p, h):
+ combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle)
+ return combined_x
+
+ def inspect_dispatch(self, p, h):
+ valid = h.recv_topk_idx >= 0
+ expert_ids = torch.where(
+ valid,
+ h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
+ h.recv_topk_idx,
+ )
+ return types.SimpleNamespace(
+ payload=h.recv_x,
+ expert_ids=expert_ids,
+ weights=h.recv_topk_weights.masked_fill(~valid, 0),
+ local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
+ ordering_contract="source-rank-major-stable-v1",
+ )
+
+ def combine_transformed(self, p, h, transformed):
+ combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle)
+ return combined
+
+ def recv_tokens(self, h):
+ return int(h.recv_x.shape[0])
+
+ def finalize(self, rc):
+ # UCCL's proxy teardown can crash after results are written; preserve the real rc.
+ try:
+ dist.barrier()
+ except Exception:
+ pass
+ sys.stdout.flush()
+ sys.stderr.flush()
+ os._exit(rc if 0 <= rc <= 255 else 1)
diff --git a/experimental/CollectiveX/tests/eplb.py b/experimental/CollectiveX/tests/eplb.py
new file mode 100644
index 0000000000..b1479da9f1
--- /dev/null
+++ b/experimental/CollectiveX/tests/eplb.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""CollectiveX — EPLB (Expert-Parallel Load Balancer), the DeepSeek-style remedy for
+skewed (zipf) expert load.
+
+Under skewed routing, the ranks hosting hot logical experts receive far more token-copies
+than the rest; dispatch/combine latency is gated by that busiest rank (the cross-rank MAX
+the harness measures), so the whole collective stalls on it. EPLB REPLICATES hot experts
+onto extra physical slots and PLACES the slots so every rank carries ~equal load.
+
+This module is backend-agnostic: it is purely a transform of the deterministic routing
+trace. The trick that keeps every adapter unchanged — DeepEP/MoRI both route expert i to
+rank `i // experts_per_rank` (contiguous block placement) — is to number the physical slots
+RANK-MAJOR (rank r owns physical ids [r*spp, (r+1)*spp)), so the standard contiguous mapping
+reproduces EPLB's balanced placement. The harness then runs with `experts = num_physical`
+and the remapped (physical) trace; nothing else changes.
+
+ num_physical = num_logical + redundant (redundant rounded up to a multiple of ep_size)
+ build_plan(): greedy replicate-by-load + equal-cardinality balanced packing onto ep_size ranks
+ remap_idx(): each token's logical targets -> physical replicas, spread by global token id
+
+Pure-Python planner (no torch) so it unit-tests on a login node; remap_idx needs torch.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+
+
+def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
+ """num_logical + redundant, with redundant rounded UP to a multiple of ep_size so the
+ physical experts divide evenly across ranks (symmetric dispatch)."""
+ r = ((max(0, num_redundant) + ep_size - 1) // ep_size) * ep_size
+ return num_logical + r
+
+
+def _contiguous_rank_load(logical_load, ep_size):
+ """Per-rank received load WITHOUT EPLB: logical experts placed contiguously
+ (experts_per_rank = num_logical/ep_size), so rank r carries its block's total."""
+ n = len(logical_load)
+ per = n // ep_size
+ return [sum(logical_load[r * per:(r + 1) * per]) for r in range(ep_size)]
+
+
+def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
+    """logical_load: list[float] length num_logical (token-copies per logical expert).
+    Returns the replication+placement plan (all pure-Python lists) + before/after balance."""
+    num_logical = len(logical_load)
+    assert num_physical >= num_logical, "num_physical must be >= num_logical"
+    assert num_physical % ep_size == 0, "num_physical must be divisible by ep_size"
+    assert num_logical % ep_size == 0, "num_logical must be divisible by ep_size"
+    spp = num_physical // ep_size                    # physical slots per rank (fixed)
+
+    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
+    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
+    replicas = [1] * num_logical
+    for _ in range(num_physical - num_logical):
+        best, best_lps = 0, -1.0
+        for e in range(num_logical):
+            lps = logical_load[e] / replicas[e]
+            if lps > best_lps:
+                best, best_lps = e, lps
+        replicas[best] += 1
+
+    # 2) Slots = (per-replica load, logical expert), one per replica.
+    slots = []
+    for e in range(num_logical):
+        lps = logical_load[e] / replicas[e]
+        slots.extend((lps, e) for _ in range(replicas[e]))
+
+    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
+    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
+    slots.sort(reverse=True)
+    rank_slots = [[] for _ in range(ep_size)]
+    rank_load = [0.0] * ep_size
+    for lps, e in slots:
+        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
+                key=lambda r: rank_load[r])
+        rank_slots[r].append(e)
+        rank_load[r] += lps
+
+    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
+    phys2log, rank_of_phys = [], []
+    for r in range(ep_size):
+        for e in rank_slots[r]:
+            phys2log.append(e)
+            rank_of_phys.append(r)
+    log2phys = [[] for _ in range(num_logical)]
+    for pid, e in enumerate(phys2log):
+        log2phys[e].append(pid)
+
+    before = _contiguous_rank_load(logical_load, ep_size)
+    total = sum(logical_load) or 1.0
+    mean = total / ep_size
+    return {
+        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
+        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
+        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
+        "rank_load_after": rank_load, "rank_load_before": before,
+        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
+        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
+        "replicated_experts": sum(1 for r in replicas if r > 1),
+    }
+
+
+def mapping_hash(plan: dict) -> str:
+ """Hash the placement fields that fully determine the logical-to-physical remap."""
+ payload = {
+ "phys2log": plan["phys2log"],
+ "rank_of_phys": plan["rank_of_phys"],
+ "replicas": plan["replicas"],
+ }
+ return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()
+
+
+def remap_rows(indices: list[list[int]], plan: dict) -> list[list[int]]:
+ """Pure-Python equivalent of remap_idx for contract verification."""
+ replicas = plan["log2phys"]
+ return [
+ [replicas[expert][token % len(replicas[expert])] for expert in row]
+ for token, row in enumerate(indices)
+ ]
+
+
+def remap_idx(idx_logical, plan):
+ """idx_logical: torch [gt, topk] int64 logical-expert ids (global trace).
+ Returns idx_physical [gt, topk]: each token's logical target -> one of that expert's
+ physical replicas, SPREAD by global token id (row) so a hot expert's tokens fan out
+ across its replicas (= across ranks). Replicas of distinct logical experts are disjoint,
+ so a token's top-k physical ids stay distinct (dispatch invariant preserved)."""
+ import torch
+ replicas = plan["replicas"]
+ num_logical = len(replicas)
+ max_rc = plan["max_replicas"]
+ rc = torch.tensor(replicas, dtype=torch.int64)
+ # padded [num_logical, max_rc] table of physical ids (pad with replica 0; never indexed
+ # past rc[e] because the replica index is taken mod rc[e]).
+ padded = torch.zeros(num_logical, max_rc, dtype=torch.int64)
+ for e, phys in enumerate(plan["log2phys"]):
+ for k in range(max_rc):
+ padded[e, k] = phys[k] if k < len(phys) else phys[0]
+ gt = idx_logical.shape[0]
+ rows = torch.arange(gt, dtype=torch.int64).unsqueeze(1) # [gt,1] global token id
+ e = idx_logical.to(torch.int64) # [gt,topk]
+ ridx = rows % rc[e] # [gt,topk] replica index
+ return padded[e, ridx] # [gt,topk] physical ids
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+ # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
+ import sys
+ NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
+ load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
+ nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
+ plan = build_plan(load, nphys, EP)
+ print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
+ print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
+ f"(hottest expert 0 replicas={plan['replicas'][0]})")
+ print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
+ print(f"per-rank load AFTER (EPLB): {[round(x,3) for x in plan['rank_load_after']]}")
+ print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")
+ # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
+ assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
+ assert sum(plan["replicas"]) == nphys
+ assert len(plan["phys2log"]) == nphys
+ assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
+ # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
+ assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
+ assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
+ assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
+ # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
+ try:
+ import torch
+ g = torch.Generator().manual_seed(0)
+ p = torch.tensor(load)
+ p = (p / p.sum()).expand(4096, NUM_LOGICAL)
+ idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
+ idx_p = remap_idx(idx_l, plan)
+ assert idx_p.shape == idx_l.shape
+ # top-k physical ids distinct per token
+ assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
+ spp = plan["slots_per_rank"]
+ recv_before = [0] * EP
+ recv_after = [0] * EP
+ per_log = NUM_LOGICAL // EP
+ for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
+ for e in row_l:
+ recv_before[e // per_log] += 1
+ for pid in row_p:
+ recv_after[pid // spp] += 1
+ ib = max(recv_before) / (sum(recv_before) / EP)
+ ia = max(recv_after) / (sum(recv_after) / EP)
+ print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
+ assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
+ print("remap self-test: OK")
+ except ImportError:
+ print("(torch absent — skipped remap self-test; planner gates passed)")
+ print("EPLB self-test: PASS")
+ sys.exit(0)
diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py
new file mode 100644
index 0000000000..862c3d0375
--- /dev/null
+++ b/experimental/CollectiveX/tests/make_workloads.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""Generate canonical serialized workloads. Runs the stdlib counter generator for
+each (routing, global_tokens) in a ladder and writes .npz + .manifest.json into a
+dir that runs then consume via `run_ep.py --workload-dir`. One trace is emitted per global-token
+count because global token count is part of workload identity.
+
+ python3 tests/make_workloads.py --out-dir /path/to/cx_workloads \\
+ --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\
+ --tokens-ladder "1 2 4 8 16 32 64 128 256 512"
+
+Or by the named v1 workload in configs/workloads.yaml. Explicit dimension flags still override it:
+
+ python3 tests/make_workloads.py --out-dir /path/to/cx_workloads --workload deepseek-v3-v1 --routing uniform --ep 8
+
+--id-only prints the content-bound workload_id per ladder point without torch/numpy:
+
+ python3 tests/make_workloads.py --workload deepseek-v3-v1 --ep 8 --id-only
+
+Generate every routing the suites need by running once per --routing. Idempotent (same id => same
+file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import workload as wl # noqa: E402
+
+# Repo root holds configs/ (this file is in tests/). Used only for --workload name resolution.
+_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def resolve_manifest(name):
+    """Look a workload name up in configs/workloads.yaml and return (hidden, topk, experts).
+    Searches synthetic + model_derived; expert count = `experts` or (for model-derived) `routed_experts`.
+    Raises SystemExit with the known names if the manifest is absent. Pure PyYAML + stdlib."""
+    import yaml
+    path = os.path.join(_REPO, "configs", "workloads.yaml")
+    with open(path) as handle:
+        cfg = yaml.safe_load(handle) or {}  # an empty YAML file loads as None, not a mapping
+    known = []
+    for section in ("synthetic", "model_derived"):
+        sec = cfg.get(section) or {}
+        known += list(sec)
+        m = sec.get(name)
+        if m is None:
+            continue
+        experts = m.get("experts", m.get("routed_experts"))
+        if m.get("hidden") is None or m.get("topk") is None or experts is None:
+            raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
+        return int(m["hidden"]), int(m["topk"]), int(experts)
+    raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+
+
+def main() -> int:
+ ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
+ ap.add_argument("--out-dir", help="required unless --id-only")
+ ap.add_argument("--workload", help="named manifest in configs/workloads.yaml (sets hidden/topk/experts)")
+ ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"])
+ ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
+ ap.add_argument("--hidden", type=int, help="override (default 7168, or the --workload's hidden)")
+ ap.add_argument("--topk", type=int, help="override (default 8, or the --workload's topk)")
+ ap.add_argument("--experts", type=int, help="override (default 256, or the --workload's experts)")
+ ap.add_argument("--seed", type=int, default=67)
+ ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
+ ap.add_argument("--id-only", action="store_true",
+ help="print content-bound workload_id per point without torch/numpy")
+ a = ap.parse_args()
+
+ # Resolve dims: a named --workload supplies defaults; explicit --hidden/--topk/--experts override
+ # per field. With neither, fall back to the v1 DeepSeek dimensions (7168/8/256).
+ base_h, base_t, base_e = (7168, 8, 256)
+ if a.workload:
+ base_h, base_t, base_e = resolve_manifest(a.workload)
+ hidden = a.hidden if a.hidden is not None else base_h
+ topk = a.topk if a.topk is not None else base_t
+ experts = a.experts if a.experts is not None else base_e
+
+ if not a.id_only and not a.out_dir:
+ ap.error("--out-dir is required unless --id-only")
+
+ raw_ladder = [int(token) for token in a.tokens_ladder.replace(",", " ").split()]
+ if (a.ep <= 0 or min(hidden, topk, experts) <= 0 or topk > experts or experts % a.ep
+ or not raw_ladder or any(token <= 0 for token in raw_ladder)
+ or len(raw_ladder) != len(set(raw_ladder))):
+ ap.error("shape, EP, and token ladder must be positive, divisible, and unique")
+ ladder = sorted(raw_ladder)
+ epr = experts // a.ep
+ label = f"workload={a.workload} " if a.workload else ""
+
+ if a.id_only:
+ # The stdlib counter generator derives the same content-bound ID on every runtime.
+ made = []
+ for T in ladder:
+ gt = T * a.ep
+ wid = wl.compute_workload_id(a.routing, hidden, topk, experts, a.ep, gt, a.seed)
+ made.append((T, gt, wid))
+ print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}")
+ print(f"{label}id-only: {len(made)} workload_id(s) "
+ f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})")
+ return 0
+
+ os.makedirs(a.out_dir, exist_ok=True)
+ made = []
+ for T in ladder:
+ gt = T * a.ep
+ idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr)
+ wid = wl.save_workload(a.out_dir, idx, w, man)
+ made.append((T, gt, wid))
+ print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} "
+ f"(trace sha {man['checksums']['trace'][:12]})")
+ print(f"{label}wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
new file mode 100644
index 0000000000..6065a06e43
--- /dev/null
+++ b/experimental/CollectiveX/tests/routing.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""CollectiveX — deterministic, platform-independent MoE routing trace.
+
+Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
+ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
+is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k).
+Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
+are per-rank (same rank ⇒ same x on any platform), so a given global token id has
+identical activation everywhere without materializing a global activation tensor.
+
+The v1 suite keeps two routing distributions:
+
+ * uniform — top-k distinct experts drawn uniformly per token. The DEFAULT.
+ Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
+ 8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
+  * zipf    — expert popularity proportional to 1/r (r = popularity rank, not EP rank), producing expert/rank load skew.
+
+Always publish the realized fan-out so the workload is never misread again
+(`routing_stats`).
+"""
+from __future__ import annotations
+
+import hashlib
+
+import torch
+
+ACTIVATION_GENERATOR = "collectivex-activation-counter-v3"
+SOURCE_ID_BASE = 128
+SOURCE_ID_COLUMNS = 4
+
+
+def build_global_routing(
+    global_tokens: int, experts: int, topk: int, routing: str, seed: int
+):
+    """Build the global routing trace as an ``(int64 indices, float32 weights)`` tensor pair.
+
+    Rows come from ``workload.canonical_routing_rows`` after the numeric arguments are coerced
+    with ``int()``; ``workload`` is imported at call time rather than at module import.
+    """
+    import workload
+
+    shape = (int(global_tokens), int(experts), int(topk), routing, int(seed))
+    rows, gates = workload.canonical_routing_rows(*shape)
+    return torch.tensor(rows, dtype=torch.int64), torch.tensor(gates, dtype=torch.float32)
+
+
+def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
+    """Return contiguous rows ``[rank*T, (rank+1)*T)`` of idx and of weights, T = tokens_per_rank."""
+    return tuple(t[rank * tokens_per_rank:(rank + 1) * tokens_per_rank].contiguous() for t in (idx, weights))
+
+
+def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device,
+                     dtype=torch.bfloat16):
+    """Canonical activations for the ``tokens`` global source IDs starting at ``rank * tokens``."""
+    offsets = torch.arange(tokens, device=device, dtype=torch.int64)
+    return activations_for_source_ids(offsets + rank * tokens, hidden, seed, dtype)
+
+
+def activations_for_source_ids(source, hidden: int, seed: int, dtype=torch.bfloat16):
+    """Materialize canonical activations for 1-D global source IDs in [0, SOURCE_ID_BASE**3)."""
+    if hidden < SOURCE_ID_COLUMNS:
+        raise ValueError(f"hidden must be at least {SOURCE_ID_COLUMNS}")
+    source = source.to(torch.int64)
+    if bool(((source < 0) | (source >= SOURCE_ID_BASE**3)).any().item()):  # would alias on decode
+        raise ValueError("source-token ID is out of range for the reversible prefix")
+    column = torch.arange(hidden, device=source.device, dtype=torch.int64)
+    values = (source[:, None] * 131 + column[None, :] * 17 + int(seed) * 19) % 257 - 128
+    output = values.to(dtype).mul_(1 / 64)  # filler in [-2, 2]; the prefix columns are overwritten
+    digits = (source, source // SOURCE_ID_BASE, source // SOURCE_ID_BASE**2, source * 29 + int(seed) * 7)
+    output[:, :4] = torch.stack(digits, dim=1) % SOURCE_ID_BASE  # three ID digits, then checksum
+    return output
+
+
+def decode_source_ids(payload, seed: int):
+    """Recover int64 source IDs from the leading SOURCE_ID_COLUMNS columns of a 2-D payload.
+    Raises ValueError when the shape, digit exactness, digit range, or checksum column is wrong."""
+    if not (payload.ndim == 2 and payload.shape[1] >= SOURCE_ID_COLUMNS):
+        raise ValueError("received payload cannot carry the source-token prefix")
+    raw = payload[:, :SOURCE_ID_COLUMNS].float()
+    digits = raw.round().to(torch.int64)
+    if not torch.equal(raw, digits.float()):
+        raise ValueError("received source-token prefix is not exact")
+    if not bool(((digits >= 0) & (digits < SOURCE_ID_BASE)).all().item()):
+        raise ValueError("received source-token prefix is out of range")
+    source = digits[:, 0] + SOURCE_ID_BASE * digits[:, 1] + SOURCE_ID_BASE**2 * digits[:, 2]
+    if not torch.equal((source * 29 + int(seed) * 7) % SOURCE_ID_BASE, digits[:, 3]):
+        raise ValueError("received source-token checksum differs")
+    return source
+
+
+def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
+                     gpus_per_node: int, scale_up_domain: int = None) -> dict:
+    """Classify rank-deduplicated (token, destination-rank) copies by how far they travel.
+    Rank r sits on node r // gpus_per_node and in domain r // scale_up_domain; a falsy
+    scale_up_domain defaults to gpus_per_node * ep_size."""
+    owner = (idx // experts_per_rank).clamp(max=ep_size - 1)  # destination rank per assignment
+    reaches = torch.zeros((idx.shape[0], ep_size), dtype=torch.bool)
+    reaches.scatter_(1, owner, True)  # experts sharing a destination rank collapse to one copy
+    token, dest = reaches.nonzero(as_tuple=True)
+    src = (token // max(1, tokens_per_rank)).clamp(max=ep_size - 1)
+    sud = scale_up_domain or (gpus_per_node * ep_size)  # default: all one domain
+
+    def share(mask) -> float:
+        return float(mask.float().mean())
+
+    on_node = (dest // gpus_per_node) == (src // gpus_per_node)
+    in_domain = (dest // sud) == (src // sud)
+    return {
+        "placement": "packed",
+        "local_rank_fraction": share(dest == src),
+        "same_node_fraction": share(on_node),
+        "same_scaleup_domain_fraction": share(in_domain),
+        "cross_node_fraction": share(~on_node),
+        "cross_domain_fraction": share(~in_domain),
+        "gpus_per_node": gpus_per_node, "scale_up_domain": sud, "copies": int(dest.numel()),
+    }
+
+
+def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
+    """Summarize the realized GLOBAL routing trace: fan-out, load balance, SHA-256 identity.
+
+    idx is the global [gt, topk] expert-index tensor; weights, when given, is the matching
+    gate-weight tensor and is hashed as float32. Rank-level figures assume
+    ep = max(1, experts // max(1, experts_per_rank)) ranks, clamping higher ranks to ep - 1.
+
+    Returns a dict of fan-out statistics, per-expert and per-rank loads, one-number imbalance
+    summaries (CV, hotspot ratio, empty counts), and the index/weights/combined hashes.
+    """
+    ep = max(1, experts // max(1, experts_per_rank))
+    dest_rank = (idx // experts_per_rank).clamp(max=ep - 1)  # [gt, topk] rank per assignment
+    # Boolean [gt, ep] map: each token counts once per destination rank it reaches (fan-out).
+    reached = torch.zeros(idx.shape[0], ep, dtype=torch.bool)
+    reached.scatter_(1, dest_rank, True)
+    fanout = reached.sum(dim=1)  # [gt]
+    hist = torch.bincount(fanout, minlength=ep + 1)[1:ep + 1].tolist()  # counts for fan-out 1..ep
+    load = torch.bincount(idx.reshape(-1), minlength=experts).float()
+    # Expert assignments (compute load) stay separate from rank-deduplicated payload copies
+    # (network load); merging them would overstate traffic when two experts share a rank.
+    assignment_load = torch.bincount(dest_rank.reshape(-1), minlength=ep).float()
+    payload_load = reached.sum(dim=0).float()
+
+    def _cv(t):
+        # Coefficient of variation: population std over mean, or 0.0 for a non-positive mean.
+        m = float(t.mean())
+        return float(t.std(unbiased=False) / m) if m > 0 else 0.0
+
+    # hotspot_ratio is the busiest expert's load over the mean expert load, so a row is
+    # self-describing without reading the full histograms.
+    mean_load = load.mean()
+    hotspot_ratio = float(load.max() / mean_load) if float(mean_load) > 0 else 0.0
+    # Identity hashes: indices as int32 bytes, weights as float32 bytes, and both concatenated.
+    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
+    idx_hash = hashlib.sha256(idx_bytes).hexdigest()
+    w_hash, routing_hash = None, idx_hash  # without weights the routing hash is the index hash
+    if weights is not None:
+        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
+        w_hash = hashlib.sha256(w_bytes).hexdigest()
+        routing_hash = hashlib.sha256(idx_bytes + w_bytes).hexdigest()
+    return {
+        "fanout_mean": float(fanout.float().mean()),
+        "fanout_min": int(fanout.min()), "fanout_max": int(fanout.max()),
+        "fanout_hist": hist,  # index k-1 = #tokens with fan-out k
+        "expert_assignments_per_rank": [int(x) for x in assignment_load.tolist()],
+        "payload_copies_per_rank": [int(x) for x in payload_load.tolist()],
+        "routed_copies": int(fanout.sum()),  # total (token, dest-rank) pairs
+        "expert_load_min": int(load.min()), "expert_load_max": int(load.max()),
+        "expert_load_mean": float(load.mean()), "expert_load_cv": _cv(load),
+        "expert_assignment_rank_cv": _cv(assignment_load),
+        "payload_rank_cv": _cv(payload_load), "hotspot_ratio": hotspot_ratio,
+        # Empty experts capture compute skew; empty destination ranks capture network skew.
+        "empty_expert_count": int((load == 0).sum()),
+        "empty_rank_count": int((payload_load == 0).sum()),
+        "routing_hash": routing_hash, "idx_hash": idx_hash, "weights_hash": w_hash,
+    }
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    import sys
+    E, TOPK, EPR, GT = 256, 8, 32, 4096
+    traces = {kind: build_global_routing(GT, E, TOPK, kind, 67)[0] for kind in ("uniform", "zipf")}
+    # The first 16 uniform rows must each name TOPK distinct experts.
+    assert all(len(set(row.tolist())) == TOPK for row in traces["uniform"][:16])
+    # Zipf routing must show a larger hotspot ratio than uniform routing.
+    skew = {kind: routing_stats(trace, E, EPR)["hotspot_ratio"] for kind, trace in traces.items()}
+    assert skew["uniform"] < skew["zipf"]
+    dev = torch.device("cpu")
+    first, second = (rank_activations(8, 256, 67, 0, dev, dtype=torch.float32) for _ in range(2))
+    assert torch.equal(first, second) and torch.isfinite(first).all()
+    print("routing self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 0000000000..cf019af28f
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""CollectiveX v1 EP benchmark entrypoint for torchrun or rank environments."""
+
+from __future__ import annotations
+
+import argparse
+import ctypes
+import json
+import os
+import platform
+import re
+import shlex
+import socket
+import subprocess
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path[:0] = [HERE, os.path.dirname(HERE)]
+
+import ep_harness # noqa: E402 (stdlib-only; safe before torch)
+import identity # noqa: E402
+
+
+def _numeric_version(command: list[str]) -> str | None:
+    """Run ``command`` and return the first dotted 2-4 part number on its stdout, else None.
+
+    None covers a failed launch, the 10 s timeout, a non-zero exit status, and no match.
+    """
+    try:
+        done = subprocess.run(command, capture_output=True, check=False, text=True, timeout=10)
+        found = None if done.returncode else re.search(r"\b[0-9]+(?:\.[0-9]+){1,3}\b", done.stdout)
+    except (OSError, subprocess.TimeoutExpired):
+        return None
+    return found.group(0) if found else None
+
+
+def _loaded_collective_version() -> str | None:
+    """Return the formatted ``ncclGetVersion`` of the one libnccl/librccl file this process maps.
+    None when /proc/self/maps resolves to zero or several such files, the call returns non-zero,
+    or an AttributeError/OSError is raised along the way."""
+    try:
+        with open("/proc/self/maps", encoding="utf-8") as handle:
+            mapped = (
+                line.rstrip().split()[-1] for line in handle
+                if "libnccl.so" in line or "librccl.so" in line
+            )
+            paths = {os.path.realpath(path) for path in mapped if os.path.isfile(path)}
+        if len(paths) != 1:
+            return None
+        version = ctypes.c_int()
+        status = ctypes.CDLL(paths.pop()).ncclGetVersion(ctypes.byref(version))
+        return None if status != 0 else ep_harness.format_collective_version(version.value)
+    except (AttributeError, OSError):
+        return None
+
+
+def _runtime_fingerprint(
+    torch, device, *, machine: str, vendor: str, arch: str
+) -> dict:
+    """Describe this rank's runtime without hosts, addresses, UUIDs, or paths.
+
+    ``vendor == "nvidia"`` reports CUDA/NCCL with the nvidia-smi driver version; every
+    other vendor value reports HIP/RCCL with the rocm-smi driver version.
+    """
+    properties = torch.cuda.get_device_properties(device)
+    nvidia = vendor == "nvidia"
+    driver_query = (
+        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
+        if nvidia
+        else ["rocm-smi", "--showdriverversion"]
+    )
+    driver = _numeric_version(driver_query)
+    accelerator_runtime = {
+        "kind": "cuda" if nvidia else "hip",
+        "version": torch.version.cuda if nvidia else torch.version.hip,
+    }
+    collective_library = {
+        "kind": "nccl" if nvidia else "rccl",
+        "version": _loaded_collective_version(),
+    }
+    return {
+        "accelerator_runtime": accelerator_runtime,
+        "collective_library": collective_library,
+        "device": {
+            "arch": arch,
+            "compute_units": int(properties.multi_processor_count),
+            "memory_bytes": int(properties.total_memory),
+            "product": torch.cuda.get_device_name(device),
+            "warp_size": int(properties.warp_size),
+        },
+        "driver_version": driver,
+        "framework": {"kind": "torch", "version": str(torch.__version__)},
+        "machine": machine,
+        "python_version": platform.python_version(),
+        "vendor": vendor,
+    }
+
+
+def _summarize_realized_placement(
+    records: list[tuple[str, int]],
+    *,
+    expected_nodes: int,
+    expected_gpus_per_node: int,
+    expected_world_size: int,
+) -> dict:
+    """Check gathered (host, local_rank) pairs against the requested placement shape.
+    Raises ValueError on the first mismatch; the returned dict carries only the requested
+    dimensions and validity flags, never a host name."""
+    if min(expected_nodes, expected_gpus_per_node) < 1:
+        raise ValueError("requested placement dimensions must be positive")
+    if expected_nodes * expected_gpus_per_node != expected_world_size:
+        raise ValueError("requested nodes x GPUs per node differs from world size")
+    if len(records) != expected_world_size:
+        raise ValueError("realized rank count differs from world size")
+
+    by_host: dict[str, list[int]] = {}
+    for host, local_rank in records:
+        if not (isinstance(host, str) and host and type(local_rank) is int):
+            raise ValueError("realized placement record has invalid types")
+        by_host.setdefault(host, []).append(local_rank)
+
+    groups = list(by_host.values())
+    unique_pairs = len(set(records)) == len(records)
+    if len(groups) != expected_nodes:
+        raise ValueError(
+            f"realized node count {len(by_host)} differs from requested {expected_nodes}"
+        )
+    if any(len(group) != expected_gpus_per_node for group in groups):
+        raise ValueError("realized ranks per node differ from requested GPUs per node")
+    # Every host must expose local ranks 0..expected_gpus_per_node-1 exactly once.
+    wanted = list(range(expected_gpus_per_node))
+    if not unique_pairs or any(sorted(group) != wanted for group in groups):
+        raise ValueError("realized local ranks are incomplete or duplicated")
+    return {
+        "gpus_per_node": expected_gpus_per_node,
+        "nodes": expected_nodes,
+        "ranks_per_node": expected_gpus_per_node,
+        "unique_local_ranks": True,
+        "valid": True,
+    }
+
+
+def _common_runtime_fingerprint(records: list[dict]) -> dict:
+    """Return records[0] once every rank's fingerprint serializes to the same canonical JSON.
+
+    Raises ValueError for an empty list or when any two ranks differ.
+    """
+    if not records:
+        raise ValueError("runtime fingerprint evidence is empty")
+    encode = json.JSONEncoder(allow_nan=False, sort_keys=True, separators=(",", ":")).encode
+    if len({encode(record) for record in records}) != 1:
+        raise ValueError("runtime fingerprint differs across distributed ranks")
+    return records[0]
+
+
+def main() -> int:
+    """Run one EP sweep; return 2 (rejected case ID, seed, or sampling contract), 3 (torch import
+    failed), 5 (unknown runner or runtime identity/GPU-count mismatch), else backend.finalize(rc)."""
+    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    ap.add_argument(
+        "--backend",
+        required=True,
+        choices=[
+            "deepep",
+            "deepep-v2",
+            "deepep-hybrid",
+            "mori",
+            "uccl",
+            "nccl-ep",
+        ],
+    )
+    ep_harness.add_common_args(ap)
+    args = ap.parse_args()
+
+    if args.case_id and not identity.is_typed_id(args.case_id, "case"):
+        print(f"ERROR: invalid native case ID {args.case_id!r}", file=sys.stderr)
+        return 2
+    if args.case_id and args.seed != ep_harness.ROUTING_SEED:
+        print(
+            f"ERROR: scheduled v1 cases require seed={ep_harness.ROUTING_SEED}; got {args.seed}",
+            file=sys.stderr,
+        )
+        return 2
+
+    sampling_error = ep_harness.sampling_contract_error(
+        args.iters, args.trials, args.warmup
+    )
+    if sampling_error:
+        print(f"ERROR: {sampling_error}", file=sys.stderr)
+        return 2
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+
+    import capability
+
+    sku = capability.PLATFORMS.get(args.runner)
+    if sku is None:
+        print(f"ERROR: unknown runner identity {args.runner!r}", file=sys.stderr)
+        return 5
+    machine = {"x86_64": "amd64", "aarch64": "arm64"}.get(
+        platform.machine(), platform.machine()
+    )
+    props = torch.cuda.get_device_properties(device)
+    if torch.version.hip:
+        vendor = "amd"
+        accelerator = str(getattr(props, "gcnArchName", "")).split(":", 1)[0]
+    else:
+        vendor = "nvidia"
+        major, minor = torch.cuda.get_device_capability(device)
+        accelerator = f"sm{major}{minor}"
+    device_name = torch.cuda.get_device_name(device)
+    device_count = torch.cuda.device_count()
+    identity_issues = capability.runtime_identity_issues(
+        args.runner,
+        vendor=vendor,
+        arch=accelerator,
+        machine=machine,
+        device_name=device_name,
+        device_count=device_count,
+        world_size=world_size,
+    )
+    if identity_issues:
+        print(
+            f"ERROR: runtime identity does not match {args.runner}: "
+            + "; ".join(identity_issues),
+            file=sys.stderr,
+        )
+        return 5
+    if args.gpus_per_node and args.gpus_per_node != sku["gpus_per_node"]:
+        print(
+            f"ERROR: {args.runner} requires {sku['gpus_per_node']} GPUs per node",
+            file=sys.stderr,
+        )
+        return 5
+    args.runtime_device_product = device_name
+    args.runtime_device_count = device_count
+    args.allocation_execution_id = os.environ.get("COLLECTIVEX_EXECUTION_ID")
+
+    # EPLB: switch args.experts to the PHYSICAL count (logical + redundant) BEFORE backend
+    # construction, keeping the logical count in args.num_logical_experts. ep_harness builds the
+    # LOGICAL trace and remaps it (tests/eplb.py); deterministic, so every rank agrees on the count.
+    if getattr(args, "eplb", False):
+        import eplb
+
+        args.num_logical_experts = args.experts
+        args.experts = eplb.physical_count(
+            args.experts, ep_harness.EPLB_REDUNDANT_EXPERTS, world_size
+        )
+
+    # Reproduction provenance (recorded in the artifact). Rack launchers provide ranks directly
+    # through srun, while single-node launchers use torchrun; do not claim torchrun for both.
+    if os.environ.get("TORCHELASTIC_RUN_ID"):
+        args.distributed_launcher = "torchrun"
+        prefix = f"torchrun --nproc_per_node={world_size}"
+    else:
+        args.distributed_launcher = "rank-environment"
+        prefix = f"RANK={rank} WORLD_SIZE={world_size} LOCAL_RANK={local_rank} python3"
+    args.reproduction_command = f"{prefix} tests/run_ep.py {shlex.join(sys.argv[1:])}"
+    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
+    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
+    args.image_digest_verified = (
+        os.environ.get("COLLECTIVEX_IMAGE_DIGEST_VERIFIED") == "1"
+    )
+    # Container architecture and local squash hash for Enroot/Pyxis.
+    args.image_arch = machine
+    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
+    # GitHub provenance: repo, run ID, attempt, ref, source SHA, job,
+    # artifact. A result is only publication-'official' when these are present (validity gate).
+    _run = {
+        "run_id": os.environ.get("GITHUB_RUN_ID"),
+        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+        "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+        "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA")
+        or os.environ.get("GITHUB_SHA"),
+        "repo": os.environ.get("GITHUB_REPOSITORY"),
+        "job": os.environ.get("GITHUB_JOB"),
+        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
+    }
+    args.git_run = _run if any(_run.values()) else None
+
+    # Import the backend class only after torch initializes. Every adapter implements
+    # the same fixed v1 profile; the CLI has no precision/mode/contract fallbacks.
+    if args.backend == "mori":
+        from ep_mori import MoRIBackend as Backend
+    elif args.backend == "nccl-ep":
+        from ep_nccl import NCCLBackend as Backend
+    elif args.backend == "uccl":
+        from ep_uccl import UCCLBackend as Backend
+    elif args.backend == "deepep-hybrid":
+        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
+    elif args.backend == "deepep-v2":
+        from ep_deepep_v2 import DeepEPV2Backend as Backend
+    else:
+        from ep_deepep import DeepEPBackend as Backend
+
+    # MoRI uses the gloo+NCCL group shape from its reference; other adapters use NCCL/RCCL.
+    if not dist.is_initialized():
+        if args.backend == "mori":
+            dist.init_process_group(
+                backend="cpu:gloo,cuda:nccl",
+                rank=rank,
+                world_size=world_size,
+                device_id=device,
+            )
+        elif args.backend == "deepep-v2":
+            # PR #605 reuses PyTorch's NCCL communicator through ``_comm_ptr``. Supplying
+            # device_id eagerly forms it before ElasticBuffer construction.
+            dist.init_process_group("nccl", device_id=device)
+        else:
+            dist.init_process_group("nccl")
+
+    args.runtime_fingerprint = _runtime_fingerprint(
+        torch, device, machine=machine, vendor=vendor, arch=accelerator
+    )
+
+    gpus_per_node = args.gpus_per_node or sku["gpus_per_node"]
+    try:
+        expected_nodes = int(
+            os.environ.get("SLURM_NNODES", str(world_size // gpus_per_node))
+        )
+    except ValueError as exc:
+        raise ValueError("SLURM_NNODES must be a positive integer") from exc
+    realized_records: list[tuple[str, int, dict] | None] = [None] * world_size
+    dist.all_gather_object(
+        realized_records,
+        (socket.gethostname(), local_rank, args.runtime_fingerprint),
+    )
+    complete_records = [record for record in realized_records if record is not None]
+    args.realized_placement = _summarize_realized_placement(
+        [(record[0], record[1]) for record in complete_records],
+        expected_nodes=expected_nodes,
+        expected_gpus_per_node=gpus_per_node,
+        expected_world_size=world_size,
+    )
+    args.runtime_fingerprint = _common_runtime_fingerprint(
+        [record[2] for record in complete_records]
+    )
+
+    # Construct + run inside a try so a backend exception prints its FULL traceback to STDOUT on
+    # every rank (prefixed) before re-raising; torchrun only summarizes stderr, hiding it in CI.
+    try:
+        backend = Backend(args, rank, world_size, local_rank, device)
+        if rank == 0:
+            print(
+                f"[run_ep] backend={args.backend} phase={args.phase} mode=normal "
+                f"world={world_size} ep_size={world_size} hidden={args.hidden} "
+                f"topk={args.topk} experts={args.experts} dtype=bf16 "
+                f"routing={args.routing} seed={args.seed}"
+            )
+        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    except Exception:
+        import traceback
+
+        print(
+            f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n"
+            + traceback.format_exc(),
+            flush=True,
+        )
+        raise
+    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
+    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/experimental/CollectiveX/tests/test_deepep_v2_contract.py b/experimental/CollectiveX/tests/test_deepep_v2_contract.py
new file mode 100644
index 0000000000..afd01ea3e6
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_deepep_v2_contract.py
@@ -0,0 +1,1852 @@
+#!/usr/bin/env python3
+"""CPU-only structural and registry tests for the pinned DeepEP V2 path."""
+from __future__ import annotations
+
+import ast
+import copy
+import ctypes
+import hashlib
+import json
+import os
+from pathlib import Path
+from pathlib import PurePosixPath
+import shutil
+import stat
+import subprocess
+import sys
+import tempfile
+import types
+import unittest
+
+HERE = Path(__file__).resolve().parent
+ROOT = HERE.parent
+sys.path.insert(0, str(ROOT))
+
+import capability # noqa: E402
+import contracts # noqa: E402
+import ep_harness # noqa: E402
+import run_ep # noqa: E402
+
+
+COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+TREE = "29809e75c5874e6609dac4804e7b651d5226959f"
+FMT_COMMIT = "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+
+
+def deepep_v2_jit_provenance() -> list[dict[str, str]]:
+    """Fixture: one JIT record per sorted ``contracts.DEEPEP_V2_JIT_KERNELS`` name (hex-repeat digests)."""
+    records = []
+    for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)):
+        cubin, sass, source = (f"{index + shift:x}" * 64 for shift in (1, 2, 3))
+        records.append({
+            "cache_key": f"kernel.{name}.{index:032x}",
+            "cubin_sha256": cubin, "sass_sha256": sass, "source_sha256": source,
+        })
+    return records
+
+
+def hybrid_realized_config() -> dict[str, object]:
+    """Fixture: every config field set to 1, bool fields to True, token_data_type to "UINT16"."""
+    config: dict[str, object] = dict.fromkeys(contracts.HYBRID_REALIZED_CONFIG_FIELDS, 1)
+    config.update(dict.fromkeys(contracts.HYBRID_REALIZED_BOOL_FIELDS, True))
+    config["token_data_type"] = "UINT16"
+    return config
+
+
+def hybrid_jit_provenance(ranks: int = 2) -> tuple[list[str], list[dict[str, object]]]:
+    """Fixture: three kernel keys and, per key, an artifact holding one record for each rank.
+
+    Records under key number i carry ``bytes = 10 + i`` and the hex digit of i + 1 repeated 64 times.
+    """
+    keys = ["combine-key", "dispatch-key", "preprocess-key"]
+    artifacts = []
+    for index, key in enumerate(keys):
+        digest = f"{index + 1:x}" * 64
+        per_rank = [{"bytes": 10 + index, "rank": rank, "sha256": digest}
+                    for rank in range(ranks)]
+        artifacts.append({"kernel_key": key, "rank_artifacts": per_rank})
+    return keys, artifacts
+
+
+def load_uccl_function(name: str, namespace: dict[str, object]):
+    """Exec only the top-level function ``name`` from ep_uccl.py into ``namespace`` and return it.
+    Raises StopIteration when ep_uccl.py has no top-level ``def`` with that name."""
+    path = HERE / "ep_uccl.py"
+    matches = (node for node in ast.parse(path.read_text()).body
+               if isinstance(node, ast.FunctionDef) and node.name == name)
+    module = ast.Module(body=[next(matches)], type_ignores=[])
+    exec(compile(module, str(path), "exec"), namespace)
+    return namespace[name]
+
+
+def operator_config(root: Path) -> dict[str, object]:
+    """Fixture operator config (schema_version 1) whose per-runner path fields all equal str(root)."""
+    path = str(root)
+    def runner(*fields: str, account: bool = True) -> dict[str, object]:
+        head = {"partition": "test", **({"account": "test"} if account else {})}
+        return {**head, **dict.fromkeys(fields, path)}
+
+    runners = {
+        "h100-dgxc": runner("squash_dir"),
+        "h200-dgxc": runner("squash_dir", account=False),
+        "b200-dgxc": runner("squash_dir"),
+        "b300": runner("squash_dir", "stage_dir"),
+        "gb200": {**runner(), "storage_roots": [path]},
+        "gb300": runner("squash_dir", "stage_dir", "enroot_cache_path"),
+        "mi325x": runner("squash_dir", account=False),
+        "mi355x": runner("squash_dir", account=False),
+    }
+    return {"schema_version": 1, "runners": runners}
+
+
+class DeepEPV2ContractTests(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls) -> None:
+ cls.path = HERE / "ep_deepep_v2.py"
+ cls.tree = ast.parse(cls.path.read_text(), str(cls.path))
+
+ def test_capability_is_explicit_for_every_sku(self) -> None:
+ backend = capability.BACKENDS["deepep-v2"]
+ self.assertEqual(
+ (backend["implementation"], backend["commit"], backend["torch"], backend["nccl"]),
+ ("deep_ep.ElasticBuffer", COMMIT, "2.10.0+cu130", "2.30.4"),
+ )
+ self.assertEqual(backend["source"], "deepseek-ai/DeepEP#605+#630")
+ self.assertEqual(backend["communication_backend"], "nccl-device-lsa")
+ self.assertEqual(set(backend["sku_capabilities"]), set(capability.PLATFORMS))
+ for sku, platform in capability.PLATFORMS.items():
+ ok, _ = capability.resolve(sku, "deepep-v2")
+ self.assertEqual(ok, platform["vendor"] == "nvidia")
+ self.assertEqual(
+ set(backend["sku_capabilities"][sku]), {"basis", "schedulable"}
+ )
+
+ def test_adapter_ast_pins_elastic_api_and_weight_semantics(self) -> None:
+ imports = {
+ alias.name
+ for node in ast.walk(self.tree)
+ if isinstance(node, ast.ImportFrom) and node.module == "deep_ep"
+ for alias in node.names
+ }
+ self.assertEqual(imports, {"ElasticBuffer"})
+ constants = {
+ node.targets[0].id: ast.literal_eval(node.value)
+ for node in self.tree.body
+ if isinstance(node, ast.Assign)
+ and len(node.targets) == 1
+ and isinstance(node.targets[0], ast.Name)
+ and isinstance(node.value, ast.Constant)
+ }
+ self.assertEqual(constants["DEEPEP_V2_COMMIT"], COMMIT)
+ self.assertEqual(constants["DEEPEP_V2_TREE"], TREE)
+ self.assertEqual(constants["DEEPEP_V2_FMT_COMMIT"], FMT_COMMIT)
+ self.assertEqual(constants["DEEPEP_V2_PR"], 605)
+ self.assertEqual(constants["DEEPEP_V2_FIX_PR"], 630)
+ self.assertEqual(
+ constants["DEEPEP_V2_JIT_RANDOM_SEED"],
+ "collectivex-deepep-v2-fa8a9b1",
+ )
+ self.assertEqual(constants["NCCL_VERSION"], "2.30.4")
+ self.assertEqual(constants["NVSHMEM_VERSION"], "3.3.9")
+ backend = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend"
+ )
+ assignments = {
+ node.targets[0].id: ast.literal_eval(node.value)
+ for node in backend.body
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Name)
+ and isinstance(node.value, ast.Constant)
+ }
+ self.assertEqual(assignments["combine_weight_semantics"], "unweighted-rank-sum")
+ methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)}
+ self.assertTrue({
+ "dispatch", "inspect_dispatch", "combine_transformed", "capture_deferred_provenance",
+ "finalize",
+ } <= methods)
+ self.assertNotIn("expected", methods)
+ constructor = next(
+ node for node in ast.walk(backend)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "ElasticBuffer"
+ )
+ deterministic = next(
+ keyword for keyword in constructor.keywords if keyword.arg == "deterministic"
+ )
+ self.assertIs(ast.literal_eval(deterministic.value), False)
+ self.assertIn("deterministic", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("tuning_num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("jit_random_seed", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("gin_enabled", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("communication_backend", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("deepep_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("deepep_fix_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ source = self.path.read_text()
+ self.assertIn('getattr(args, "num_logical_experts", args.experts)', source)
+ self.assertIn('"use_expanded_layout": False', source)
+ self.assertIn("allow_hybrid_mode = _configure_gin_mode(args, world_size)", source)
+ self.assertIn("get_theoretical_num_sms(tuning_num_experts, args.topk)", source)
+
+ jit_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_jit_cache_key"
+ )
+ namespace = {"hashlib": __import__("hashlib"), "json": json}
+ exec(compile(ast.Module(body=[jit_function], type_ignores=[]), str(self.path), "exec"), namespace)
+ key = namespace["_jit_cache_key"]
+ baseline = types.SimpleNamespace(
+ runner="h100-dgxc", hidden=7168, topk=8, experts=256,
+ routing="uniform", eplb=False, case_id="uniform",
+ )
+ zipf = types.SimpleNamespace(**{**vars(baseline), "routing": "zipf", "case_id": "zipf"})
+ eplb = types.SimpleNamespace(
+ **{**vars(zipf), "experts": 288, "num_logical_experts": 256, "eplb": True}
+ )
+ realized = {
+ "num_sms": 24,
+ "num_qps": 9,
+ "allocated_qps": 17,
+ "logical_scaleout_ranks": 1,
+ "logical_scaleup_ranks": 8,
+ "physical_rdma_ranks": 2,
+ "physical_nvlink_ranks": 4,
+ "is_scaleup_nvlink": False,
+ "device_arch_major": 9,
+ "device_arch_minor": 0,
+ "device_sms": 132,
+ "device_smem_bytes": 232448,
+ "gpu_timeout_cycles": 198000000000,
+ }
+ direct = key(baseline, 8, 128, False, realized)
+ self.assertTrue(direct.startswith("jitcfg-v3-"))
+ self.assertEqual(direct, key(zipf, 8, 128, False, realized))
+ self.assertNotEqual(direct, key(zipf, 8, 128, True, realized))
+ self.assertNotEqual(direct, key(eplb, 8, 128, False, realized))
+ for field, value in realized.items():
+ changed = not value if type(value) is bool else value + 1
+ self.assertNotEqual(
+ direct,
+ key(baseline, 8, 128, False, {**realized, field: changed}),
+ field,
+ )
+ init = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef) and node.name == "__init__"
+ )
+ buffer_call = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "ElasticBuffer"
+ )
+ jit_config_check = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "_require_cross_rank_equal"
+ and ast.literal_eval(node.args[1]) == "JIT configuration"
+ )
+ cache_assignment = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Subscript)
+ and ast.unparse(node.targets[0].value) == "os.environ"
+ and ast.literal_eval(node.targets[0].slice) == "EP_JIT_CACHE_DIR"
+ )
+ self.assertLess(buffer_call.lineno, jit_config_check.lineno)
+ self.assertLess(jit_config_check.lineno, cache_assignment.lineno)
+ capture = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name == "capture_deferred_provenance"
+ )
+ calls = [node for node in ast.walk(capture) if isinstance(node, ast.Call)]
+ barrier = next(
+ node for node in calls
+ if isinstance(node.func, ast.Attribute) and node.func.attr == "barrier"
+ )
+ self.assertEqual(
+ {keyword.arg: ast.literal_eval(keyword.value) for keyword in barrier.keywords},
+ {"use_comm_stream": True, "with_cpu_sync": True},
+ )
+ scan = next(
+ node for node in calls
+ if isinstance(node.func, ast.Name) and node.func.id == "_jit_artifact_evidence"
+ )
+ self.assertLess(barrier.lineno, scan.lineno)
+ realized_check = next(
+ node for node in ast.walk(backend)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "_require_cross_rank_equal"
+ and len(node.args) > 1
+ and isinstance(node.args[1], ast.Constant)
+ and node.args[1].value == "realized tuning/topology"
+ )
+ self.assertIsInstance(realized_check, ast.Call)
+ self.assertEqual(
+ (ROOT / "tests" / "ep_harness.py").read_text().count(
+ "capture_deferred_provenance()"
+ ),
+ 2,
+ )
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())
+ provenance = schema["properties"]["implementation"]["properties"]["provenance"]
+ self.assertEqual(provenance["properties"]["deterministic"], {"type": "boolean"})
+ self.assertEqual(
+ provenance["properties"]["num_experts"],
+ {"minimum": 1, "type": "integer"},
+ )
+ self.assertEqual(
+ provenance["properties"]["tuning_num_experts"],
+ {"minimum": 1, "type": "integer"},
+ )
+ self.assertEqual(
+ provenance["properties"]["jit_cubins"]["items"],
+ {"$ref": "#/$defs/deepep_v2_jit_cubin"},
+ )
+ self.assertEqual(
+ (
+ provenance["properties"]["jit_cubins"]["minItems"],
+ provenance["properties"]["jit_cubins"]["maxItems"],
+ ),
+ (5, 5),
+ )
+ self.assertEqual(
+ provenance["properties"]["jit_random_seed"],
+ {"const": "collectivex-deepep-v2-fa8a9b1"},
+ )
+ self.assertEqual(provenance["properties"]["allow_hybrid_mode"], {"const": False})
+ self.assertEqual(provenance["properties"]["gin_enabled"], {"const": False})
+ self.assertEqual(provenance["properties"]["deepep_pr"], {"const": 605})
+ self.assertEqual(provenance["properties"]["deepep_fix_pr"], {"const": 630})
+ self.assertEqual(
+ provenance["properties"]["communication_backend"],
+ {"const": "nccl-device-lsa"},
+ )
+ for field, value in (
+ ("num_experts", "288"),
+ ("tuning_num_experts", "not-an-integer"),
+ ("tuning_num_experts", 0),
+ ):
+ with self.subTest(provenance_field=field, value=value):
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues(
+ "deepep-v2", {field: value}
+ ),
+ )
+
+ def test_v2_gin_mode_uses_the_scale_up_domain_and_safe_fallbacks(self) -> None:  # Runs _configure_gin_mode and _lsa_topology_is_valid in isolation; checks return values and EP_DISABLE_GIN.
+ functions = {  # Top-level function definitions of self.tree, indexed by name.
+ node.name: node for node in self.tree.body if isinstance(node, ast.FunctionDef)
+ }
+ namespace = {"os": os}  # `os` is the only global handed to the extracted functions.
+ exec(  # Compile only the two functions under test into `namespace`.
+ compile(
+ ast.Module(
+ body=[
+ functions["_configure_gin_mode"],
+ functions["_lsa_topology_is_valid"],
+ ],
+ type_ignores=[],
+ ),
+ str(self.path),
+ "exec",
+ ),
+ namespace,
+ )
+ configure = namespace["_configure_gin_mode"]
+ topology_is_valid = namespace["_lsa_topology_is_valid"]
+ original = os.environ.get("EP_DISABLE_GIN")  # Saved so the finally block can restore it.
+ try:
+ args = types.SimpleNamespace(scale_up_domain=72, gpus_per_node=4)
+ self.assertFalse(configure(args, 8))
+ self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1")
+
+ os.environ["EP_DISABLE_GIN"] = "stale"  # Pre-set value that configure() is expected to remove.
+ args = types.SimpleNamespace(scale_up_domain=8, gpus_per_node=4)
+ self.assertTrue(configure(args, 16))
+ self.assertNotIn("EP_DISABLE_GIN", os.environ)
+
+ args = types.SimpleNamespace(gpus_per_node=4)  # No scale_up_domain attribute.
+ self.assertTrue(configure(args, 8))
+ self.assertNotIn("EP_DISABLE_GIN", os.environ)
+
+ self.assertFalse(configure(types.SimpleNamespace(), 8))  # No attributes at all.
+ self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1")
+
+ topology = {
+ "physical_rdma_ranks": 1,
+ "physical_nvlink_ranks": 8,
+ "logical_scaleout_ranks": 1,
+ "logical_scaleup_ranks": 8,
+ "is_scaleup_nvlink": True,
+ }
+ self.assertTrue(topology_is_valid(False, 8, topology))
+ self.assertTrue(topology_is_valid(True, 16, topology))
+ topology["physical_nvlink_ranks"] = 4  # Mutated in place for the negative case.
+ self.assertFalse(topology_is_valid(False, 8, topology))
+ finally:  # Put EP_DISABLE_GIN back to its pre-test state.
+ if original is None:
+ os.environ.pop("EP_DISABLE_GIN", None)
+ else:
+ os.environ["EP_DISABLE_GIN"] = original
+
+ def test_ep_adapters_declare_unweighted_rank_sum(self) -> None:  # Each adapter class must set combine_weight_semantics and keep topk_weights out of its combine methods.
+ adapters = {  # Adapter source file name -> backend class name.
+ "ep_deepep.py": "DeepEPBackend",
+ "ep_deepep_v2.py": "DeepEPV2Backend",
+ "ep_deepep_hybrid.py": "DeepEPHybridBackend",
+ "ep_mori.py": "MoRIBackend",
+ "ep_nccl.py": "NCCLBackend",
+ "ep_uccl.py": "UCCLBackend",
+ }
+ for filename, class_name in adapters.items():
+ with self.subTest(adapter=filename):
+ tree = ast.parse((HERE / filename).read_text())  # Inspected via AST; the adapter is not imported.
+ backend = next(
+ node for node in tree.body
+ if isinstance(node, ast.ClassDef) and node.name == class_name
+ )
+ assignment = next(  # Class-body assignment to combine_weight_semantics.
+ node for node in backend.body
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Name)
+ and node.targets[0].id == "combine_weight_semantics"
+ )
+ self.assertEqual(ast.literal_eval(assignment.value), "unweighted-rank-sum")
+ combine_methods = [
+ item for item in backend.body
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
+ and item.name in {"combine", "combine_transformed"}
+ ]
+ self.assertEqual(len(combine_methods), 2)  # Exactly two matching methods are expected.
+ for method in combine_methods:
+ self.assertNotIn("topk_weights", ast.unparse(method))
+ self.assertNotIn("combine_topk_weights", ast.unparse(method))  # Already implied above: this needle contains "topk_weights".
+
+ def test_deepep_v2_jit_evidence_is_strict_and_stable(self) -> None:  # Checks the cubin validator on malformed inputs, then runs capture_deferred_provenance against fakes.
+ valid = deepep_v2_jit_provenance()
+ self.assertTrue(contracts._deepep_v2_jit_cubins_are_valid(valid))
+ for invalid in (  # Every variant below must be rejected by the validator.
+ [],
+ [{**valid[0], "path": "/private/kernel.cubin"}],
+ [{**item, "cache_key": "dispatch"} for item in valid],
+ [{**item, "cubin_sha256": "invalid"} for item in valid],
+ valid[:-1],
+ [*valid, valid[0]],
+ [
+ *valid,
+ {
+ **valid[0],
+ "cache_key": valid[0]["cache_key"][:-32] + "f" * 32,
+ },
+ ],
+ ):
+ with self.subTest(invalid=invalid):
+ self.assertFalse(contracts._deepep_v2_jit_cubins_are_valid(invalid))
+
+ backend = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend"
+ )
+ capture = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name == "capture_deferred_provenance"
+ )
+ artifacts = copy.deepcopy(valid)  # Mutable copy; edited later to simulate a changed artifact.
+
+ class FakeBuffer:
+ @staticmethod
+ def barrier(*, use_comm_stream: bool, with_cpu_sync: bool) -> None:  # `self` below is the enclosing TestCase, captured by closure.
+ self.assertTrue(use_comm_stream)
+ self.assertTrue(with_cpu_sync)
+
+ namespace = {  # Stand-ins injected as globals for the extracted method.
+ "torch": types.SimpleNamespace(
+ cuda=types.SimpleNamespace(synchronize=lambda: None)
+ ),
+ "_jit_artifact_evidence": lambda: copy.deepcopy(artifacts),
+ "_require_cross_rank_equal": lambda _value, _label: None,
+ }
+ exec(
+ compile(ast.Module(body=[capture], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ state = types.SimpleNamespace(  # Plays the role of `self` for the extracted method.
+ buffer=FakeBuffer(),
+ _deferred_jit_snapshot=None,
+ backend_provenance={"jit_cubins": []},
+ )
+ namespace["capture_deferred_provenance"](state)  # Two captures with unchanged artifacts must not raise.
+ namespace["capture_deferred_provenance"](state)
+ artifacts[0]["cubin_sha256"] = "f" * 64  # Change one hash so the next capture sees different artifacts.
+ with self.assertRaisesRegex(RuntimeError, "changed after measurement"):
+ namespace["capture_deferred_provenance"](state)
+
+ def test_deepep_v2_jit_files_are_complete_regular_and_content_bound(self) -> None:  # Runs _sha256 and _jit_artifact_evidence against a temporary on-disk cache.
+ functions = [
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name in {"_sha256", "_jit_artifact_evidence"}
+ ]
+ namespace = {  # Globals supplied to the extracted functions.
+ "hashlib": hashlib,
+ "os": os,
+ "Path": Path,
+ "re": __import__("re"),
+ "DEEPEP_V2_JIT_KERNELS": contracts.DEEPEP_V2_JIT_KERNELS,
+ }
+ exec(compile(ast.Module(body=functions, type_ignores=[]), str(self.path), "exec"), namespace)
+ with tempfile.TemporaryDirectory() as temporary:
+ cache = Path(temporary) / "cache"
+ cache.mkdir()
+ for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)):  # One directory per kernel name.
+ kernel = cache / f"kernel.{name}.{index:032x}"
+ kernel.mkdir()
+ for suffix in ("cu", "cubin", "sass"):  # Each directory gets kernel.cu, kernel.cubin and kernel.sass.
+ (kernel / f"kernel.{suffix}").write_bytes(f"{name}-{suffix}".encode())
+ old_cache = os.environ.get("EP_JIT_CACHE_DIR")  # Saved for restoration in the finally block.
+ os.environ["EP_JIT_CACHE_DIR"] = temporary  # Note: the parent of `cache`, not `cache` itself.
+ try:
+ evidence = namespace["_jit_artifact_evidence"]()
+ self.assertEqual(len(evidence), len(contracts.DEEPEP_V2_JIT_KERNELS))
+ self.assertEqual(
+ set(evidence[0]),
+ {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"},
+ )
+ first = cache / evidence[0]["cache_key"]
+ duplicate = cache / (evidence[0]["cache_key"][:-32] + "f" * 32)  # Same key with its last 32 characters replaced by "f".
+ duplicate.mkdir()
+ for suffix in ("cu", "cubin", "sass"):
+ (duplicate / f"kernel.{suffix}").write_bytes(b"duplicate")
+ with self.assertRaisesRegex(RuntimeError, "kernel set"):
+ namespace["_jit_artifact_evidence"]()
+ shutil.rmtree(duplicate)  # Remove the extra directory before the next scenario.
+ (first / "kernel.sass").unlink()  # Missing artifact: expect the "incomplete" error.
+ with self.assertRaisesRegex(RuntimeError, "incomplete"):
+ namespace["_jit_artifact_evidence"]()
+ (first / "kernel.sass").symlink_to(first / "kernel.cubin")  # Symlink in its place: expect the "regular file" error.
+ with self.assertRaisesRegex(RuntimeError, "regular file"):
+ namespace["_jit_artifact_evidence"]()
+ finally:  # Put EP_JIT_CACHE_DIR back to its pre-test state.
+ if old_cache is None:
+ os.environ.pop("EP_JIT_CACHE_DIR", None)
+ else:
+ os.environ["EP_JIT_CACHE_DIR"] = old_cache
+
+ def test_runtime_and_shared_version_formatter_are_valid(self) -> None:  # Syntax-checks the runtime script, then exercises version and library-evidence helpers with fakes.
+ subprocess.run(  # `bash -n` parses the script without executing it; check=True fails the test on a non-zero exit.
+ ["bash", "-n", str(ROOT / "runtime" / "run_in_container.sh")],
+ check=True,
+ )
+ self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4")
+ self.assertEqual(ep_harness.format_collective_version((2, 30, 4)), "2.30.4")
+ source = self.path.read_text()
+ version_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_loaded_nccl_version"
+ )
+
+ class FakeNccl:  # Stand-in for the object returned by the patched CDLL below.
+ @staticmethod
+ def ncclGetVersion(pointer) -> int:
+ pointer._obj.value = 23004  # Writes the fake version into the object wrapped by byref().
+ return 0
+
+ namespace = {  # Globals for the extracted function; CDLL is replaced, byref and c_int are the real ctypes ones.
+ "ctypes": types.SimpleNamespace(
+ CDLL=lambda _path: FakeNccl(), byref=ctypes.byref, c_int=ctypes.c_int,
+ ),
+ "ep_harness": ep_harness,
+ "os": os,
+ "_loaded_library_paths": lambda: {"/safe/libnccl.so.2"},
+ }
+ exec(
+ compile(ast.Module(body=[version_function], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ self.assertEqual(namespace["_loaded_nccl_version"](), "2.30.4")
+ for paths in (set(), {"/safe/libnccl.so.2", "/other/libnccl.so.2"}):  # No path and two paths must both be rejected.
+ namespace["_loaded_library_paths"] = lambda paths=paths: paths  # Default argument binds the current `paths` value.
+ with self.assertRaisesRegex(RuntimeError, "exactly one"):
+ namespace["_loaded_nccl_version"]()
+ evidence_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_loaded_library_evidence"
+ )
+ paths = {
+ "/safe/_C.cpython-310-x86_64-linux-gnu.so",
+ "/safe/libnccl.so.2",
+ "/safe/libnvshmem_host.so.3",
+ }
+ namespace.update(  # Stub the path listing and hashing so no real files are read by these two helpers.
+ _loaded_library_paths=lambda: paths,
+ _sha256=lambda _path: "a" * 64,
+ )
+ exec(
+ compile(ast.Module(body=[evidence_function], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ evidence = namespace["_loaded_library_evidence"]()
+ self.assertIn(
+ {"name": "deep_ep._C", "role": "deepep-extension", "sha256": "a" * 64},
+ evidence,
+ )
+ self.assertTrue(
+ contracts._content_evidence_is_valid(
+ evidence, {"deepep-extension", "nccl", "nvshmem"}
+ )
+ )
+ self.assertNotIn("torch.cuda.nccl.version()", source)  # The source text must not contain this call string.
+ fingerprint = {"runtime": "cuda", "version": "13.0"}
+ self.assertIs(  # Identity check: the first fingerprint object itself is expected back.
+ run_ep._common_runtime_fingerprint([fingerprint, dict(fingerprint)]),
+ fingerprint,
+ )
+ with self.assertRaises(ValueError):  # Differing fingerprints must raise.
+ run_ep._common_runtime_fingerprint([fingerprint, {"runtime": "cuda", "version": "12.8"}])
+
+ def test_conditioning_contract_is_exact_for_each_phase(self) -> None:  # validate_conditioning_contract must accept the exact per-phase contract and reject each mutation.
+ expected = {  # Phase name -> ladder used to build the valid contract for that phase.
+ "decode": [1, 2, 4, 8, 16, 32, 64, 128],
+ "prefill": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
+ }
+ for phase, ladder in expected.items():
+ valid = {
+ "contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "ladder": ladder,
+ "roundtrips_per_shape": 8,
+ }
+ self.assertIs(contracts.validate_conditioning_contract(valid, phase), valid)  # The same object is expected back.
+ for mutate in (  # Each mutation is applied to a deep copy and must raise ContractError.
+ lambda item: item["ladder"].reverse(),
+ lambda item: item["ladder"].pop(),
+ lambda item: item.update(ladder=[1.0, *item["ladder"][1:]]),
+ lambda item: item.update(roundtrips_per_shape=7),
+ lambda item: item.update(roundtrips_per_shape=8.0),
+ ):
+ changed = copy.deepcopy(valid)
+ mutate(changed)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_conditioning_contract(changed, phase)
+ other = "prefill" if phase == "decode" else "decode"  # The opposite phase.
+ with self.assertRaises(contracts.ContractError):  # A valid contract checked against the other phase must fail.
+ contracts.validate_conditioning_contract(valid, other)
+
+ def test_content_manifest_evidence_is_stable_and_content_sensitive(self) -> None:  # Evidence must be path-free, order-independent and sensitive to file content.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ first, second = root / "first", root / "second"
+ first.write_bytes(b"first")
+ second.write_bytes(b"second")
+ files = [("pkg/first", first), ("pkg/second", second)]  # (manifest name, path on disk) pairs.
+ evidence = contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=files,
+ )
+ self.assertNotIn(temporary, json.dumps(evidence))  # The temporary directory path must not appear in the evidence.
+ self.assertEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=reversed(files),  # Same files, reverse order.
+ ),
+ )
+ self.assertRegex(evidence["sha256"], r"^[0-9a-f]{64}$")
+ second.write_bytes(b"changed")  # Changed content must produce different evidence.
+ self.assertNotEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=files,
+ ),
+ )
+ for invalid in (  # Each of these inputs must raise ContractError.
+ [("../first", first)],
+ [("same", first), ("same", second)],
+ [("missing", root / "missing")],
+ ):
+ with self.assertRaises(contracts.ContractError):
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=invalid,
+ )
+
+ def test_hybrid_realized_config_and_jit_evidence_are_path_free(self) -> None:  # Runs _hybrid_realized_config and _hybrid_jit_evidence extracted from ep_deepep_hybrid.py.
+ path = HERE / "ep_deepep_hybrid.py"
+ tree = ast.parse(path.read_text(), str(path))
+ selected = [  # The HYBRID_CONFIG_FIELDS assignment plus the three named functions.
+ node for node in tree.body
+ if (
+ isinstance(node, ast.Assign)
+ and any(
+ isinstance(target, ast.Name) and target.id == "HYBRID_CONFIG_FIELDS"
+ for target in node.targets
+ )
+ )
+ or isinstance(node, ast.FunctionDef)
+ and node.name in {
+ "_hybrid_realized_config", "_sha256_with_size", "_hybrid_jit_evidence",
+ }
+ ]
+ namespace = {"Path": Path, "hashlib": hashlib, "re": __import__("re")}  # Globals supplied to the extracted code.
+ exec(compile(ast.Module(body=selected, type_ignores=[]), str(path), "exec"), namespace)
+ fields = namespace["HYBRID_CONFIG_FIELDS"]
+ self.assertEqual(set(fields), contracts.HYBRID_REALIZED_CONFIG_FIELDS)
+
+ class TokenType:  # Test double: str() yields `label`; `name` exists only when one is passed.
+ def __init__(self, label: str, name: str | None = None) -> None:
+ self.label = label
+ if name is not None:
+ self.name = name
+
+ def __str__(self) -> str:
+ return self.label
+
+ values = {field: 1 for field in fields}  # Every field starts as 1.
+ values.update({field: True for field in contracts.HYBRID_REALIZED_BOOL_FIELDS})  # Bool fields are overridden with True.
+ for raw, expected in (("uint16_t", "UINT16"), ("uint8_t", "UINT8")):  # (str() label, expected realized value).
+ values["token_data_type"] = TokenType(raw)
+ config = types.SimpleNamespace(**values)
+ realized = namespace["_hybrid_realized_config"](config)
+ self.assertEqual(realized["token_data_type"], expected)
+ self.assertEqual(set(realized), contracts.HYBRID_REALIZED_CONFIG_FIELDS)
+ values["token_data_type"] = TokenType("opaque-enum", "UINT16")  # Label not used above, but a `name` of "UINT16".
+ self.assertEqual(
+ namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))[
+ "token_data_type"
+ ],
+ "UINT16",
+ )
+ values["token_data_type"] = TokenType("UINT16")  # Label "UINT16" with no `name` attribute must be rejected.
+ with self.assertRaisesRegex(RuntimeError, "token_data_type is invalid"):
+ namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))
+ values["token_data_type"] = TokenType("uint16_t")
+ config = types.SimpleNamespace(**values)
+ delattr(config, "hidden_dim")  # Drop one field to trigger the "omits hidden_dim" error.
+ with self.assertRaisesRegex(RuntimeError, "omits hidden_dim"):
+ namespace["_hybrid_realized_config"](config)
+
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ for key, payload in (  # Three <key>.so files with distinct contents.
+ ("preprocess-key", b"pre"),
+ ("combine-key", b"combine"),
+ ("dispatch-key", b"dispatch"),
+ ):
+ (root / f"{key}.so").write_bytes(payload)
+ evidence = namespace["_hybrid_jit_evidence"](root)
+ self.assertEqual(  # Expected in this (alphabetical) order, unlike the creation order above.
+ [item["kernel_key"] for item in evidence],
+ ["combine-key", "dispatch-key", "preprocess-key"],
+ )
+ self.assertNotIn(temporary, json.dumps(evidence))  # The temporary directory path must not appear in the evidence.
+ (root / "dispatch-key.so").write_bytes(b"changed")  # Changed content must change the evidence.
+ self.assertNotEqual(evidence, namespace["_hybrid_jit_evidence"](root))
+ (root / "extra-key.so").write_bytes(b"extra")  # A fourth .so file must be rejected.
+ with self.assertRaisesRegex(RuntimeError, "expected 3"):
+ namespace["_hybrid_jit_evidence"](root)
+ (root / "extra-key.so").unlink()
+ (root / "bad key.so").write_bytes(b"bad")  # A file name containing a space must be rejected.
+ with self.assertRaisesRegex(RuntimeError, "kernel key"):
+ namespace["_hybrid_jit_evidence"](root)
+ (root / "bad key.so").unlink()
+ (root / "combine-key.so").unlink()
+ (root / "combine-key.so").symlink_to(root / "dispatch-key.so")  # A symlink in place of a file must be rejected.
+ with self.assertRaisesRegex(RuntimeError, "regular file"):
+ namespace["_hybrid_jit_evidence"](root)
+ empty = root / "empty"
+ empty.mkdir()
+ with self.assertRaisesRegex(RuntimeError, "expected 3"):  # An empty directory must be rejected too.
+ namespace["_hybrid_jit_evidence"](empty)
+
+ def test_hybrid_deferred_provenance_wraps_before_conditioning_and_recaptures(self) -> None:  # Checks ordering in DeepEPHybridBackend.__init__, runs capture_deferred_provenance on fakes, then checks call placement in ep_harness.py.
+ path = HERE / "ep_deepep_hybrid.py"
+ source = path.read_text()
+ tree = ast.parse(source, str(path))
+ backend = next(
+ node for node in tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPHybridBackend"
+ )
+ methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)}
+ self.assertIn("capture_deferred_provenance", methods)
+ constructor = next(node for node in backend.body if isinstance(node, ast.FunctionDef) and node.name == "__init__")
+ buffer_call = next(  # A call to the bare name HybridEPBuffer inside __init__.
+ node for node in ast.walk(constructor)
+ if isinstance(node, ast.Call) and isinstance(node.func, ast.Name)
+ and node.func.id == "HybridEPBuffer"
+ )
+ wrapper_install = next(  # An assignment whose target is an attribute named update_template_config.
+ node for node in ast.walk(constructor)
+ if isinstance(node, ast.Assign)
+ and any(
+ isinstance(target, ast.Attribute)
+ and target.attr == "update_template_config"
+ for target in node.targets
+ )
+ )
+ cache_line = source[:source.index('os.environ["HYBRID_EP_CACHE_DIR"]')].count("\n") + 1  # 1-based line of the first occurrence of this text.
+ self.assertLess(cache_line, buffer_call.lineno)  # Cache-dir text sits on an earlier line than the buffer call.
+ self.assertLess(buffer_call.lineno, wrapper_install.lineno)  # Buffer call sits on an earlier line than the wrapper assignment.
+
+ capture = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef) and node.name == "capture_deferred_provenance"
+ )
+ called = {  # Names of functions and attributes called inside the method.
+ node.func.id if isinstance(node.func, ast.Name) else node.func.attr
+ for node in ast.walk(capture) if isinstance(node, ast.Call)
+ and isinstance(node.func, (ast.Name, ast.Attribute))
+ }
+ self.assertTrue({"_hybrid_jit_evidence", "_require_cross_rank_equal", "all_gather_object"} <= called)
+ self.assertIn("changed after measurement", ast.get_source_segment(source, capture))
+
+ artifacts = [[  # Nested list; the stub below returns a deep copy of artifacts[0].
+ {"bytes": 1, "kernel_key": key, "sha256": digit * 64}
+ for key, digit in (("a", "1"), ("b", "2"), ("c", "3"))
+ ]]
+
+ class FakeCuda:
+ @staticmethod
+ def synchronize() -> None:
+ return None
+
+ class FakeDist:  # Simulates two ranks that always report identical values.
+ @staticmethod
+ def barrier() -> None:
+ return None
+
+ @staticmethod
+ def get_world_size() -> int:
+ return 2
+
+ @staticmethod
+ def all_gather_object(output, value) -> None:
+ output[:] = [copy.deepcopy(value), copy.deepcopy(value)]  # Fills the output list in place with two copies.
+
+ namespace = {  # Stand-ins injected as globals for the extracted method.
+ "torch": types.SimpleNamespace(cuda=FakeCuda),
+ "dist": FakeDist,
+ "_hybrid_jit_evidence": lambda _root: copy.deepcopy(artifacts[0]),
+ "_require_cross_rank_equal": lambda _value, _label: None,
+ }
+ exec(compile(ast.Module(body=[capture], type_ignores=[]), str(path), "exec"), namespace)
+ state = types.SimpleNamespace(  # Plays the role of `self` for the extracted method.
+ _deferred_jit_diagnostics=None,
+ _deferred_semantic_snapshot=None,
+ _jit_root=Path("private-cache"),
+ _realized_config=hybrid_realized_config(),
+ backend_provenance={},
+ )
+ namespace["capture_deferred_provenance"](state)  # First capture; no error expected.
+ artifacts[0][0]["kernel_key"] = "changed"  # A different kernel key must be reported as a changed kernel set.
+ with self.assertRaisesRegex(RuntimeError, "kernel set changed"):
+ namespace["capture_deferred_provenance"](state)
+ artifacts[0][0]["kernel_key"] = "a"  # Restore the key, then change only the hash.
+ artifacts[0][0]["sha256"] = "f" * 64
+ with self.assertRaisesRegex(RuntimeError, "artifacts changed"):
+ namespace["capture_deferred_provenance"](state)
+
+ harness = (HERE / "ep_harness.py").read_text()
+ captures = [  # Character offsets of every occurrence of the call text.
+ index for index in range(len(harness))
+ if harness.startswith("capture_deferred_provenance()", index)
+ ]
+ self.assertEqual(len(captures), 2)
+ self.assertLess(harness.index("for wt in conditioning_ladder:"), captures[0])  # First call comes after the conditioning loop header.
+ self.assertLess(captures[0], harness.index("oracle = _run_expert_oracle("))  # First call comes before the oracle assignment.
+ self.assertLess(harness.index("trace_sig = hashlib.sha256"), captures[1])  # Second call comes after the trace_sig line.
+
+ def test_hybrid_diagnostic_hashes_do_not_split_series_identity(self) -> None:  # _series_provenance must ignore artifact and library hashes but react to keys, realized config and deepep_tree.
+ keys, artifacts = hybrid_jit_provenance()
+ provenance = {
+ "deepep_tree": "b" * 40,
+ "jit_kernel_keys": keys,
+ "jit_shared_objects": artifacts,
+ "loaded_libraries": [{
+ "name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension",
+ "sha256": "a" * 64,
+ }],
+ "realized_config": hybrid_realized_config(),
+ }
+ baseline = ep_harness._series_provenance(provenance)
+ changed = copy.deepcopy(provenance)  # Each scenario below starts from a fresh deep copy.
+ changed["jit_shared_objects"][0]["rank_artifacts"][0]["sha256"] = "f" * 64  # Per-rank artifact hash: result must not change.
+ self.assertEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["loaded_libraries"][0]["sha256"] = "f" * 64  # Library hash: result must not change.
+ self.assertEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["jit_kernel_keys"][0] = "changed-key"  # Kernel key: result must change.
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["realized_config"]["num_of_blocks_dispatch_api"] += 1  # Realized config value: result must change.
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["deepep_tree"] = "c" * 40  # Source tree id: result must change.
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+
+ def test_v2_series_identity_uses_source_and_sass_not_container_metadata(self) -> None:  # series_provenance must ignore the two hash edits below but react to each mutation in the loop.
+ provenance = {
+ "deepep_tree": "a" * 40,
+ "loaded_libraries": [
+ {"name": "deep_ep._C.so", "role": "deepep-extension", "sha256": "1" * 64},
+ {"name": "libnccl.so.2", "role": "nccl", "sha256": "2" * 64},
+ ],
+ "jit_cubins": deepep_v2_jit_provenance(),
+ "jit_random_seed": "collectivex-deepep-v2-fa8a9b1",
+ }
+ baseline = contracts.series_provenance(provenance)
+ changed = copy.deepcopy(provenance)
+ changed["loaded_libraries"][0]["sha256"] = "f" * 64  # Extension hash: result must not change.
+ changed["jit_cubins"][0]["cubin_sha256"] = "e" * 64  # Cubin hash: result must not change.
+ self.assertEqual(contracts.series_provenance(changed), baseline)
+ for mutate in (  # Each mutation is applied to a fresh deep copy and must change the result.
+ lambda item: item["loaded_libraries"][1].update(sha256="f" * 64),
+ lambda item: item["jit_cubins"][0].update(source_sha256="f" * 64),
+ lambda item: item["jit_cubins"][0].update(sass_sha256="f" * 64),
+ lambda item: item.update(deepep_tree="f" * 40),
+ ):
+ changed = copy.deepcopy(provenance)
+ mutate(changed)
+ self.assertNotEqual(contracts.series_provenance(changed), baseline)
+
+ def test_mnnvl_resolution_has_no_ambiguous_signature_fallback(self) -> None:  # resolve_deepep_mnnvl: the not-requested case, the explicit allow_mnnvl case, and the rejected case.
+ self.assertEqual(  # Not requested: empty kwargs and the "not-requested" label.
+ contracts.resolve_deepep_mnnvl(
+ requested=False, signature_parameters=(), deepep_commit=None,
+ ),
+ ({}, "not-requested"),
+ )
+ self.assertEqual(  # Requested with allow_mnnvl among the signature parameters.
+ contracts.resolve_deepep_mnnvl(
+ requested=True, signature_parameters=("allow_mnnvl",),
+ deepep_commit="a" * 40,
+ ),
+ ({"allow_mnnvl": True}, "explicit-allow-mnnvl"),
+ )
+ with self.assertRaises(contracts.ContractError):  # Requested with no signature parameters must raise.
+ contracts.resolve_deepep_mnnvl(
+ requested=True, signature_parameters=(),
+ deepep_commit="814e508537c6ffc775d59f6f1b9ba43f3a65968c",
+ )
+
+ def test_backend_provenance_requires_lineage_and_content_hashes(self) -> None:  # backend_provenance_issues must accept the records built here and name the field broken by each mutation.
+ def record(role: str, name: str, digit: str) -> dict[str, str]:  # Library record whose sha256 is `digit` repeated 64 times.
+ return {"role": role, "name": name, "sha256": digit * 64}
+
+ hybrid_keys, hybrid_artifacts = hybrid_jit_provenance()
+ v2 = {  # Valid provenance for the "deepep-v2" backend.
+ **contracts.DEEPEP_V2_V1_PROVENANCE,
+ "api_signature_sha256": "c" * 64,
+ "loaded_libraries": [
+ record("deepep-extension", "deep_ep._C", "1"),
+ record("nccl", "libnccl.so.2", "2"),
+ record("nvshmem", "libnvshmem_host.so.3", "3"),
+ ],
+ "jit_cubins": deepep_v2_jit_provenance(),
+ "jit_random_seed": "collectivex-deepep-v2-fa8a9b1",
+ "deterministic": False,
+ "num_experts": 256,
+ "tuning_num_experts": 256,
+ }
+ deepep = {  # Valid provenance for the "deepep" backend.
+ "deepep_version": "1.1.0", "deepep_commit": "a" * 40,
+ "backend_lineage": "deepep-v1", "allow_mnnvl": False,
+ "mnnvl_comm": "not-requested",
+ }
+ hybrid = {  # Valid provenance for the "deepep-hybrid" backend.
+ "deepep_commit": "a" * 40, "deepep_tree": "b" * 40,
+ "branch": "hybrid-ep", "backend_lineage": "deepep-hybrid",
+ "loaded_libraries": [
+ record("deepep-extension", "deep_ep_cpp", "1"),
+ record("deepep-hybrid-extension", "hybrid_ep_cpp", "2"),
+ ],
+ "jit_kernel_keys": hybrid_keys,
+ "jit_shared_objects": hybrid_artifacts,
+ "realized_config": hybrid_realized_config(),
+ }
+ uccl = {  # Valid provenance for the "uccl" backend.
+ "uccl_version": "0.1.1", "uccl_commit": "pkg-0.1.1",
+ "uccl_wrapper_commit": "c" * 40, "backend_lineage": "uccl",
+ "uccl_dependency_versions": dict(contracts.UCCL_DEPENDENCY_VERSIONS),
+ "loaded_libraries": [
+ record("uccl-distribution", "uccl-0.1.1", "3"),
+ record("uccl-wrapper", "uccl-deepep-wrapper", "4"),
+ record("intervaltree-distribution", "intervaltree-3.1.0", "5"),
+ record("sortedcontainers-distribution", "sortedcontainers-2.4.0", "6"),
+ record("cuda-runtime", "nvidia-cuda-runtime-cu12-12.9.79", "7"),
+ ],
+ }
+ reference = {  # Valid provenance for the "nccl-ep" backend.
+ "nccl_version": "2.30.4", "collective_library": "nccl",
+ "backend_lineage": "nccl",
+ }
+ for backend, provenance in (
+ ("deepep", deepep), ("deepep-v2", v2), ("deepep-hybrid", hybrid),
+ ("uccl", uccl), ("nccl-ep", reference),
+ ):
+ self.assertEqual(contracts.backend_provenance_issues(backend, provenance), [])  # A valid record yields no issues.
+ changed = copy.deepcopy(provenance)
+ if "loaded_libraries" in changed:  # Corrupt a library hash when libraries exist, otherwise the lineage.
+ changed["loaded_libraries"][0]["sha256"] = "invalid"
+ expected = "loaded_libraries"
+ else:
+ changed["backend_lineage"] = "wrong"
+ expected = "backend_lineage"
+ self.assertIn(expected, contracts.backend_provenance_issues(backend, changed))
+
+ changed = copy.deepcopy(uccl)
+ changed["uccl_dependency_versions"]["intervaltree"] = "3.2.0"  # A different dependency version must be reported.
+ self.assertIn(
+ "uccl_dependency_versions",
+ contracts.backend_provenance_issues("uccl", changed),
+ )
+ changed = copy.deepcopy(uccl)
+ changed["loaded_libraries"] = [  # Drop the sortedcontainers record; the omission must be reported.
+ item
+ for item in changed["loaded_libraries"]
+ if item["role"] != "sortedcontainers-distribution"
+ ]
+ self.assertIn(
+ "loaded_libraries", contracts.backend_provenance_issues("uccl", changed)
+ )
+
+ for field, mutate in (  # (field expected in the issues, mutation applied to a hybrid copy).
+ ("realized_config", lambda item: item["realized_config"].pop("hidden_dim")),
+ ("jit_kernel_keys", lambda item: item["jit_kernel_keys"].reverse()),
+ (
+ "jit_shared_objects",
+ lambda item: item["jit_shared_objects"][0]["rank_artifacts"][0].update(
+ sha256="invalid"
+ ),
+ ),
+ ):
+ with self.subTest(hybrid_field=field):
+ changed = copy.deepcopy(hybrid)
+ mutate(changed)
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-hybrid", changed),
+ )
+
+ for field, value in (  # (field expected in the issues, replacement value for a v2 copy).
+ ("jit_cubins", [{"cache_key": "invalid", "cubin_sha256": "4" * 64}]),
+ ("jit_random_seed", "different-seed"),
+ ):
+ with self.subTest(v2_field=field):
+ changed = copy.deepcopy(v2)
+ changed[field] = value
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ )
+
+ changed = copy.deepcopy(v2)
+ changed["gin_enabled"] = True
+ self.assertIn("gin_enabled", contracts.backend_provenance_issues("deepep-v2", changed))
+ changed = copy.deepcopy(v2)
+ changed["communication_backend"] = "nccl-gin"
+ self.assertIn(
+ "communication_backend", contracts.backend_provenance_issues("deepep-v2", changed)
+ )
+ changed = copy.deepcopy(v2)
+ changed.update(  # Three fields changed at once.
+ allow_hybrid_mode=True,
+ gin_enabled=True,
+ communication_backend="nccl-gin",
+ )
+ self.assertEqual(  # The result must be exactly this list, in this order.
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ ["allow_hybrid_mode", "communication_backend", "gin_enabled"],
+ )
+ for field, expected in contracts.DEEPEP_V2_V1_PROVENANCE.items():
+ with self.subTest(v2_pin_field=field):
+ changed = copy.deepcopy(v2)
+ changed[field] = not expected if type(expected) is bool else "wrong"  # Flip booleans; replace anything else with "wrong".
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ )
+
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())  # The remaining checks read the JSON schema file.
+ provenance_schema = schema["properties"]["implementation"]["properties"]["provenance"]
+ self.assertEqual(
+ provenance_schema["properties"]["realized_config"],
+ {"$ref": "#/$defs/hybrid_realized_config"},
+ )
+ self.assertFalse(schema["$defs"]["hybrid_realized_config"]["additionalProperties"])
+ self.assertEqual(provenance_schema["properties"]["jit_kernel_keys"]["minItems"], 3)
+ self.assertEqual(provenance_schema["properties"]["jit_shared_objects"]["minItems"], 3)
+
+ self.assertEqual(contracts.collective_kernel_generation("nccl"), "nccl")
+ self.assertEqual(contracts.collective_kernel_generation("rccl"), "rccl")
+ with self.assertRaises(contracts.ContractError):  # An unrecognized library name must raise.
+ contracts.collective_kernel_generation("unknown")
+
+ def test_routing_control_binds_binary_but_allows_treatment_configuration(self) -> None:  # The control hash must ignore the treatment fields changed below but react to deepep_tree and num_sms.
+ hybrid_keys, hybrid_artifacts = hybrid_jit_provenance()
+ implementation = {  # Baseline implementation record.
+ "kernel_generation": "hybrid",
+ "name": "deepep-hybrid",
+ "provenance": {
+ "deepep_tree": "a" * 40,
+ "loaded_libraries": [{
+ "role": "deepep-extension", "name": "deep_ep_cpp", "sha256": "1" * 64,
+ }],
+ "local_experts": 32,
+ "num_experts": 256,
+ "num_sms": 24,
+ "jit_cache_key": "case-one",
+ "jit_cubins": [{"cache_key": "one", "cubin_sha256": "2" * 64}],
+ "jit_kernel_keys": hybrid_keys,
+ "jit_shared_objects": hybrid_artifacts,
+ "realized_config": hybrid_realized_config(),
+ },
+ "resource_profile": {"configured_units": 24},
+ }
+ baseline = contracts.routing_implementation_control_sha256(implementation)
+ treatment = copy.deepcopy(implementation)
+ treatment["provenance"].update({  # None of the fields changed here may affect the hash.
+ "local_experts": 36,
+ "num_experts": 288,
+ "jit_cache_key": "case-two",
+ "jit_cubins": [{"cache_key": "two", "cubin_sha256": "3" * 64}],
+ "jit_kernel_keys": ["changed-a", "changed-b", "changed-c"],
+ "jit_shared_objects": hybrid_jit_provenance(3)[1],
+ "realized_config": {
+ **hybrid_realized_config(),
+ "num_of_experts_per_rank": 36,
+ },
+ })
+ self.assertEqual(
+ contracts.routing_implementation_control_sha256(treatment), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["loaded_libraries"][0]["sha256"] = "4" * 64  # Library hash: the control hash must not change.
+ self.assertEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["deepep_tree"] = "b" * 40  # Source tree id: the control hash must change.
+ self.assertNotEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["num_sms"] = 20  # num_sms: the control hash must change.
+ self.assertNotEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+
+ def test_runtime_pins_uccl_wheel_and_hybrid_source_tree(self) -> None:
+ runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ self.assertIn("cd /ix/experimental/CollectiveX", runtime)
+ for launcher_name in ("launch_single-slurm.sh", "launch_gb-nv.sh"):
+ launcher = (ROOT / "launchers" / launcher_name).read_text()
+ self.assertIn("$MOUNT_SRC:/ix", launcher)
+ self.assertIn("cx_prepare_backend_cache", launcher)
+ self.assertNotIn('$(cx_prepare_backend_cache', launcher)
+ self.assertIn('BACKEND_CACHE="$CX_PREPARED_BACKEND_CACHE"', launcher)
+ self.assertIn("$BACKEND_CACHE:/cx-cache", launcher)
+ self.assertIn("CX_BACKEND_CACHE_ROOT=/cx-cache", launcher)
+ self.assertIn("CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources", launcher)
+ self.assertIn('|| [ "$CX_BENCH" = deepep-hybrid ]', launcher)
+ self.assertIn("cx_prepare_backend_source", launcher)
+ cache_block = launcher[launcher.index('if [ "$CX_BENCH" = deepep-v2 ]'):]
+ self.assertLess(
+ cache_block.index("cx_set_failure_stage backend-setup"),
+ cache_block.index("cx_prepare_backend_cache"),
+ )
+ self.assertLess(
+ cache_block.index("cx_prepare_backend_source"),
+ cache_block.index("cx_set_failure_stage scheduler-allocation"),
+ )
+ self.assertIn("--frandom-seed=$seed", runtime)
+ self.assertIn("DEEPEP_V2_JIT_RANDOM_SEED", runtime)
+ persisted = runtime[runtime.index("cx_persist_backend_env()") :]
+ self.assertIn("CUDA_HOME CPATH NVCC_PREPEND_FLAGS", persisted)
+ self.assertIn(
+ "390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec",
+ runtime,
+ )
+ self.assertIn("--require-hashes", runtime)
+ self.assertIn("d77aeab7f1bb52b615666fe178d26ced41fae08e", common)
+ self.assertIn("HEAD^{tree}", runtime)
+ self.assertIn("$PWD/.cx_backend/deepep-hybrid-", runtime)
+ self.assertIn("cx_materialize_backend_source deepep-hybrid", runtime)
+ self.assertIn("cx_materialize_backend_source deepep-v2", runtime)
+ self.assertIn("cx_deepep_hybrid_marker_content_sha256", runtime)
+ self.assertIn("cx_deepep_hybrid_cache_is_valid", runtime)
+ self.assertIn("cx_extension_pair_sha256", runtime)
+ self.assertIn(".collectivex-complete.tmp.", runtime)
+ self.assertNotIn("cx_fetch_revision", runtime)
+ self.assertIn("cx_fetch_revision", common)
+ self.assertIn("third-party/fmt", common)
+ hybrid = runtime[
+ runtime.index("cx_build_deepep_hybrid()"):
+ runtime.index("# UCCL EP")
+ ]
+ self.assertIn("cx_prepare_cuda_cccl", hybrid)
+ self.assertIn("unset NVSHMEM_DIR HYBRID_EP_MULTINODE USE_NIXL", hybrid)
+ self.assertNotIn("cx_prepare_deepep_toolchain", hybrid)
+ toolchain = runtime[
+ runtime.index("cx_prepare_deepep_toolchain()"):
+ runtime.index("cx_probe_deepep()")
+ ]
+ self.assertIn('overlay="$root/nvshmem-overlay"', toolchain)
+ self.assertIn("flock 8 || exit 1", toolchain)
+ self.assertIn('mv "$temporary" "$overlay" || exit 1', toolchain)
+ self.assertNotIn("/tmp/collectivex-nvshmem", toolchain)
+ jit = runtime[
+ runtime.index("cx_enable_deepep_v2_jit_reproducibility()"):
+ runtime.index("cx_probe_deepep_v2()")
+ ]
+ self.assertIn('cccl="${CX_CUDA_CCCL:-}"', jit)
+ self.assertNotIn("/usr/local/cuda*", jit)
+ self.assertIn("deepep-v2-cache-v2|$cpu|sm${arch/./}", runtime)
+ self.assertNotIn("deepep-v2-cache-v1|", runtime)
+ self.assertIn('base="${CX_BACKEND_CACHE_ROOT:-}"', runtime)
+ self.assertNotIn("${CX_BACKEND_CACHE_ROOT:-$PWD/.cx_backend}", runtime)
+ self.assertIn(
+ "recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2", runtime
+ )
+ self.assertNotIn("recipe=aot-source-date-epoch-arch-maxjobs16-v1", runtime)
+ self.assertNotIn("recipe=$source_sha", runtime)
+ self.assertIn("pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0", runtime)
+ self.assertIn("manual-unverified", runtime)
+ self.assertIn("cx_deepep_v2_content_sha256", runtime)
+ self.assertIn("DeepEP V2 cache validation failed", runtime)
+ probe = runtime[
+ runtime.index("cx_probe_deepep_v2()"):
+ runtime.index("cx_deepep_v2_content_sha256()")
+ ]
+ self.assertNotIn("torch.cuda.nccl.version", probe)
+ self.assertIn("ncclGetVersion", probe)
+ self.assertIn("runtime_version.value == 23004", probe)
+ self.assertIn("cx_nvidia_package_root nvidia-nccl-cu13 nccl", runtime)
+ self.assertIn("cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem", runtime)
+ self.assertNotIn("import os,nvidia.nccl", runtime)
+ self.assertNotIn("import os,nvidia.nvshmem", runtime)
+ self.assertIn(
+ 'export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"', runtime
+ )
+ self.assertIn('stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}"', runtime)
+ self.assertNotIn('export EP_JIT_CACHE_DIR="$root/jit"', runtime)
+ self.assertIn('EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"', runtime)
+ reference = (HERE / "ep_nccl.py").read_text()
+ self.assertIn("self.kernel_generation = contracts.collective_kernel_generation", reference)
+
+ def test_deepep_v2_cache_recovers_from_an_unpublished_partial_build(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ content_hash = "b" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ stale = root / "stale-partial-build"
+ stale.write_text("partial\n")
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"; expected_revision="$3"; expected_tree="$4"; expected_fmt="$5"
+ expected_content="$6"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_activate_deepep_v2() { export DEEPEP_V2_COMMIT="$expected_revision"; }
+ cx_prepare_deepep_toolchain() { export NVSHMEM_DIR=/tmp/cx-test-nvshmem; }
+ cx_probe_deepep_v2() { return 0; }
+ cx_deepep_v2_content_sha256() { printf '%s' "$expected_content"; }
+ cx_deepep_v2_cache_is_valid() {
+ test -f "$2" && test "$(wc -l < "$2" | tr -d ' ')" = 5
+ }
+ cx_enable_deepep_v2_jit_reproducibility() { return 0; }
+ cx_materialize_backend_source() { mkdir -p "$2/third-party/fmt"; }
+ flock() { return 0; }
+ python3() {
+ if [ "${1:-}" = -m ] && [ "${2:-}" = venv ]; then
+ mkdir -p "$3/bin"
+ printf '#!/bin/sh\nexit 0\n' > "$3/bin/python"
+ chmod 700 "$3/bin/python"
+ fi
+ return 0
+ }
+ git() {
+ case " $* " in
+ *' third-party/fmt rev-parse HEAD '*) printf '%s\n' "$expected_fmt" ;;
+ *' rev-parse HEAD^{tree} '*) printf '%s\n' "$expected_tree" ;;
+ *' show -s --format=%ct HEAD '*) printf '1\n' ;;
+ *) return 0 ;;
+ esac
+ }
+ cx_git_in_tree() { shift; git "$@"; }
+ cx_build_deepep_v2
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_", str(runtime), str(root),
+ COMMIT, TREE, FMT_COMMIT, content_hash,
+ ],
+ check=True,
+ )
+ self.assertFalse(stale.exists())
+ self.assertEqual(
+ marker.read_text(),
+ f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n",
+ )
+ self.assertEqual(list(root.glob(".collectivex-complete.tmp.*")), [])
+
+ def test_deepep_v2_published_cache_is_never_deleted_after_probe_failure(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ marker.write_text("published\n")
+ sentinel = root / "active-reader"
+ sentinel.write_text("active\n")
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_deepep_v2_cache_is_valid() { return 0; }
+ cx_activate_deepep_v2() { return 0; }
+ cx_prepare_deepep_toolchain() { return 0; }
+ cx_enable_deepep_v2_jit_reproducibility() { return 0; }
+ cx_probe_deepep_v2() { return 1; }
+ ! cx_build_deepep_v2
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime), str(root)],
+ check=True,
+ )
+ self.assertEqual(sentinel.read_text(), "active\n")
+ self.assertEqual(marker.read_text(), "published\n")
+
+ def test_deepep_v2_corrupt_published_cache_fails_without_reset(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ marker.write_text("corrupt\n")
+ sentinel = root / "active-reader"
+ sentinel.write_text("active\n")
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_deepep_v2_cache_is_valid() { return 1; }
+ flock() { return 0; }
+ ! cx_build_deepep_v2
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime), str(root)],
+ check=True,
+ )
+ self.assertEqual(sentinel.read_text(), "active\n")
+ self.assertEqual(marker.read_text(), "corrupt\n")
+
+ def test_deepep_v2_marker_requires_private_owned_cache_objects(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary) / "cache"
+ root.mkdir(mode=0o700)
+ (root / "source").mkdir(mode=0o700)
+ (root / "venv").mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ cache_key = "a" * 64
+ content_hash = "b" * 64
+ marker.write_text(
+ f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n"
+ )
+ root.chmod(0o2700)
+ marker.chmod(0o600)
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_v2_marker_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_v2_marker_content_sha256 "$2" "$3" "$4" "$5" "$6" "$7"
+ '''
+ args = [
+ "bash", "-c", command, "_", str(runtime), str(root), str(marker),
+ COMMIT, TREE, FMT_COMMIT, cache_key,
+ ]
+ valid = subprocess.run(args, text=True, capture_output=True, check=True)
+ self.assertEqual(valid.stdout, content_hash)
+ marker.chmod(0o644)
+ self.assertNotEqual(subprocess.run(args).returncode, 0)
+
+ def test_deepep_hybrid_marker_requires_a_private_regular_file(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary) / "cache"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ content_hash = "b" * 64
+ marker.write_text(f"{COMMIT}\n{TREE}\n{content_hash}\n")
+ root.chmod(0o2700)
+ marker.chmod(0o600)
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_hybrid_marker_content_sha256 "$2" "$3" "$4" "$5"
+ '''
+ args = [
+ "bash", "-c", command, "_", str(runtime), str(root), str(marker),
+ COMMIT, TREE,
+ ]
+ valid = subprocess.run(args, text=True, capture_output=True, check=True)
+ self.assertEqual(valid.stdout, content_hash)
+ marker_contract = runtime.read_text()
+ marker_contract = marker_contract[
+ marker_contract.index("cx_deepep_hybrid_marker_content_sha256()"):
+ marker_contract.index("cx_deepep_hybrid_cache_is_valid()")
+ ]
+ self.assertIn("marker_item.st_uid != root_item.st_uid", marker_contract)
+ self.assertNotIn("st_uid != os.getuid()", marker_contract)
+ marker.chmod(0o644)
+ self.assertNotEqual(subprocess.run(args).returncode, 0)
+
+ def test_deepep_v2_installed_content_digest_binds_every_distribution_file(self) -> None:
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ site = Path(temporary) / "venv" / "lib" / "python3.11" / "site-packages"
+ package = site / "deep_ep"
+ info = site / "deep_ep-2.0.0.dist-info"
+ package.mkdir(parents=True)
+ info.mkdir()
+ (package / "__init__.py").write_text("__version__ = '2.0.0'\n")
+ extension = package / "_C.so"
+ extension.write_bytes(b"extension-one")
+ (info / "METADATA").write_text(
+ "Metadata-Version: 2.1\nName: deep_ep\nVersion: 2.0.0\n"
+ )
+ (info / "RECORD").write_text(
+ "deep_ep/__init__.py,,\n"
+ "deep_ep/_C.so,,\n"
+ "deep_ep-2.0.0.dist-info/METADATA,,\n"
+ "deep_ep-2.0.0.dist-info/RECORD,,\n"
+ )
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_v2_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_v2_content_sha256
+ '''
+ env = {
+ **os.environ,
+ "PYTHONPATH": str(site),
+ "VIRTUAL_ENV": str(Path(temporary) / "venv"),
+ }
+ first = subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)],
+ text=True, capture_output=True, check=True, env=env,
+ ).stdout
+ extension.write_bytes(b"extension-two")
+ second = subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)],
+ text=True, capture_output=True, check=True, env=env,
+ ).stdout
+ self.assertRegex(first, r"^[0-9a-f]{64}$")
+ self.assertRegex(second, r"^[0-9a-f]{64}$")
+ self.assertNotEqual(first, second)
+ extension.unlink()
+ outside = Path(temporary) / "outside.so"
+ outside.write_bytes(b"outside")
+ extension.symlink_to(outside)
+ self.assertNotEqual(
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)], env=env,
+ ).returncode,
+ 0,
+ )
+
+ def test_uccl_content_identity_excludes_install_generated_files(self) -> None:
+ keep = load_uccl_function(
+ "_is_uccl_runtime_payload", {"PurePosixPath": PurePosixPath}
+ )
+ self.assertTrue(keep("uccl/ep.abi3.so"))
+ self.assertTrue(keep("uccl.libs/libnuma.so"))
+ self.assertFalse(keep("uccl/__pycache__/collective.cpython-312.pyc"))
+ self.assertFalse(keep("uccl-0.1.1.dist-info/RECORD"))
+
+ def test_uccl_dependency_versions_are_exact(self) -> None:
+ installed = dict(contracts.UCCL_DEPENDENCY_VERSIONS)
+ dependency_versions = load_uccl_function(
+ "_uccl_dependency_versions",
+ {
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(
+ version=lambda package: installed[package]
+ ),
+ },
+ )
+ self.assertEqual(dependency_versions(), contracts.UCCL_DEPENDENCY_VERSIONS)
+ installed["intervaltree"] = "3.2.0"
+ with self.assertRaisesRegex(RuntimeError, "differ from the v1 contract"):
+ dependency_versions()
+
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())
+ dependency_schema = schema["properties"]["implementation"]["properties"][
+ "provenance"
+ ]["properties"]["uccl_dependency_versions"]
+ self.assertFalse(dependency_schema["additionalProperties"])
+ self.assertEqual(
+ {
+ package: definition["const"]
+ for package, definition in dependency_schema["properties"].items()
+ },
+ contracts.UCCL_DEPENDENCY_VERSIONS,
+ )
+
+ def test_uccl_support_dependency_content_is_path_free(self) -> None:
+ with tempfile.TemporaryDirectory() as directory:
+ root = Path(directory)
+ source_entry = PurePosixPath("intervaltree/__init__.py")
+ cache_entry = PurePosixPath("intervaltree/__pycache__/__init__.pyc")
+ metadata_entry = PurePosixPath("intervaltree-3.1.0.dist-info/RECORD")
+ for entry in (source_entry, cache_entry, metadata_entry):
+ path = root / entry
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_bytes(entry.as_posix().encode())
+ distribution = types.SimpleNamespace(
+ files=[source_entry, cache_entry, metadata_entry],
+ locate_file=lambda item: root / item,
+ )
+ evidence_for = load_uccl_function(
+ "_python_dependency_evidence",
+ {
+ "Path": Path,
+ "PurePosixPath": PurePosixPath,
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(
+ distribution=lambda package: distribution
+ ),
+ },
+ )
+ evidence = evidence_for("intervaltree", "3.1.0")
+ self.assertEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="intervaltree-distribution",
+ name="intervaltree-3.1.0",
+ files=[(source_entry.as_posix(), root / source_entry)],
+ ),
+ )
+ self.assertNotIn(str(root), json.dumps(evidence))
+
+ def test_uccl_hashes_the_mapped_pinned_libcudart_without_exposing_paths(
+ self,
+ ) -> None:
+ with tempfile.TemporaryDirectory() as directory:
+ root = Path(directory)
+ entry = PurePosixPath("nvidia/cuda_runtime/lib/libcudart.so.12")
+ library = root / entry
+ library.parent.mkdir(parents=True)
+ library.write_bytes(b"pinned CUDA 12 runtime")
+ distribution = types.SimpleNamespace(
+ files=[entry],
+ locate_file=lambda item: root / item,
+ )
+ evidence_for = load_uccl_function(
+ "_loaded_libcudart_evidence",
+ {
+ "Path": Path,
+ "PurePosixPath": PurePosixPath,
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(
+ distribution=lambda package: distribution
+ ),
+ },
+ )
+ maps = root / "maps"
+ maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {library}\n")
+ evidence = evidence_for("12.9.79", maps)
+ self.assertEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="cuda-runtime",
+ name="nvidia-cuda-runtime-cu12-12.9.79",
+ files=[("libcudart.so", library)],
+ ),
+ )
+ self.assertNotIn(str(root), json.dumps(evidence))
+
+ unowned = root / "unowned" / library.name
+ unowned.parent.mkdir()
+ unowned.write_bytes(library.read_bytes())
+ maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {unowned}\n")
+ with self.assertRaisesRegex(RuntimeError, "not owned") as raised:
+ evidence_for("12.9.79", maps)
+ self.assertNotIn(str(root), str(raised.exception))
+
+ def test_private_runtime_logs_are_not_public_artifacts(self) -> None:
+ path = subprocess.check_output(
+ [
+ "bash", "-c", 'source "$1"; cx_private_log_path test', "_",
+ str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ env={**os.environ, "COLLECTIVEX_EXECUTION_ID": "contract-test"},
+ ).strip()
+ try:
+ log = Path(path)
+ self.assertEqual(stat.S_IMODE(log.stat().st_mode), 0o600)
+ self.assertEqual(stat.S_IMODE(log.parent.stat().st_mode), 0o700)
+ self.assertFalse(log.is_relative_to(ROOT))
+ finally:
+ shutil.rmtree(Path(path).parent, ignore_errors=True)
+
+    def test_private_runtime_logs_reject_traversal_and_symlinks(self) -> None:
+        common = str(ROOT / "runtime" / "common.sh")
+        for variable, value in (
+            ("COLLECTIVEX_EXECUTION_ID", ".."),
+            ("CX_TEST_LABEL", ".."),
+        ):
+            environment = {
+                **os.environ,
+                "COLLECTIVEX_EXECUTION_ID": "contract-adversarial",
+                "CX_TEST_LABEL": "test",
+                variable: value,
+            }
+            result = subprocess.run(
+                ["bash", "-c", 'source "$1"; cx_private_log_path "$CX_TEST_LABEL"', "_", common],
+                text=True,
+                capture_output=True,
+                env=environment,
+            )
+            self.assertNotEqual(result.returncode, 0)
+            self.assertNotIn(value, result.stderr)
+        # Both symlink scenarios plant links inside the per-user private root under /tmp.
+        private_root = Path(f"/tmp/inferencex-collectivex-{os.getuid()}")
+        private_root.mkdir(mode=0o700, exist_ok=True)
+        self.assertFalse(private_root.is_symlink())
+        os.chmod(private_root, 0o700)
+        with tempfile.TemporaryDirectory() as temporary:
+            target = Path(temporary)
+            tag = f"contract-symlink-{os.getpid()}"
+            link = private_root / tag
+            link.symlink_to(target, target_is_directory=True)
+            try:
+                result = subprocess.run(
+                    ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common],
+                    text=True,
+                    capture_output=True,
+                    env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag},
+                )
+                self.assertNotEqual(result.returncode, 0)
+                self.assertEqual(list(target.iterdir()), [])
+            finally:
+                link.unlink(missing_ok=True)
+            # Stay inside the TemporaryDirectory: the log-file symlink needs `target` to still exist.
+            tag = f"contract-log-symlink-{os.getpid()}"
+            directory = private_root / tag
+            directory.mkdir(mode=0o700)
+            target_file = target / "target"
+            target_file.write_text("unchanged")
+            log_link = directory / "test.log"
+            log_link.symlink_to(target_file)
+            try:
+                result = subprocess.run(
+                    ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common],
+                    text=True,
+                    capture_output=True,
+                    env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag},
+                )
+                self.assertNotEqual(result.returncode, 0)
+                self.assertEqual(target_file.read_text(), "unchanged")
+            finally:
+                log_link.unlink(missing_ok=True)
+                directory.rmdir()
+
+ def test_operator_config_failure_is_value_free(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ config = Path(temporary) / "operator.env"
+ config.write_text("printf 'private-config-token\\n' >&2\nfalse\n")
+ config.chmod(0o600)
+ result = subprocess.run(
+ ["bash", "-c",
+ 'export COLLECTIVEX_EXECUTION_ID="operator-failure-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; source \"$1\"; "
+ "cx_load_operator_config", "_",
+ str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertIn("runner-local configuration failed", result.stderr)
+ self.assertNotIn("private-config-token", result.stderr)
+
+ def test_ephemeral_operator_config_is_removed_after_source(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ config = Path(temporary) / "operator.env"
+ decoy = Path(temporary) / "decoy"
+ decoy.write_text("keep")
+ config.write_text(json.dumps(operator_config(Path(temporary) / "storage")))
+ config.chmod(0o600)
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'export COLLECTIVEX_EXECUTION_ID="operator-ephemeral-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; "
+ 'config="$COLLECTIVEX_OPERATOR_CONFIG"; source "$1"; '
+ 'cx_load_operator_config; test ! -e "$config"; '
+ 'test "$CX_PARTITION" = test; '
+ 'test -z "${COLLECTIVEX_OPERATOR_CONFIG+x}"',
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ "COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL": "1",
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ self.assertFalse(config.exists())
+ self.assertEqual(decoy.read_text(), "keep")
+
+ def test_operator_config_is_strict_per_runner_json(self) -> None:
+ command = (
+ 'source "$1"; export COLLECTIVEX_EXECUTION_ID="operator-config-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; "
+ 'test "$CX_PARTITION" = test; '
+ 'test -z "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT+x}"; '
+ 'test -z "${ENROOT_CACHE_PATH+x}"'
+ )
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ document = operator_config(root / "storage")
+ config = root / "operator.json"
+ config.write_text(json.dumps(document))
+ config.chmod(0o600)
+ for runner in capability.PLATFORMS:
+ with self.subTest(runner=runner):
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": runner,
+ "ENROOT_CACHE_PATH": "/private/stale-enroot-cache",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+
+ lock_dir = root / "amd-locks"
+ document["runners"]["mi355x"]["lock_dir"] = str(lock_dir)
+ config.write_text(json.dumps(document))
+ config.chmod(0o600)
+ canonical = subprocess.run(
+ [
+ "bash",
+ "-c",
+ 'source "$1"; export COLLECTIVEX_EXECUTION_ID="canonical-lock-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; "
+ 'cx_lock_canonical_gha_env mi355x; test "$CX_LOCK_DIR" = "$2"',
+ "_",
+ str(ROOT / "runtime" / "common.sh"),
+ str(lock_dir),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "mi355x",
+ "CX_SHARD_FILE": ".shards/test.json",
+ "CX_SHARD_SKU": "mi355x",
+ "CX_NODES": "1",
+ "CX_GPUS_PER_NODE": "8",
+ "COLLECTIVEX_CANONICAL_GHA": "1",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ "COLLECTIVEX_SOURCE_SHA": "a" * 40,
+ "GITHUB_ACTIONS": "true",
+ "GITHUB_RUN_ATTEMPT": "1",
+ "GITHUB_RUN_ID": "1",
+ "GITHUB_WORKSPACE": str(root.resolve()),
+ },
+ )
+ self.assertEqual(canonical.returncode, 0, canonical.stderr)
+
+ selected_only = {
+ "schema_version": 1,
+ "runners": {"h100-dgxc": document["runners"]["h100-dgxc"]},
+ }
+ result = subprocess.run(
+ [
+ "bash", "-c", command + '; test -z "${CX_STAGE_DIR+x}"', "_",
+ str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "CX_STAGE_DIR": "/private/stale-stage",
+ "ENROOT_CACHE_PATH": "/private/stale-enroot-cache",
+ "COLLECTIVEX_OPERATOR_CONFIG_LOADED": "1",
+ "COLLECTIVEX_OPERATOR_CONFIG_CONTENT": json.dumps(selected_only),
+ "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1",
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+
+ rejected = json.loads(json.dumps(document))
+ rejected["runners"]["h100-dgxc"]["shell"] = "private-command"
+ boolean_version = {**document, "schema_version": True}
+ for invalid in (rejected, boolean_version):
+ config.write_text(json.dumps(invalid))
+ config.chmod(0o600)
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertNotIn("private-command", result.stderr)
+
+ config.write_text(json.dumps(document))
+ config.chmod(0o644)
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/experimental/CollectiveX/tests/test_publisher.py b/experimental/CollectiveX/tests/test_publisher.py
new file mode 100644
index 0000000000..86b1e9607d
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_publisher.py
@@ -0,0 +1,2334 @@
+#!/usr/bin/env python3
+"""Focused end-to-end tests for the isolated CollectiveX publisher."""
+from __future__ import annotations
+
+import copy
+import hashlib
+import itertools
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+import tempfile
+import types
+import unittest
+from unittest import mock
+import zipfile
+
+HERE = Path(__file__).resolve().parent
+ROOT = HERE.parent
+sys.path[:0] = [str(ROOT), str(HERE)]
+
+import contracts # noqa: E402
+import identity # noqa: E402
+import publisher # noqa: E402
+import summarize # noqa: E402
+import sweep_matrix # noqa: E402
+
+
+RUN = {
+ "repository": "SemiAnalysisAI/InferenceX",
+ "run_id": "12345",
+ "run_attempt": 1,
+ "source_sha": "a" * 40,
+}
+
+
+def _unsupported_delivery(
+ root: Path, ordinals: tuple[int, ...] = (1,), run: dict = RUN,
+) -> tuple[Path, Path]:
+ matrix = sweep_matrix.resolve_matrix(backends="all")
+ wrapper = next(item for item in matrix["requested_cases"] if item["disposition"] == "unsupported")
+ matrix = {
+ "format": "collectivex.matrix.v1",
+ "schema_version": 1,
+ "requested_cases": [wrapper],
+ "include": [],
+ }
+ case = {key: value for key, value in wrapper["case"].items() if key != "case_id"}
+ artifact_name = f"cxunsupported-{run['run_id']}-{run['run_attempt']}"
+ git_run = {
+ "artifact": artifact_name,
+ "job": "setup",
+ "ref": "collectivex",
+ "repo": run["repository"],
+ "run_attempt": str(run["run_attempt"]),
+ "run_id": run["run_id"],
+ "source_sha": run["source_sha"],
+ }
+ allocation = {
+ "artifact": artifact_name,
+ "execution_id": f"{run['run_id']}_{run['run_attempt']}_unsupported",
+ "job": "setup",
+ "repo": run["repository"],
+ "run_attempt": str(run["run_attempt"]),
+ "run_id": run["run_id"],
+ "runner": "capability-resolver",
+ "source_sha": run["source_sha"],
+ }
+ matrix_path = root / "matrix.json"
+ artifact = root / artifact_name
+ artifact.mkdir()
+ matrix_path.write_text(json.dumps(matrix))
+ control_sha256 = hashlib.sha256(matrix_path.read_bytes()).hexdigest()
+ for ordinal in ordinals:
+ terminal = contracts.make_terminal_document(
+ allocation_factors=allocation, attempt_ordinal=ordinal, case=case,
+ case_factors={"case": case, "profile": identity.V1_CASE_PROFILE,
+ "sku": wrapper["sku"]},
+ control_sha256=control_sha256, failure_mode="capability",
+ generated_at="2026-07-04T00:00:00Z", git_run=git_run,
+ reason=wrapper["reason"], return_code=5, source="matrix-capability-resolver",
+ status="unsupported", expected_case_id=wrapper["case"]["case_id"],
+ )
+ (artifact / f"unsupported-{ordinal}.json").write_text(json.dumps(terminal))
+ return matrix_path, artifact
+
+
+def _args(
+ store: Path, matrix: Path, artifact: Path, run: dict = RUN
+) -> types.SimpleNamespace:
+ return types.SimpleNamespace(
+ store_root=str(store),
+ matrix=str(matrix),
+ artifact=[str(artifact)],
+ repository=run["repository"],
+ run_id=run["run_id"],
+ run_attempt=run["run_attempt"],
+ source_sha=run["source_sha"],
+ )
+
+
+def _ids(seed: str) -> tuple[str, str, str, str, str, str]:
+ case = identity.digest("case", {"seed": seed})
+ allocation = identity.allocation_id({"seed": seed})
+ attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1)
+ series = identity.series_id({"seed": seed})
+ point = identity.point_id(series=series, tokens_per_rank=8)
+ evidence = identity.evidence_id(
+ point=point, allocation=allocation, attempt=attempt, sample_sha256="b" * 64
+ )
+ return case, allocation, attempt, series, point, evidence
+
+
+def _component(scale: float = 1.0) -> dict:
+ latency = {"p50": 10.0 * scale, "p90": 12.0 * scale,
+ "p95": 14.0 * scale, "p99": 20.0 * scale}
+ logical_bytes = 100_000
+ return {
+ "origin": "measured",
+ "latency_us": latency,
+ "logical_bytes": logical_bytes,
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ name: logical_bytes / (value * 1000.0) for name, value in latency.items()
+ },
+ "sample_count": 512,
+ }
+
+
+def _hybrid_provenance(ep_size: int = 1) -> dict:
+ realized = {field: 1 for field in contracts.HYBRID_REALIZED_CONFIG_FIELDS}
+ for field in contracts.HYBRID_REALIZED_BOOL_FIELDS:
+ realized[field] = True
+ realized.update({
+ "num_of_experts_per_rank": 1,
+ "num_of_nodes": 1,
+ "num_of_ranks_per_node": ep_size,
+ "token_data_type": "UINT16",
+ })
+ kernel_keys = ["combine-key", "dispatch-key", "preprocess-key"]
+ return {
+ "backend_lineage": "deepep-hybrid", "branch": "hybrid-ep",
+ "deepep_commit": "a" * 40, "deepep_tree": "b" * 40,
+ "device_sms": 1,
+ "jit_kernel_keys": kernel_keys,
+ "jit_shared_objects": [
+ {
+ "kernel_key": key,
+ "rank_artifacts": [
+ {"bytes": 1, "rank": rank, "sha256": f"{index + 1:x}" * 64}
+ for rank in range(ep_size)
+ ],
+ }
+ for index, key in enumerate(kernel_keys)
+ ],
+ "loaded_libraries": [
+ {"name": "deep_ep_cpp", "role": "deepep-extension", "sha256": "4" * 64},
+ {"name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", "sha256": "5" * 64},
+ ],
+ "realized_config": realized,
+ "resource_mode": "tuned",
+ "tuned_source": "deepep-hybrid-configurer-autotune-v1",
+ }
+
+
+def _native_fixture(backend: str = "nccl-ep") -> tuple[dict, dict]:
+ # Build a synthetic single-rank "case-attempt" raw document together with the
+ # samples document it references, and return them as (raw, samples).
+ #
+ # backend: "nccl-ep" (the default) gets the inline NCCL provenance and the
+ # "nccl" kernel generation; any other value takes _hybrid_provenance() and
+ # the "hybrid" kernel generation.
+ #
+ # Identifiers and digests below are computed from the fixture values rather
+ # than hard-coded.
+ # SHA-256 hex digest of the canonical JSON encoding of `value`.
+ def digest(value: object) -> str:
+ return hashlib.sha256(contracts.canonical_json_bytes(value)).hexdigest()
+
+ # Scheduled case description; hashed into case_id together with the profile
+ # and the "fixture" sku.
+ scheduled = {
+ "backend": backend, "canonical": True, "eplb": False, "ep": 1,
+ "experts": 1, "gpus_per_node": 1, "hidden": 1, "ladder": "1", "nodes": 1,
+ "phase": "decode", "required_publication": "official", "routing": "uniform",
+ "samples_per_point": 512, "scale_up_domain": 1, "suite": "ep-core-v1",
+ "timing": "8:64:32", "topk": 1,
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ "workload": "deepseek-v3-v1",
+ }
+ case_factors = {"case": scheduled, "profile": identity.V1_CASE_PROFILE, "sku": "fixture"}
+ case_id = identity.digest("case", case_factors)
+ git_run = {
+ "artifact": "cxshard-fixture-999-1", "job": "sweep", "ref": "collectivex",
+ "repo": RUN["repository"], "run_attempt": "1", "run_id": "999",
+ "source_sha": RUN["source_sha"],
+ }
+ # The allocation factors reuse the artifact, job, repo and source_sha of git_run.
+ allocation_factors = {
+ "artifact": git_run["artifact"], "execution_id": "999_1_fixture",
+ "job": git_run["job"], "repo": git_run["repo"], "run_attempt": "1",
+ "run_id": "999", "runner": "fixture", "source_sha": git_run["source_sha"],
+ }
+ allocation_id = identity.allocation_id(allocation_factors)
+ attempt_id = identity.attempt_id(allocation=allocation_id, case=case_id, ordinal=1)
+ # Expected canonical trace for the same uniform, single-expert, single-rank
+ # shape; supplies the workload member id, its checksums and the routing
+ # hash/rows/weights used further down.
+ member_id, member_checksums, routing_hash, routing_rows, routing_weights = (
+ contracts._expected_canonical_trace(
+ "uniform", hidden=1, topk=1, logical_experts=1, physical_experts=1,
+ ep_size=1, tokens_per_rank=1, seed=67, eplb_enabled=False,
+ reference_tokens_per_rank=2048,
+ )
+ )
+ workload_id = identity.workload_id({
+ "members": [{"checksums": member_checksums, "workload_id": member_id}]
+ })
+ # Runtime fingerprint: its digest goes into series_factors and the dict itself
+ # is stored under raw["runtime_fingerprint"].
+ runtime = {
+ "accelerator_runtime": {"kind": "cuda", "version": "13.0"},
+ "collective_library": {"kind": "nccl", "version": "2.30.4"},
+ "device": {
+ "arch": "sm100", "compute_units": 1, "memory_bytes": 1,
+ "product": "Fixture GPU", "warp_size": 32,
+ },
+ "driver_version": "1", "framework": {"kind": "torch", "version": "2.10.0"},
+ "machine": "fixture", "python_version": "3.12", "vendor": "nvidia",
+ }
+ implementation_provenance = (
+ {
+ "backend": "nccl-ep", "backend_lineage": "nccl",
+ "collective_library": "nccl", "nccl_version": "2.30.4",
+ "reference_semantics": "fixture-v1",
+ }
+ if backend == "nccl-ep"
+ else _hybrid_provenance()
+ )
+ kernel_generation = "nccl" if backend == "nccl-ep" else "hybrid"
+ # The resource profile is projected from the provenance chosen above.
+ implementation = {
+ "kernel_generation": kernel_generation,
+ "name": backend,
+ "provenance": implementation_provenance,
+ "resource_profile": contracts.project_resource_profile(implementation_provenance),
+ }
+ public_config = contracts.public_series_config(
+ kernel_generation=implementation["kernel_generation"],
+ provenance=implementation_provenance,
+ resource_profile=implementation["resource_profile"],
+ resource_mode="tuned",
+ device_product=runtime["device"]["product"],
+ )
+ # Factors hashed into series_id. The implementation contract digest covers the
+ # implementation dict with its provenance replaced by
+ # contracts.series_provenance(...).
+ series_factors = {
+ "backend": backend, "case_id": case_id,
+ "image_digest": "sha256:" + "d" * 64,
+ "implementation_contract_sha256": digest({
+ **implementation,
+ "provenance": contracts.series_provenance(implementation_provenance),
+ }),
+ "public_config_sha256": contracts.public_series_config_sha256(public_config),
+ "routing_control_sha256": contracts.routing_implementation_control_sha256(
+ implementation
+ ),
+ "runtime_fingerprint_sha256": digest(runtime),
+ "source_sha": RUN["source_sha"], "squash_sha256": "e" * 64,
+ "workload_id": workload_id,
+ }
+ series_id = identity.series_id(series_factors)
+ point_id = identity.point_id(series=series_id, tokens_per_rank=1)
+ # 64 trials x 8 iterations = 512 constant-latency samples per component,
+ # matching sample_count.
+ sample_components = {
+ name: {
+ "availability": "measured", "sample_count": 512,
+ "trials": [[latency] * 8 for _ in range(64)],
+ }
+ for name, latency in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0))
+ }
+ # Digest over the components plus tokens_per_rank; also an input to the
+ # evidence id.
+ sample_sha = digest({"components": sample_components, "tokens_per_rank": 1})
+ evidence_id = identity.evidence_id(
+ point=point_id, allocation=allocation_id, attempt=attempt_id,
+ sample_sha256=sample_sha,
+ )
+ samples = {
+ "allocation_id": allocation_id, "attempt_id": attempt_id, "case_id": case_id,
+ "format": contracts.SAMPLES_FORMAT,
+ "points": [{
+ "components": sample_components, "evidence_id": evidence_id,
+ "point_id": point_id, "sample_sha256": sample_sha, "tokens_per_rank": 1,
+ }],
+ "sampling": {
+ "iterations_per_trial": 8, "reduction": "cross-rank-max-per-iteration",
+ "trials": 64,
+ },
+ "schema_version": 1, "series_id": series_id,
+ }
+ # Canonical encoding of the samples document; its length and SHA-256 are
+ # recorded in raw["sample_artifact"].
+ sample_bytes = contracts.canonical_json_bytes(samples)
+ # Oracle record with every check passing and zero error; deep-copied into both
+ # pre_timing and post_timing of the single rank below.
+ oracle = {
+ "atol": 0.02,
+ "checks": {name: True for name in (
+ "combine_values", "counts", "metadata", "multiplicity", "payload",
+ "source_set", "weights",
+ )},
+ "combine_weight_semantics": "unweighted-rank-sum",
+ "contract": "expert-specific-transform-v1", "dispatch_sha256": "1" * 64,
+ "max_absolute_error": 0.0, "max_elementwise_relative_error": 0.0,
+ "max_relative_error": 0.0, "max_weight_error": 0.0,
+ "order_sha256": "2" * 64, "ordering_contract": "fixture-order-v1",
+ "passed": True, "receive_count": 1, "rtol": 0.05,
+ }
+ # Same value for each of the four reported percentiles.
+ def pct(value: float) -> dict[str, float]:
+ return {name: value for name in ("p50", "p90", "p95", "p99")}
+
+ # Measured component entry with uniform percentiles and 512 samples.
+ def measured(value: float) -> dict:
+ return {
+ "availability": "measured", "origin": "measured",
+ "percentiles_us": pct(value), "sample_count": 512,
+ }
+ # The single measurement row, for tokens_per_rank = 1.
+ row = {
+ "anomalies": [],
+ "components": {
+ "combine": measured(20.0), "dispatch": measured(10.0),
+ # 30.0 = dispatch 10.0 + combine 20.0; derived, so it carries no samples.
+ "isolated_sum": {
+ "availability": "derived", "origin": "derived-percentile-sum",
+ "percentiles_us": pct(30.0), "sample_count": 0,
+ },
+ "roundtrip": measured(40.0),
+ },
+ "correctness": {
+ "contract": "expert-specific-transform-v1", "max_relative_error": 0.0,
+ "passed": True,
+ "rank_evidence": [{
+ "input_unchanged": True, "order_stable": True,
+ "post_timing": copy.deepcopy(oracle), "pre_timing": copy.deepcopy(oracle),
+ "rank": 0,
+ }],
+ "scope": "dispatch-metadata-and-transformed-combine",
+ },
+ "evidence_id": evidence_id, "global_tokens": 1,
+ "logical_bytes": {"combine": 2, "dispatch": 2, "roundtrip": 4},
+ "point_id": point_id,
+ "receive": {"max": 1, "mean": 1.0, "min": 1, "total": 1},
+ # Routing summary computed from the canonical trace rows/weights obtained above.
+ "routing": contracts._expected_routing_summary(
+ routing_rows,
+ routing_weights,
+ physical_experts=1,
+ ep_size=1,
+ tokens_per_rank=1,
+ gpus_per_node=1,
+ scale_up_domain=1,
+ ),
+ # [value] * 512 mirrors the 512 constant samples per component.
+ "sample_histograms": {
+ name: contracts._expected_histogram([value] * 512)
+ for name, value in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0))
+ },
+ "sample_sha256": sample_sha,
+ # 25_000.0 is consistent with global_tokens / roundtrip latency (1 / 40 us)
+ # — NOTE(review): confirm against the validator's formula.
+ "token_rate_at_latency_percentile": pct(25_000.0), "tokens_per_rank": 1,
+ }
+ raw = {
+ "case": {
+ "attempt_ordinal": 1, "backend": backend,
+ "eplb": {
+ "enabled": False, "imbalance_after": None, "imbalance_before": None,
+ "mapping_hash": None, "max_replicas": None, "num_logical_experts": 1,
+ "num_physical_experts": 1, "num_redundant": 0, "planner": None,
+ "reference_tokens_per_rank": None, "replicated_experts": 0,
+ },
+ "ep_size": 1, "mode": "normal", "phase": "decode",
+ "required_publication": "official", "resource_mode": "tuned", "runner": "fixture",
+ "shape": {
+ "activation_profile": "canonical-counter-source-v3", "dispatch_dtype": "bf16",
+ "eplb": False, "experts": 1, "experts_per_rank": 1, "hidden": 1,
+ "kernel_gen": kernel_generation, "num_logical_experts": 1,
+ "quant": {
+ "combine_accum_dtype": "fp32", "combine_input_dtype": "bf16",
+ "combine_output_dtype": "bf16", "combine_quant_mode": "none",
+ "scale_layout": None,
+ },
+ "routing": "uniform", "topk": 1,
+ },
+ "suite": "ep-core-v1", "workload_name": "deepseek-v3-v1",
+ },
+ "format": contracts.RAW_FORMAT, "generated_at": "2026-07-04T00:00:00Z",
+ "identity": {
+ "allocation_factors": allocation_factors, "allocation_id": allocation_id,
+ "attempt_id": attempt_id, "attempt_ordinal": 1, "case_factors": case_factors,
+ "case_id": case_id, "series_factors": series_factors, "series_id": series_id,
+ },
+ "implementation": implementation,
+ "measurement": {
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning": {
+ "contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "ladder": [1, 2, 4, 8, 16, 32, 64, 128],
+ "roundtrips_per_shape": 8,
+ },
+ "contract": "layout-and-dispatch-v1",
+ "rows": [row],
+ # 8 iterations x 64 trials = 512 samples per component, as in the samples document.
+ "sampling": {
+ "contract": "fixed-512-v1", "iterations_per_trial": 8,
+ "percentile_method": "nearest-rank",
+ "reduction": "cross-rank-max-per-iteration", "samples_per_component": 512,
+ "trials": 64, "warmup_iterations": 32,
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ },
+ "source_allocation": "even",
+ },
+ "outcome": {
+ "publication_status": "diagnostic", "reasons": [], "status": "success",
+ "validity": {
+ "anomaly_free": True, "execution_status": "complete",
+ "measurement_conformance": "conformant", "provenance_complete": True,
+ # Taken from the projected resource profile rather than hard-coded.
+ "resource_conformance": implementation["resource_profile"]["conformance_class"],
+ "sampling_conformance": "conformant",
+ "semantic_correctness": "pass",
+ "workload_identity": "consistent-across-ranks",
+ "workload_source": "canonical-serialized",
+ },
+ },
+ "provenance": {
+ "command": "run_ep", "distributed_launcher": "torchrun", "git_run": git_run,
+ # Same image digest and squash hash as hashed into series_factors.
+ "image": {
+ "arch": "amd64", "digest": "sha256:" + "d" * 64,
+ "digest_verified": True, "reference": "fixture:1", "squash_sha256": "e" * 64,
+ },
+ "redaction": "sanitized-v1",
+ },
+ "record_type": "case-attempt",
+ "runtime_fingerprint": runtime,
+ # Size and SHA-256 of the canonical JSON encoding of `samples`.
+ "sample_artifact": {
+ "bytes": len(sample_bytes), "format": contracts.SAMPLES_FORMAT,
+ "path": "samples.json", "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ },
+ "schema_version": 1,
+ "topology": {
+ "device_count": 1, "device_product": "Fixture GPU", "gpus_per_node": 1,
+ "nodes": 1, "placement": "packed",
+ "realized_placement": {
+ "gpus_per_node": 1, "nodes": 1, "ranks_per_node": 1,
+ "unique_local_ranks": True, "valid": True,
+ },
+ "scale_up_domain": 1, "topology_class": "fixture", "transport": "nvlink",
+ "world_size": 1,
+ },
+ "workload": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_identity": hashlib.sha256(
+ b"counter|seed=67|hidden=1|gen=collectivex-activation-counter-v3"
+ ).hexdigest(),
+ "activation_profile": "canonical-counter-source-v3", "cross_rank_consistent": True,
+ "manifest_checksums": {member_id: member_checksums}, "members": [member_id],
+ "routing_generator": "collectivex-routing-counter-v3", "source": "canonical-serialized",
+ "trace_hashes": [routing_hash],
+ # SHA-256 over the text of the single routing hash.
+ "trace_signature": hashlib.sha256(routing_hash.encode()).hexdigest(),
+ "workload_id": workload_id,
+ },
+ }
+ return raw, samples
+
+
+def _series(seed: str, backend: str, *, decision_grade: bool = False) -> tuple[dict, dict]:
+ # Build one public series item for `backend` plus its internal per-run
+ # metrics, returned as (item, internal).
+ #
+ # decision_grade=True marks the series "decision-grade", attaches three
+ # allocation ids and passes complete=True with p50/p99 ratios to the
+ # eligibility record; otherwise a single allocation and no ratios are used.
+ #
+ # The ids from _ids(seed) are provisional: the case, series, point, attempt and
+ # evidence ids are all recomputed from the finished item further down.
+ case, allocation, attempt, series_id, point_id, evidence = _ids(seed)
+ allocations = [identity.allocation_id({"seed": seed, "run": run}) for run in range(3)]
+ eligibility = publisher._eligibility_record(
+ allocations if decision_grade else [allocation],
+ complete=decision_grade,
+ correct=True,
+ measured=True,
+ stable_ordering=True,
+ p50_ratio=1.01 if decision_grade else None,
+ p99_ratio=1.02 if decision_grade else None,
+ )
+ # _component receives 1.0 for "deepep" and 1.2 for every other backend.
+ component = _component(1.0 if backend == "deepep" else 1.2)
+ item = {
+ "series_id": series_id,
+ "label": f"H100 / {backend}",
+ "status": "decision-grade" if decision_grade else "diagnostic",
+ "case_ids": [case],
+ "allocation_ids": allocations if decision_grade else [allocation],
+ "model": "deepseek-v3-v1",
+ "suite": "ep-core-v1",
+ "phase": "decode",
+ "publication_tier": "official",
+ # Only "nccl-ep" is given the "reference" role and a generation value.
+ "backend": {
+ "id": backend, "label": publisher.BACKEND_LABELS[backend],
+ "role": "reference" if backend == "nccl-ep" else "library",
+ "generation": "nccl" if backend == "nccl-ep" else None,
+ "version": "1.0"},
+ "build": {
+ "implementation_contract_sha256": hashlib.sha256(backend.encode()).hexdigest(),
+ # Placeholder; overwritten once the item exists (see below).
+ "public_config_sha256": "0" * 64,
+ "routing_control_sha256": hashlib.sha256(backend.encode()).hexdigest(),
+ "runtime_fingerprint_sha256": "3" * 64,
+ "image_digest": "sha256:" + "1" * 64,
+ "source_sha": "a" * 40,
+ "squash_sha256": "2" * 64,
+ },
+ "system": {
+ "sku": "h100-dgxc", "label": "NVIDIA H100", "vendor": "nvidia",
+ "topology_class": "h100-nvlink-island", "transport": "nvlink",
+ "world_size": 8, "ep_size": 8, "placement": "packed",
+ },
+ "workload": {
+ "workload_id": identity.workload_id({"shape": "shared"}),
+ "hidden": 7168, "top_k": 8, "experts": 256,
+ "routing": "uniform", "eplb": False,
+ "dispatch_dtype": "bf16", "combine_dtype": "bf16",
+ "activation_profile": "canonical-counter-source-v3",
+ },
+ "eplb": {
+ "enabled": False, "planner": None, "mapping_sha256": None,
+ "logical_experts": 256, "physical_experts": 256,
+ "redundant_experts": 0, "reference_tokens_per_rank": None,
+ "replicated_experts": 0, "max_replicas": None,
+ "imbalance_before": None, "imbalance_after": None,
+ },
+ "resource": {"mode": "tuned", "profile": "profile-1", "comm_units_kind": "sm", "configured_units": 24},
+ "measurement": {
+ "contract": "layout-and-dispatch-v1", "sampling_contract": "fixed-512-v1",
+ "iters": 8, "trials": 64, "warmups": 32, "samples_per_component": 512,
+ "headline_component": "roundtrip", "headline_percentile": "p99",
+ },
+ "points": [{
+ "point_id": point_id, "tokens_per_rank": 8, "global_tokens": 64,
+ "correct": True,
+ "routing": {
+ "fanout_mean": 4.0, "recv_tokens_max": 64,
+ "expert_load_cv": 0.5, "payload_rank_cv": 0.25,
+ "hotspot_ratio": 2.0, "empty_expert_count": 0,
+ "empty_rank_count": 0, "routed_copies": 256,
+ },
+ # Only the roundtrip component is populated for this point.
+ "components": {"dispatch": None, "combine": None,
+ "roundtrip": component, "isolated_sum": None},
+ # 64 matches global_tokens above; latency * 1e-6 presumably converts the
+ # latency_us values from microseconds to seconds — TODO confirm.
+ "roundtrip_token_rate_at_latency_percentile": {
+ name: 64 / (latency * 1e-6)
+ for name, latency in component["latency_us"].items()
+ },
+ "evidence_ids": [evidence],
+ }],
+ "eligibility": eligibility,
+ }
+ # Replace the placeholder config digest, then re-derive every identifier from
+ # the item's own public factors. Order matters: case -> series -> point ->
+ # attempt -> evidence.
+ item["build"]["public_config_sha256"] = contracts.public_series_config_sha256(
+ publisher._public_series_config(item)
+ )
+ case = identity.digest("case", publisher._public_case_factors(item))
+ item["case_ids"] = [case]
+ build = item["build"]
+ series_id = identity.series_id({
+ "backend": item["backend"]["id"],
+ "case_id": case,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build["implementation_contract_sha256"],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": item["workload"]["workload_id"],
+ })
+ item["series_id"] = series_id
+ point_id = identity.point_id(series=series_id, tokens_per_rank=8)
+ item["points"][0]["point_id"] = point_id
+ # The attempt and evidence ids use the single `allocation` from _ids(seed),
+ # including when decision_grade is set.
+ attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1)
+ evidence = identity.evidence_id(
+ point=point_id, allocation=allocation, attempt=attempt,
+ sample_sha256=hashlib.sha256(seed.encode()).hexdigest(),
+ )
+ item["points"][0]["evidence_ids"] = [evidence]
+ # Three runs keyed "0".."2". Run r scales the p50/p99 latency by (1 + r/100)
+ # and divides the payload rate by the same factor. The inner key 8 equals the
+ # point's tokens_per_rank.
+ runs = {
+ str(run): {8: {
+ "latency_us": {
+ statistic: component["latency_us"][statistic] * (1 + run / 100)
+ for statistic in ("p50", "p99")
+ },
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ statistic: component["logical_payload_rate_gbps_at_latency_percentile"][statistic] / (1 + run / 100)
+ for statistic in ("p50", "p99")
+ },
+ }}
+ for run in range(3)
+ }
+ internal = {"run_metrics": runs}
+ return item, internal
+
+
+def _dataset() -> dict:
+ item, _ = _series("one", "deepep")
+ case = item["case_ids"][0]
+ allocation = item["allocation_ids"][0]
+ attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1)
+ evidence = item["points"][0]["evidence_ids"][0]
+ return {
+ "format": "collectivex.public.v1", "schema_version": 1,
+ "generated_at": "2026-07-04T00:00:00Z", "source_bundle_ids": ["c" * 64],
+ "promotion": {
+ "status": "diagnostic", "reason": None, "matrix_id": "d" * 64,
+ "allocation_ids": [allocation], "required_allocations": 3,
+ "requested_cases": 1, "terminal_cases": 1,
+ "policy": "collectivex-decision-grade-v1",
+ },
+ "coverage": [{
+ "case_id": case, "label": "case", "required": True, "sku": "h100-dgxc",
+ "backend": "deepep", "phase": "decode", "disposition": "runnable",
+ "selected_attempt_id": attempt,
+ "outcome": "success", "failure_mode": None, "reason": None,
+ "attempt_ids": [attempt],
+ }],
+ "attempts": [{
+ "attempt_id": attempt,
+ "evidence": [{"evidence_id": evidence,
+ "point_id": item["points"][0]["point_id"]}],
+ "case_id": case,
+ "allocation_id": allocation, "run_id": "1", "run_attempt": 1,
+ "attempt_index": 1,
+ "selected": True, "outcome": "success", "failure_mode": None, "reason": None,
+ "series_id": item["series_id"],
+ "completed_at": "2026-07-04T00:00:00Z",
+ }],
+ "series": [item], "cohorts": [], "rankings": [], "recommendations": [],
+ "sensitivities": [],
+ }
+
+
+def _promoted_dataset() -> dict:
+ # Build a "promoted" public dataset from seven decision-grade series plus one
+ # unsupported coverage entry, then derive cohorts, rankings, recommendations
+ # and sensitivities with publisher.build_decisions.
+ #
+ # Each specification is (seed, backend, peer_sku, reference):
+ #   peer_sku  - when set, the system block is overridden from
+ #               publisher.capability.PLATFORMS[peer_sku]
+ #   reference - when true, the backend role is forced to "reference"
+ specifications = (
+ ("library-fast", "deepep", None, False),
+ ("library-slow", "uccl", None, False),
+ ("chip-peer", "deepep", "h200-dgxc", False),
+ ("system-one", "nccl-ep", None, True),
+ ("system-two", "nccl-ep", "h200-dgxc", True),
+ ("routing-zipf", "deepep", None, False),
+ ("routing-zipf-eplb", "deepep", None, False),
+ )
+ series = []
+ internals = {}
+ attempts = []
+ coverage = []
+ for seed, backend, peer_sku, reference in specifications:
+ item, internal = _series(seed, backend, decision_grade=True)
+ if peer_sku:
+ platform = publisher.capability.PLATFORMS[peer_sku]
+ item["system"].update({
+ "sku": peer_sku,
+ "label": f"NVIDIA {platform['product'].upper()}",
+ "topology_class": platform["topology_class"],
+ "transport": platform["transport"],
+ })
+ if reference:
+ item["backend"]["role"] = "reference"
+ # Seeds starting with "routing-zipf" move to the routing suite with zipf
+ # routing and the "comparable-experimental" tier.
+ if seed.startswith("routing-zipf"):
+ item["suite"] = "ep-routing-v1"
+ item["publication_tier"] = "comparable-experimental"
+ item["workload"]["routing"] = "zipf"
+ # The EPLB variant uses 288 physical experts = 256 logical + 32 redundant;
+ # the replica and imbalance figures come from the expected plan.
+ if seed == "routing-zipf-eplb":
+ item["workload"]["eplb"] = True
+ plan = contracts._expected_eplb_plan(
+ "zipf", 8, 256, 288, item["system"]["ep_size"], 67, 2048
+ )
+ item["eplb"] = {
+ "enabled": True, "planner": "greedy-rank-major-v1",
+ "mapping_sha256": contracts.eplb_contract.mapping_hash(plan),
+ "logical_experts": 256, "physical_experts": 288,
+ "redundant_experts": 32, "reference_tokens_per_rank": 2048,
+ "replicated_experts": plan["replicated_experts"],
+ "max_replicas": plan["max_replicas"],
+ "imbalance_before": plan["imbalance_before"],
+ "imbalance_after": plan["imbalance_after"],
+ }
+ item["build"]["implementation_contract_sha256"] = "8" * 64
+ # The item has been edited since _series built it, so the case id, public
+ # config digest, series id and point id are derived again from its current
+ # contents.
+ case_id = identity.digest("case", publisher._public_case_factors(item))
+ item["case_ids"] = [case_id]
+ build = item["build"]
+ build["public_config_sha256"] = contracts.public_series_config_sha256(
+ publisher._public_series_config(item)
+ )
+ item["series_id"] = identity.series_id({
+ "backend": item["backend"]["id"],
+ "case_id": case_id,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build["implementation_contract_sha256"],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": item["workload"]["workload_id"],
+ })
+ point = item["points"][0]
+ point["point_id"] = identity.point_id(
+ series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]
+ )
+ case_attempts = []
+ evidence_ids = []
+ # One successful attempt and one evidence id per allocation; run ids count
+ # from 1 and also seed the per-run sample digest.
+ for run_id, allocation_id in enumerate(item["allocation_ids"], 1):
+ attempt_id = identity.attempt_id(
+ allocation=allocation_id, case=case_id, ordinal=1
+ )
+ evidence_id = identity.evidence_id(
+ point=point["point_id"], allocation=allocation_id,
+ attempt=attempt_id,
+ sample_sha256=hashlib.sha256(f"{seed}-{run_id}".encode()).hexdigest(),
+ )
+ attempts.append({
+ "attempt_id": attempt_id,
+ "evidence": [{"evidence_id": evidence_id, "point_id": point["point_id"]}],
+ "case_id": case_id, "allocation_id": allocation_id,
+ "run_id": str(run_id), "run_attempt": 1,
+ "attempt_index": 1, "selected": True,
+ "outcome": "success", "failure_mode": None, "reason": None,
+ "series_id": item["series_id"],
+ "completed_at": "2026-07-04T00:00:00Z",
+ })
+ case_attempts.append(attempt_id)
+ evidence_ids.append(evidence_id)
+ point["evidence_ids"] = evidence_ids
+ # The coverage row selects the last attempt recorded for the case.
+ coverage.append({
+ "case_id": case_id, "label": seed, "required": True,
+ "sku": item["system"]["sku"], "backend": backend,
+ "phase": item["phase"], "disposition": "runnable",
+ "selected_attempt_id": case_attempts[-1], "outcome": "success",
+ "failure_mode": None, "reason": None, "attempt_ids": case_attempts,
+ })
+ series.append(item)
+ internals[item["series_id"]] = internal
+
+ # A required case that is unsupported: three attempts (runs 1..3), each
+ # without evidence and without a series id.
+ unsupported_case = identity.digest("case", {"seed": "planned-unsupported"})
+ unsupported_attempts = []
+ for run_id in range(1, 4):
+ allocation_id = identity.allocation_id(
+ {"seed": "planned-unsupported", "run": run_id}
+ )
+ attempt_id = identity.attempt_id(
+ allocation=allocation_id, case=unsupported_case, ordinal=1
+ )
+ attempts.append({
+ "attempt_id": attempt_id, "evidence": [], "case_id": unsupported_case,
+ "allocation_id": allocation_id, "run_id": str(run_id),
+ "run_attempt": 1,
+ "attempt_index": 1, "selected": True, "outcome": "unsupported",
+ "failure_mode": "capability", "reason": "backend-platform-unsupported",
+ "series_id": None, "completed_at": "2026-07-04T00:00:00Z",
+ })
+ unsupported_attempts.append(attempt_id)
+ coverage.append({
+ "case_id": unsupported_case, "label": "planned unsupported", "required": True,
+ "sku": "mi355x", "backend": "deepep", "phase": "decode",
+ "disposition": "unsupported", "selected_attempt_id": unsupported_attempts[-1],
+ "outcome": "unsupported", "failure_mode": "capability",
+ "reason": "backend-platform-unsupported", "attempt_ids": unsupported_attempts,
+ })
+ cohorts, rankings, recommendations, sensitivities = publisher.build_decisions(
+ series, internals
+ )
+ # Coverage, attempts and series are emitted sorted by their ids; the promotion
+ # block lists the distinct allocation ids across all attempts.
+ return {
+ "format": "collectivex.public.v1", "schema_version": 1,
+ "generated_at": "2026-07-04T00:00:00Z",
+ "source_bundle_ids": ["a" * 64, "b" * 64, "c" * 64],
+ "promotion": {
+ "status": "promoted", "reason": None,
+ "matrix_id": publisher.CANONICAL_FULL_V1_MATRIX_SHA256,
+ "allocation_ids": sorted({item["allocation_id"] for item in attempts}),
+ "required_allocations": 3, "requested_cases": len(coverage),
+ "terminal_cases": len(coverage), "policy": "collectivex-decision-grade-v1",
+ },
+ "coverage": sorted(coverage, key=lambda item: item["case_id"]),
+ "attempts": sorted(attempts, key=lambda item: item["attempt_id"]),
+ "series": sorted(series, key=lambda item: item["series_id"]),
+ "cohorts": cohorts, "rankings": rankings,
+ "recommendations": recommendations, "sensitivities": sensitivities,
+ }
+
+
+def _cohort_counts(dataset: dict) -> dict[str, int]:
+ return {
+ kind: sum(item["kind"] == kind for item in dataset["cohorts"])
+ for kind in ("library", "system", "routing")
+ }
+
+
+class PublisherTest(unittest.TestCase):
+ def test_terminal_allocation_and_source_status_are_bound(self) -> None:
+ # A terminal outcome document must stay consistent with its control document,
+ # its allocation factors and its (source, outcome) pairing; each tampering
+ # below is expected to be rejected.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ path = next(artifact.glob("*.json"))
+ terminal = contracts.strict_load(path)
+ # Baseline: the untouched document validates (the validator returns the same
+ # object) and delivery validation returns 1.
+ self.assertIs(contracts.validate_terminal_document(terminal), terminal)
+ self.assertEqual(
+ contracts.validate_delivery(
+ [str(path)], str(matrix), disposition="unsupported"
+ ),
+ 1,
+ )
+
+ # A missing or all-zero control digest must fail delivery validation; the
+ # original document is written back to `path` afterwards.
+ for control_sha256 in (None, "0" * 64):
+ broken = copy.deepcopy(terminal)
+ broken["provenance"]["control_sha256"] = control_sha256
+ path.write_text(json.dumps(broken))
+ with self.assertRaisesRegex(contracts.ContractError, "exact control document"):
+ contracts.validate_delivery(
+ [str(path)], str(matrix), disposition="unsupported"
+ )
+ path.write_text(json.dumps(terminal))
+
+ # Forge each allocation factor in turn. The allocation and attempt ids are
+ # recomputed from the forged factors, presumably so the rejection cannot be
+ # attributed to a stale digest.
+ for field in (
+ "artifact", "job", "repo", "run_attempt", "run_id", "source_sha", "runner"
+ ):
+ broken = copy.deepcopy(terminal)
+ broken["identity"]["allocation_factors"][field] = f"forged-{field}"
+ allocation_id = identity.allocation_id(
+ broken["identity"]["allocation_factors"]
+ )
+ broken["identity"]["allocation_id"] = allocation_id
+ broken["identity"]["attempt_id"] = identity.attempt_id(
+ allocation=allocation_id,
+ case=broken["identity"]["case_id"],
+ ordinal=broken["identity"]["attempt_ordinal"],
+ )
+ with self.assertRaisesRegex(
+ contracts.ContractError, "allocation factors differ"
+ ):
+ contracts.validate_terminal_document(broken)
+
+ # Changing only the outcome status, or only the provenance source, breaks the
+ # source/outcome pairing.
+ broken = copy.deepcopy(terminal)
+ broken["outcome"]["status"] = "failed"
+ with self.assertRaisesRegex(contracts.ContractError, "source and outcome"):
+ contracts.validate_terminal_document(broken)
+ broken = copy.deepcopy(terminal)
+ broken["provenance"]["source"] = "runtime-emitter"
+ with self.assertRaisesRegex(contracts.ContractError, "source and outcome"):
+ contracts.validate_terminal_document(broken)
+
+ # Replacement values for source, failure_mode and reason must be rejected by
+ # both the JSON schema and the contract validator.
+ for path_parts, replacement in (
+ (("provenance", "source"), "unregistered-producer"),
+ (("outcome", "failure_mode"), "unsupported-capability"),
+ (("outcome", "reason"), "unregistered-capability"),
+ ):
+ with self.subTest(path=path_parts):
+ broken = copy.deepcopy(terminal)
+ broken[path_parts[0]][path_parts[1]] = replacement
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(broken)
+
+ # Build a "runtime-emitter" failure document from the same case, with the
+ # runner set to the case sku. It must pass the schema, and swapping in the
+ # reason "backend-setup-failed" must be rejected.
+ runtime_allocation = copy.deepcopy(
+ terminal["identity"]["allocation_factors"]
+ )
+ runtime_allocation["runner"] = terminal["identity"]["case_factors"]["sku"]
+ runtime = contracts.make_terminal_document(
+ allocation_factors=runtime_allocation,
+ attempt_ordinal=1,
+ case=terminal["case"],
+ case_factors=terminal["identity"]["case_factors"],
+ control_sha256=terminal["provenance"]["control_sha256"],
+ failure_mode="setup",
+ generated_at=terminal["generated_at"],
+ git_run=terminal["provenance"]["git_run"],
+ reason="launcher-setup-failed",
+ return_code=1,
+ source="runtime-emitter",
+ status="failed",
+ expected_case_id=terminal["identity"]["case_id"],
+ )
+ publisher._schema("terminal-outcome-v1.schema.json", runtime)
+ broken = copy.deepcopy(runtime)
+ broken["outcome"]["reason"] = "backend-setup-failed"
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(broken)
+
+ def test_post_emit_demotion_uses_closed_failure_taxonomy(self) -> None:
+ # Demoting an already-emitted raw attempt with a non-zero return code must
+ # yield a failed terminal outcome whose failure_mode follows the table below
+ # and whose reason is "post-emit-distributed-command-failed".
+ raw, _ = _native_fixture()
+ # Return code -> expected failure_mode. Only 5 and 124 map to something other
+ # than "execution".
+ expected = {
+ 5: "runtime-identity",
+ 6: "execution",
+ 124: "timeout",
+ 137: "execution",
+ 134: "execution",
+ 9: "execution",
+ }
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ for return_code, failure_mode in expected.items():
+ with self.subTest(return_code=return_code):
+ # The raw fixture is written to its own file per return code before demotion.
+ path = root / f"attempt-{return_code}.json"
+ path.write_text(json.dumps(raw))
+ terminal = contracts.demote_raw_attempt(path, return_code)
+ self.assertEqual(
+ terminal["outcome"],
+ {
+ "failure_mode": failure_mode,
+ "reason": "post-emit-distributed-command-failed",
+ "return_code": return_code,
+ "status": "failed",
+ },
+ )
+ self.assertEqual(terminal["provenance"]["source"], "post-emit-command")
+ publisher._schema("terminal-outcome-v1.schema.json", terminal)
+
+ # A different reason string must be rejected by both the schema and the
+ # contract validator.
+ broken = copy.deepcopy(terminal)
+ broken["outcome"]["reason"] = "distributed-command-failed"
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(broken)
+
+ def test_artifact_safety_accepts_current_v1_fixtures(self) -> None:
+ # Runs the publication-safety assertion over the resolved all-backend sweep
+ # matrix, the native raw/samples fixtures and both dataset fixtures. There
+ # is no explicit assert: the test passes when the call does not raise.
+ raw, samples = _native_fixture()
+ publisher.artifact_safety.assert_publication_safe([
+ sweep_matrix.resolve_matrix(backends="all"),
+ raw,
+ samples,
+ _dataset(),
+ _promoted_dataset(),
+ ])
+
+ def test_native_raw_and_sample_schema_match_semantic_validator(self) -> None:
+ # The native fixture must pass both the JSON schemas and the semantic
+ # validator, and a series of single-field tamperings must each be rejected.
+ raw, samples = _native_fixture()
+ publisher._schema("samples-v1.schema.json", samples)
+ publisher._schema("raw-case-v1.schema.json", raw)
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)
+ # Round-trip through disk: the samples file is named "samples.json", the same
+ # path the fixture records under sample_artifact.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ (root / "samples.json").write_bytes(contracts.canonical_json_bytes(samples))
+ (root / "raw.json").write_bytes(contracts.canonical_json_bytes(raw))
+ self.assertEqual(contracts.load_raw_attempt(root / "raw.json"), raw)
+ # An unexpected top-level key in either document must fail that document's
+ # schema and the joint contract validation.
+ for target in ("raw", "samples"):
+ broken_raw, broken_samples = copy.deepcopy((raw, samples))
+ broken = broken_raw if target == "raw" else broken_samples
+ broken["unexpected"] = True
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema(
+ "raw-case-v1.schema.json" if target == "raw" else "samples-v1.schema.json",
+ broken,
+ )
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(broken_raw, broken_samples)
+ # Doubling the reported p50 token rate is rejected, naming the field.
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2
+ with self.assertRaisesRegex(contracts.ContractError, "token_rate_at_latency_percentile"):
+ contracts.validate_raw_document(tampered, samples)
+ # Changing the case shape's hidden size is rejected.
+ tampered = copy.deepcopy(raw)
+ tampered["case"]["shape"]["hidden"] = 2
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(tampered, samples)
+ # Changing configured_units (to 1 when unset, otherwise +1) is rejected as a
+ # resource profile mismatch.
+ tampered = copy.deepcopy(raw)
+ configured = tampered["implementation"]["resource_profile"]["configured_units"]
+ tampered["implementation"]["resource_profile"]["configured_units"] = (
+ 1 if configured is None else configured + 1
+ )
+ with self.assertRaisesRegex(contracts.ContractError, "resource profile"):
+ contracts.validate_raw_document(tampered, samples)
+ # Failing one pre-timing oracle check while "passed" stays True is rejected.
+ tampered = copy.deepcopy(raw)
+ oracle = tampered["measurement"]["rows"][0]["correctness"]["rank_evidence"][0]
+ oracle["pre_timing"]["checks"]["combine_values"] = False
+ with self.assertRaisesRegex(contracts.ContractError, "passed differs"):
+ contracts.validate_raw_document(tampered, samples)
+
+ def test_hybrid_raw_binds_realized_config_and_every_rank_artifact(self) -> None:
+ # The "deepep-hybrid" fixture validates as built. Changing any single
+ # realized_config field, or adding a rank artifact, must be rejected with the
+ # hybrid realized config/JIT evidence error.
+ raw, samples = _native_fixture("deepep-hybrid")
+ publisher._schema("raw-case-v1.schema.json", raw)
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)
+
+ # Each entry mutates the implementation provenance in place. The first five
+ # change one realized_config value; "rank_coverage" appends a rank-1 artifact
+ # to every JIT shared object.
+ mutations = {
+ "hidden_dim": lambda provenance: provenance["realized_config"].update(
+ hidden_dim=2
+ ),
+ "experts_per_rank": lambda provenance: provenance["realized_config"].update(
+ num_of_experts_per_rank=2
+ ),
+ "ranks_per_node": lambda provenance: provenance["realized_config"].update(
+ num_of_ranks_per_node=2
+ ),
+ "num_nodes": lambda provenance: provenance["realized_config"].update(
+ num_of_nodes=2
+ ),
+ "token_data_type": lambda provenance: provenance["realized_config"].update(
+ token_data_type="UINT8"
+ ),
+ "rank_coverage": lambda provenance: [
+ artifact["rank_artifacts"].append({
+ "bytes": 1, "rank": 1, "sha256": "9" * 64,
+ })
+ for artifact in provenance["jit_shared_objects"]
+ ],
+ }
+ for name, mutate in mutations.items():
+ with self.subTest(name=name):
+ # Work on a copy so every mutation starts from the untouched fixture.
+ changed = copy.deepcopy(raw)
+ mutate(changed["implementation"]["provenance"])
+ with self.assertRaisesRegex(
+ contracts.ContractError,
+ "DeepEP Hybrid realized config/JIT evidence differs",
+ ):
+ contracts.validate_raw_document(changed, samples)
+
+ def test_native_contract_recomputes_routing_receive_histograms_and_anomalies(self) -> None:
+ # Reported routing, receive, histogram and anomaly values must match what the
+ # contract expects. Edits that keep related fields in step with each other
+ # are still rejected.
+ raw, samples = _native_fixture()
+
+ # Doubling routed_copies together with the logical byte counts is rejected on
+ # routing.routed_copies.
+ tampered = copy.deepcopy(raw)
+ changed = tampered["measurement"]["rows"][0]
+ changed["routing"]["routed_copies"] *= 2
+ for name in ("combine", "dispatch", "roundtrip"):
+ changed["logical_bytes"][name] *= 2
+ with self.assertRaisesRegex(contracts.ContractError, "routing.routed_copies"):
+ contracts.validate_raw_document(tampered, samples)
+
+ # Changing payload_copies_per_rank along with a matching receive summary is
+ # rejected on payload_copies_per_rank.
+ tampered = copy.deepcopy(raw)
+ changed = tampered["measurement"]["rows"][0]
+ changed["routing"]["payload_copies_per_rank"] = [2]
+ changed["receive"] = {"max": 2, "mean": 2.0, "min": 2, "total": 2}
+ with self.assertRaisesRegex(contracts.ContractError, "payload_copies_per_rank"):
+ contracts.validate_raw_document(tampered, samples)
+
+ # A histogram count of 511 disagrees with the 512 samples in the fixture.
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["sample_histograms"]["roundtrip"][
+ "counts"
+ ] = [511]
+ with self.assertRaisesRegex(contracts.ContractError, "sample_histograms"):
+ contracts.validate_raw_document(tampered, samples)
+
+ # A fabricated anomaly is rejected: the recorded ratio (1.33) is below the
+ # recorded threshold (3.0), so the fixture data does not support it.
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["anomalies"] = [{
+ "type": "roundtrip_gt_isolated_sum",
+ "T": 1,
+ "roundtrip_p99": 40.0,
+ "isolated_sum_p99": 30.0,
+ "ratio": 1.33,
+ "threshold": 3.0,
+ }]
+ tampered["outcome"]["validity"]["anomaly_free"] = False
+ with self.assertRaisesRegex(contracts.ContractError, "anomalies"):
+ contracts.validate_raw_document(tampered, samples)
+
+ # Build a consistent anomalous pair: raise every roundtrip sample to 100.0,
+ # then re-derive everything that depends on the samples (sample digest,
+ # evidence id, percentiles, token rate, histogram, expected anomalies and the
+ # sample artifact size/digest).
+ anomalous_raw, anomalous_samples = copy.deepcopy((raw, samples))
+ sample_point = anomalous_samples["points"][0]
+ sample_point["components"]["roundtrip"]["trials"] = [
+ [100.0] * 8 for _ in range(64)
+ ]
+ sample_core = {
+ "components": sample_point["components"],
+ "tokens_per_rank": sample_point["tokens_per_rank"],
+ }
+ sample_sha = hashlib.sha256(
+ contracts.canonical_json_bytes(sample_core)
+ ).hexdigest()
+ point_id = sample_point["point_id"]
+ evidence_id = identity.evidence_id(
+ point=point_id,
+ allocation=anomalous_raw["identity"]["allocation_id"],
+ attempt=anomalous_raw["identity"]["attempt_id"],
+ sample_sha256=sample_sha,
+ )
+ sample_point.update({"sample_sha256": sample_sha, "evidence_id": evidence_id})
+ changed = anomalous_raw["measurement"]["rows"][0]
+ changed["sample_sha256"] = sample_sha
+ changed["evidence_id"] = evidence_id
+ changed["components"]["roundtrip"]["percentiles_us"] = {
+ name: 100.0 for name in ("p50", "p90", "p95", "p99")
+ }
+ # 10_000.0 is consistent with 1 global token / 100 us — NOTE(review): confirm
+ # against the validator's formula.
+ changed["token_rate_at_latency_percentile"] = {
+ name: 10_000.0 for name in ("p50", "p90", "p95", "p99")
+ }
+ changed["sample_histograms"]["roundtrip"] = contracts._expected_histogram(
+ [100.0] * 512
+ )
+ changed["anomalies"] = contracts._expected_anomalies(1, changed["components"])
+ anomalous_raw["outcome"]["validity"]["anomaly_free"] = False
+ sample_bytes = contracts.canonical_json_bytes(anomalous_samples)
+ anomalous_raw["sample_artifact"].update({
+ "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ # The fully re-derived anomalous document validates...
+ self.assertIs(
+ contracts.validate_raw_document(anomalous_raw, anomalous_samples),
+ anomalous_raw,
+ )
+ # ...and hiding the anomaly while claiming anomaly_free is rejected.
+ changed["anomalies"] = []
+ anomalous_raw["outcome"]["validity"]["anomaly_free"] = True
+ with self.assertRaisesRegex(contracts.ContractError, "anomalies"):
+ contracts.validate_raw_document(anomalous_raw, anomalous_samples)
+
+ def test_native_contract_rejects_every_schema_only_nested_mutation(self) -> None:
+ raw, samples = _native_fixture()
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)
+
+ def locate(document: object, path: tuple[object, ...]) -> object:
+ value = document
+ for part in path:
+ value = value[part] # type: ignore[index]
+ return value
+
+ def reject_raw(document: dict) -> None:
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("raw-case-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(document, samples)
+
+ required_fields = (
+ (("measurement", "rows", 0, "receive"), "total"),
+ (("measurement", "rows", 0, "routing"), "fanout_mean"),
+ (("measurement", "rows", 0, "routing", "source_token_stats"), "ranks"),
+ (("measurement", "rows", 0, "sample_histograms"), "roundtrip"),
+ (("measurement", "rows", 0, "sample_histograms", "roundtrip"), "n"),
+ (("runtime_fingerprint", "accelerator_runtime"), "kind"),
+ (("runtime_fingerprint", "collective_library"), "kind"),
+ (("runtime_fingerprint", "framework"), "kind"),
+ )
+ for path, required in required_fields:
+ with self.subTest(path=path, mutation="missing"):
+ broken = copy.deepcopy(raw)
+ del locate(broken, path)[required] # type: ignore[index]
+ reject_raw(broken)
+ with self.subTest(path=path, mutation="extra"):
+ broken = copy.deepcopy(raw)
+ locate(broken, path)["unexpected"] = None # type: ignore[index]
+ reject_raw(broken)
+
+ invalid_values = (
+ (("measurement", "rows", 0, "receive", "mean"), "one"),
+ (("measurement", "rows", 0, "routing", "fanout_mean"), "one"),
+ (("measurement", "rows", 0, "sample_histograms", "roundtrip", "bins"), 0),
+ (("provenance", "image", "arch"), "AMD64"),
+ (("runtime_fingerprint", "accelerator_runtime", "kind"), "rocm"),
+ )
+ for path, invalid in invalid_values:
+ with self.subTest(path=path, mutation="value"):
+ broken = copy.deepcopy(raw)
+ parent = locate(broken, path[:-1])
+ parent[path[-1]] = invalid # type: ignore[index]
+ reject_raw(broken)
+
+ def reject_samples(document: dict) -> None:
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("samples-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_samples_document(document)
+
+ for path, required in (
+ (("points", 0), "evidence_id"),
+ (("points", 0, "components"), "roundtrip"),
+ (("points", 0, "components", "roundtrip"), "trials"),
+ (("sampling",), "reduction"),
+ ):
+ with self.subTest(path=path, artifact="samples"):
+ broken = copy.deepcopy(samples)
+ del locate(broken, path)[required] # type: ignore[index]
+ reject_samples(broken)
+
+ def test_terminal_contract_and_schema_reject_the_same_shape_gaps(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ _, artifact = _unsupported_delivery(Path(temporary).resolve())
+ terminal = contracts.strict_load(next(artifact.glob("*.json")))
+ publisher._schema("terminal-outcome-v1.schema.json", terminal)
+ self.assertIs(contracts.validate_terminal_document(terminal), terminal)
+
+ def reject(document: dict) -> None:
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(document)
+
+ for path, invalid in (
+ (("outcome", "failure_mode"), "Not Safe"),
+ (("outcome", "reason"), "x" * 241),
+ (("provenance", "source"), "Not Safe"),
+ (("provenance", "git_run", "ref"), ""),
+ ):
+ with self.subTest(path=path):
+ broken = copy.deepcopy(terminal)
+ parent = broken
+ for part in path[:-1]:
+ parent = parent[part]
+ parent[path[-1]] = invalid
+ reject(broken)
+
+ def test_invalid_retry_is_quarantined_before_valid_retry_upload(self) -> None:
+ raw, samples = _native_fixture()
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ sample_bytes = contracts.canonical_json_bytes(samples)
+ bad = copy.deepcopy(raw)
+ bad["sample_artifact"].update({
+ "path": "a01.samples.json", "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ bad["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2
+ (root / "a01.samples.json").write_bytes(sample_bytes)
+ (root / "a01.json").write_bytes(contracts.canonical_json_bytes(bad))
+ self.assertTrue(contracts.quarantine_invalid_attempt(root / "a01.json"))
+ valid = copy.deepcopy(raw)
+ valid["sample_artifact"].update({
+ "path": "a02.samples.json", "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ (root / "a02.samples.json").write_bytes(sample_bytes)
+ (root / "a02.json").write_bytes(contracts.canonical_json_bytes(valid))
+ paths = sorted(str(path) for path in root.glob("*.json"))
+ self.assertEqual(contracts.validate_attempt_paths(paths), 1)
+ self.assertTrue((root / "a01.json.quarantine").is_file())
+ self.assertTrue((root / "a01.samples.json.quarantine").is_file())
+
+ def test_ingest_archives_first_and_publishes_latest_attempt(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ self.assertEqual(len(summarize.load_results(str(artifact), None, None)), 1)
+ result = publisher.ingest_command(_args(root / "store", matrix, artifact))
+ store = publisher.Store(root / "store")
+ pointer = store.verify_channel("latest-attempt")
+ self.assertEqual(result["status"], "accepted")
+ self.assertEqual(pointer["dataset"]["sha256"], result["dataset_sha256"])
+ self.assertTrue((store.incoming / result["incoming_id"] / "COMPLETE").is_file())
+ self.assertTrue((store.bundles / result["bundle_id"] / "COMPLETE").is_file())
+ self.assertFalse((store.channels / "dev-latest.json").exists())
+ self.assertEqual(os.stat(store.private).st_mode & 0o777, 0o700)
+ self.assertEqual(os.stat(store.public).st_mode & 0o777, 0o755)
+ self.assertEqual(os.stat(store.bundles / result["bundle_id"]).st_mode & 0o777, 0o500)
+ dataset_dir = store.datasets / result["dataset_sha256"]
+ self.assertEqual(os.stat(dataset_dir).st_mode & 0o777, 0o555)
+ self.assertEqual(os.stat(dataset_dir / "dataset.json").st_mode & 0o777, 0o444)
+
+ def test_repeated_ingest_is_content_idempotent(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ args = _args(root / "store", matrix, artifact)
+ first = publisher.ingest_command(args)
+ store = publisher.Store(root / "store")
+ pointer_before = (store.channels / "latest-attempt.json").read_bytes()
+ second = publisher.ingest_command(args)
+ self.assertEqual(second, first)
+ self.assertEqual(
+ (store.channels / "latest-attempt.json").read_bytes(), pointer_before
+ )
+ self.assertEqual(len(list(store.incoming.iterdir())), 1)
+ self.assertEqual(len(list(store.bundles.iterdir())), 1)
+ self.assertEqual(len(list(store.datasets.iterdir())), 1)
+ bundle = publisher.strict_load(
+ store.bundles / first["bundle_id"] / "bundle.json"
+ )
+ terminal = publisher.strict_load(next(artifact.glob("*.json")))
+ self.assertEqual(bundle["created_at"], terminal["generated_at"])
+
+ def test_dataset_is_invariant_to_bundle_argument_order(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ store_root = root / "store"
+ bundle_ids = []
+ for run_id in (9, 11, 10):
+ run = {**RUN, "run_id": str(run_id)}
+ delivery = root / f"run-{run_id}"
+ delivery.mkdir()
+ matrix, artifact = _unsupported_delivery(delivery, run=run)
+ result = publisher.ingest_command(
+ _args(store_root, matrix, artifact, run=run)
+ )
+ bundle_ids.append(result["bundle_id"])
+ datasets = [
+ publisher.build_dataset(
+ publisher.Store(store_root), order, promote=False,
+ )
+ for order in itertools.permutations(bundle_ids)
+ ]
+ self.assertTrue(all(dataset == datasets[0] for dataset in datasets[1:]))
+ self.assertEqual(datasets[0]["generated_at"], "2026-07-04T00:00:00Z")
+ selected = datasets[0]["coverage"][0]["selected_attempt_id"]
+ selected_attempt = next(
+ item for item in datasets[0]["attempts"]
+ if item["attempt_id"] == selected
+ )
+ self.assertEqual(selected_attempt["run_id"], "11")
+
+ def test_diagnostic_dataset_orders_reruns_by_run_attempt(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ store_root = root / "store"
+ bundle_ids = []
+ for run_attempt in (1, 2):
+ run = {**RUN, "run_attempt": run_attempt}
+ delivery = root / f"attempt-{run_attempt}"
+ delivery.mkdir()
+ matrix, artifact = _unsupported_delivery(delivery, run=run)
+ result = publisher.ingest_command(
+ _args(store_root, matrix, artifact, run=run)
+ )
+ bundle_ids.append(result["bundle_id"])
+ dataset = publisher.build_dataset(
+ publisher.Store(store_root), bundle_ids, promote=False
+ )
+ selected_id = dataset["coverage"][0]["selected_attempt_id"]
+ selected = next(
+ item for item in dataset["attempts"]
+ if item["attempt_id"] == selected_id
+ )
+ self.assertEqual(selected["run_attempt"], 2)
+
+ def test_promotion_requires_every_runnable_case_to_succeed_in_every_bundle(self) -> None:
+ cases = {
+ "runnable": {"_disposition": "runnable"},
+ "planned-unsupported": {"_disposition": "unsupported"},
+ }
+ bundles = []
+ for _ in range(3):
+ runnable = {
+ "identity": {"case_id": "runnable"},
+ "outcome": {"status": "success"},
+ }
+ unsupported = {
+ "identity": {"case_id": "planned-unsupported"},
+ "outcome": {"status": "unsupported"},
+ }
+ bundles.append({
+ "selected": {"runnable": runnable, "planned-unsupported": unsupported},
+ "documents": {"runnable": runnable, "planned-unsupported": unsupported},
+ })
+ publisher._require_runnable_promotion_success(bundles, cases)
+
+ for status in ("failed", "invalid", "unsupported", "diagnostic"):
+ with self.subTest(status=status):
+ broken = copy.deepcopy(bundles)
+ broken[1]["selected"]["runnable"]["outcome"]["status"] = status
+ with self.assertRaisesRegex(
+ publisher.PublisherError, "every runnable matrix case"
+ ):
+ publisher._require_runnable_promotion_success(broken, cases)
+
+ broken = copy.deepcopy(bundles)
+ broken[1]["documents"]["retry"] = {
+ "identity": {"case_id": "runnable"},
+ "outcome": {"status": "failed"},
+ }
+ with self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"):
+ publisher._require_runnable_promotion_success(broken, cases)
+
+ def test_promoted_public_dataset_rejects_failed_retry_history(self) -> None:
+ dataset = _promoted_dataset()
+ successful = next(
+ item for item in dataset["attempts"]
+ if item["outcome"] == "success"
+ )
+ failed = copy.deepcopy(successful)
+ old_attempt_id = successful["attempt_id"]
+ successful["attempt_index"] = 2
+ successful["attempt_id"] = identity.attempt_id(
+ allocation=successful["allocation_id"], case=successful["case_id"], ordinal=2
+ )
+ failed.update({
+ "attempt_id": old_attempt_id,
+ "attempt_index": 1,
+ "outcome": "failed",
+ "failure_mode": "execution",
+ "reason": "execution-failed",
+ "series_id": None,
+ "selected": False,
+ "evidence": [],
+ })
+ dataset["attempts"].append(failed)
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ coverage = next(
+ item for item in dataset["coverage"]
+ if item["case_id"] == failed["case_id"]
+ )
+ coverage["attempt_ids"] = [
+ successful["attempt_id"] if value == old_attempt_id else value
+ for value in coverage["attempt_ids"]
+ ]
+ coverage["attempt_ids"].append(failed["attempt_id"])
+ coverage["attempt_ids"].sort()
+ if coverage["selected_attempt_id"] == old_attempt_id:
+ coverage["selected_attempt_id"] = successful["attempt_id"]
+
+ fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"])
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_unselected_success_does_not_reference_an_unpublished_series(self) -> None:
+ raw, _ = _native_fixture()
+ retained = publisher._public_attempt(raw, selected=False)
+ selected = publisher._public_attempt(raw, selected=True)
+ self.assertEqual(retained["outcome"], "success")
+ self.assertIsNone(retained["series_id"])
+ self.assertEqual(selected["series_id"], raw["identity"]["series_id"])
+
+ def test_public_dataset_selects_latest_derived_retry(self) -> None:
+ dataset = _dataset()
+ first = dataset["attempts"][0]
+ second = copy.deepcopy(first)
+ second.update({
+ "attempt_id": identity.attempt_id(
+ allocation=first["allocation_id"], case=first["case_id"], ordinal=2
+ ),
+ "attempt_index": 2,
+ "selected": False,
+ "series_id": None,
+ "evidence": [],
+ })
+ dataset["attempts"].append(second)
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"].append(second["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"].sort()
+ with self.assertRaisesRegex(publisher.PublisherError, "select the latest retry"):
+ publisher.validate_public_dataset(dataset)
+
+ second["attempt_id"] = identity.digest("attempt", {"not": "derived"})
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"] = [
+ item["attempt_id"] for item in dataset["attempts"]
+ ]
+ with self.assertRaisesRegex(publisher.PublisherError, "retry identity differs"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_promotion_requires_an_eligible_cohort_for_every_comparison_kind(self) -> None:
+ stable_fast, stable_fast_internal = _series(
+ "stable-fast", "deepep", decision_grade=True
+ )
+ stable_slow, stable_slow_internal = _series(
+ "stable-slow", "uccl", decision_grade=True
+ )
+ unstable_fast, unstable_fast_internal = _series(
+ "unstable-fast", "deepep", decision_grade=True
+ )
+ unstable_slow, unstable_slow_internal = _series(
+ "unstable-slow", "uccl", decision_grade=True
+ )
+ unstable_fast["phase"] = unstable_slow["phase"] = "prefill"
+ unstable_fast["series_id"] = identity.series_id({"test": "unstable-fast"})
+ unstable_slow["series_id"] = identity.series_id({"test": "unstable-slow"})
+ for statistic in ("p50", "p99"):
+ unstable_slow_internal["run_metrics"]["1"][8]["latency_us"][statistic] = (
+ unstable_fast_internal["run_metrics"]["1"][8]["latency_us"][statistic]
+ / 2
+ )
+ unstable_slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = (
+ unstable_fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic]
+ * 2
+ )
+ series = [stable_fast, stable_slow, unstable_fast, unstable_slow]
+ internals = {
+ stable_fast["series_id"]: stable_fast_internal,
+ stable_slow["series_id"]: stable_slow_internal,
+ unstable_fast["series_id"]: unstable_fast_internal,
+ unstable_slow["series_id"]: unstable_slow_internal,
+ }
+ cohorts, _, _, _ = publisher.build_decisions(series, internals)
+ eligible = [item for item in cohorts if item["eligibility"]["decision_grade"]]
+ ineligible = [item for item in cohorts if not item["eligibility"]["decision_grade"]]
+ self.assertEqual({item["kind"] for item in eligible}, {"library"})
+ self.assertTrue(ineligible)
+ anchor_series = [
+ {
+ "series_id": name,
+ "workload": {"routing": routing, "eplb": eplb},
+ "build": {"implementation_contract_sha256": "1" * 64},
+ }
+ for name, routing, eplb in (
+ ("uniform", "uniform", False),
+ ("zipf", "zipf", False),
+ ("zipf-eplb", "zipf", True),
+ )
+ ]
+ required = eligible + [
+ {
+ "kind": kind,
+ "eligibility": {"decision_grade": True},
+ **({"series_ids": [item["series_id"] for item in anchor_series]}
+ if kind == "routing" else {}),
+ }
+ for kind in publisher.REQUIRED_COHORT_KINDS
+ if kind != "library"
+ ]
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", {}
+ ), mock.patch.object(
+ publisher, "_expected_chip_cohort_count", return_value=1
+ ):
+ publisher._require_promotion_cohorts(
+ required + ineligible, anchor_series
+ )
+ for kind in publisher.REQUIRED_COHORT_KINDS:
+ with self.subTest(missing_kind=kind), self.assertRaisesRegex(
+ publisher.PublisherError, rf"cohort kinds:.*{kind}"
+ ):
+ publisher._require_promotion_cohorts([
+ item for item in required + ineligible
+ if item["kind"] != kind or not item["eligibility"]["decision_grade"]
+ ], anchor_series)
+
+ def test_promotion_requires_exact_counts_and_routing_anchors(self) -> None:
+ dataset = _promoted_dataset()
+ counts = _cohort_counts(dataset)
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+ routing = next(
+ item for item in dataset["cohorts"] if item["kind"] == "routing"
+ )
+ eplb = next(
+ item for item in dataset["series"]
+ if item["series_id"] in routing["series_ids"]
+ and item["workload"]["eplb"]
+ )
+ eplb["workload"]["eplb"] = False
+ with self.assertRaisesRegex(publisher.PublisherError, "exact uniform"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ dataset = _promoted_dataset()
+ routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+ zipf = next(
+ item for item in dataset["series"]
+ if item["series_id"] in routing["series_ids"]
+ and item["workload"]["routing"] == "zipf"
+ and not item["workload"]["eplb"]
+ )
+ zipf["build"]["implementation_contract_sha256"] = "f" * 64
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "identical off-EPLB"):
+ publisher._require_promotion_cohorts(dataset["cohorts"], dataset["series"])
+
+ wrong_counts = {**counts, "library": counts["library"] + 1}
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", wrong_counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "exactly"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ def test_promotion_requires_every_derived_chip_cohort_to_be_stable(self) -> None:
+ dataset = _promoted_dataset()
+ chip = next(item for item in dataset["cohorts"] if item["kind"] == "chip")
+ self.assertEqual(
+ publisher._expected_chip_cohort_count(dataset["series"]),
+ sum(item["kind"] == "chip" for item in dataset["cohorts"]),
+ )
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset)
+ ):
+ missing = [item for item in dataset["cohorts"] if item is not chip]
+ with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"):
+ publisher._require_promotion_cohorts(missing, dataset["series"])
+
+ chip["eligibility"]["decision_grade"] = False
+ with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ def test_promotion_rejects_more_than_three_bundles(self) -> None:
+ bundles = {
+ str(run_id): {
+ "id": str(run_id), "cases": [],
+ "manifest": {
+ "matrix": {"sha256": publisher.CANONICAL_FULL_V1_MATRIX_SHA256},
+ "run": {"run_id": str(run_id), "run_attempt": 1},
+ },
+ }
+ for run_id in range(1, 5)
+ }
+ with mock.patch.object(
+ publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id]
+ ), self.assertRaisesRegex(publisher.PublisherError, "three independent"):
+ publisher.build_dataset(object(), list(bundles), promote=True)
+
+ dataset = _promoted_dataset()
+ dataset["source_bundle_ids"].append("d" * 64)
+ counts = _cohort_counts(dataset)
+ with mock.patch.object(
+ publisher,
+ "CANONICAL_FULL_V1_CASE_CATALOG_SHA256",
+ publisher._case_disposition_catalog_sha256(dataset["coverage"]),
+ ), mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "complete coverage"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_standalone_promotion_binds_matrix_and_requested_dispositions(self) -> None:
+ dataset = _promoted_dataset()
+ fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"])
+ with self.assertRaisesRegex(
+ publisher.PublisherError, "canonical case/disposition catalog"
+ ):
+ publisher.validate_public_dataset(dataset)
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), mock.patch.object(
+ publisher,
+ "REQUIRED_PROMOTION_COHORT_COUNTS",
+ _cohort_counts(dataset),
+ ):
+ publisher.validate_public_dataset(dataset)
+
+ diagnostic = copy.deepcopy(dataset)
+ item = diagnostic["series"][0]
+ item["status"] = "diagnostic"
+ item["eligibility"].update({
+ "decision_grade": False,
+ "stable_p50": False,
+ "p50_max_min_ratio": 1.20,
+ "reasons": ["unstable-p50"],
+ })
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), mock.patch.object(
+ publisher,
+ "REQUIRED_PROMOTION_COHORT_COUNTS",
+ _cohort_counts(dataset),
+ ), self.assertRaisesRegex(
+ publisher.PublisherError, "unstable or incomplete required series"
+ ):
+ publisher.validate_public_dataset(diagnostic)
+
+ broken = copy.deepcopy(dataset)
+ broken["promotion"]["matrix_id"] = "d" * 64
+ with self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"):
+ publisher.validate_public_dataset(broken)
+
+ for original, replacement in (("runnable", "unsupported"),
+ ("unsupported", "runnable")):
+ with self.subTest(original=original):
+ broken = copy.deepcopy(dataset)
+ item = next(
+ coverage for coverage in broken["coverage"]
+ if coverage["disposition"] == original
+ )
+ item["disposition"] = replacement
+ with mock.patch.object(
+ publisher,
+ "CANONICAL_FULL_V1_CASE_CATALOG_SHA256",
+ publisher._case_disposition_catalog_sha256(broken["coverage"]),
+ ), self.assertRaisesRegex(
+ publisher.PublisherError, "requested dispositions"
+ ):
+ publisher.validate_public_dataset(broken)
+
+ def test_workflow_matrix_and_catalog_digests_do_not_drift(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ matrix_path = Path(temporary) / "matrix_full.json"
+ result = subprocess.run(
+ [
+ sys.executable, str(ROOT / "sweep_matrix.py"),
+ "--suites", "all", "--max-cases", "128",
+ "--backends", "all", "--out", str(matrix_path),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ self.assertEqual(
+ hashlib.sha256(matrix_path.read_bytes()).hexdigest(),
+ publisher.CANONICAL_FULL_V1_MATRIX_SHA256,
+ )
+ matrix = contracts.strict_load(matrix_path)
+ coverage = [
+ {
+ "case_id": item["case"]["case_id"],
+ "disposition": item["disposition"],
+ }
+ for item in matrix["requested_cases"]
+ ]
+ self.assertEqual(
+ publisher._case_disposition_catalog_sha256(coverage),
+ publisher.CANONICAL_FULL_V1_CASE_CATALOG_SHA256,
+ )
+ self.assertEqual(
+ (
+ len(matrix["include"]), len(coverage),
+ sum(item["disposition"] == "runnable" for item in coverage),
+ sum(item["disposition"] == "unsupported" for item in coverage),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ ),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ if item["disposition"] == "runnable"
+ ),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ if item["disposition"] == "unsupported"
+ ),
+ ),
+ (38, 360, 228, 132, 840, 532, 308),
+ )
+ library: dict[tuple, set[str]] = {}
+ system: dict[tuple, set[str]] = {}
+ routing: dict[tuple, list[tuple[str, bool]]] = {}
+ for requested in matrix["requested_cases"]:
+ if requested["disposition"] != "runnable":
+ continue
+ case = requested["case"]
+ shape = tuple(
+ case[field]
+ for field in ("workload", "hidden", "topk", "experts", "ep", "phase")
+ )
+ route = (case["routing"], case["eplb"])
+ if case["backend"] != "nccl-ep":
+ library.setdefault((requested["sku"], shape, route), set()).add(
+ case["backend"]
+ )
+ else:
+ system.setdefault((shape, route), set()).add(requested["sku"])
+ routing.setdefault(
+ (requested["sku"], case["backend"], shape), []
+ ).append(route)
+ anchors = {("uniform", False), ("zipf", False), ("zipf", True)}
+ self.assertEqual(
+ {
+ "library": sum(len(variants) >= 2 for variants in library.values()),
+ "system": sum(len(variants) >= 2 for variants in system.values()),
+ "routing": sum(
+ len(variants) == 3 and set(variants) == anchors
+ for variants in routing.values()
+ ),
+ },
+ publisher.REQUIRED_PROMOTION_COHORT_COUNTS,
+ )
+
+ def test_build_promotion_requires_canonical_full_matrix(self) -> None:
+ bundles = {
+ str(run_id): {
+ "id": str(run_id), "cases": [],
+ "manifest": {
+ "matrix": {"sha256": "d" * 64},
+ "run": {"run_id": str(run_id), "run_attempt": 1},
+ },
+ }
+ for run_id in range(1, 4)
+ }
+ with mock.patch.object(
+ publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id]
+ ), self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"):
+ publisher.build_dataset(object(), list(bundles), promote=True)
+
+ def test_rejection_updates_latest_but_never_dev_latest(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ sentinel = b"existing-promoted-pointer\n"
+ (store.channels / "dev-latest.json").write_bytes(sentinel)
+ (artifact / "unknown.json").write_text('{"format":"unknown"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ self.assertEqual((store.channels / "dev-latest.json").read_bytes(), sentinel)
+ pointer = store.verify_channel("latest-attempt")
+ dataset = publisher.strict_load(store.public / pointer["dataset"]["path"])
+ self.assertEqual(dataset["promotion"]["status"], "quarantined")
+ self.assertTrue(any(store.quarantine.iterdir()))
+
+ def test_repeated_rejection_is_content_idempotent(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ (artifact / "unknown.json").write_text('{"format":"unknown"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ pointer = (store.channels / "latest-attempt.json").read_bytes()
+ counts = tuple(
+ len(list(path.iterdir()))
+ for path in (store.incoming, store.quarantine, store.datasets)
+ )
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ self.assertEqual((store.channels / "latest-attempt.json").read_bytes(), pointer)
+ self.assertEqual(
+ tuple(
+ len(list(path.iterdir()))
+ for path in (store.incoming, store.quarantine, store.datasets)
+ ),
+ counts,
+ )
+
+ def test_distinct_rejections_advance_latest_attempt(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ unknown = artifact / "unknown.json"
+ unknown.write_text('{"format":"unknown-one"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ first = (store.channels / "latest-attempt.json").read_bytes()
+ unknown.write_text('{"format":"unknown-two"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ second = (store.channels / "latest-attempt.json").read_bytes()
+ self.assertNotEqual(second, first)
+ self.assertEqual(len(list(store.datasets.iterdir())), 2)
+
+ def test_zip_traversal_is_rejected(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "bad.zip"
+ with zipfile.ZipFile(archive, "w") as handle:
+ handle.writestr("../escape.json", "{}")
+ with self.assertRaisesRegex(publisher.PublisherError, "escapes"):
+ publisher.extract_archive(archive, root / "out")
+
+ def test_store_and_directory_archive_reject_symlinks(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ real = root / "real"
+ real.mkdir()
+ alias = root / "alias"
+ alias.symlink_to(real, target_is_directory=True)
+ with self.assertRaisesRegex(publisher.PublisherError, "symlinked parent"):
+ publisher.Store(alias / "store")
+ self.assertFalse((real / "store").exists())
+ artifact = root / f"cxunsupported-{RUN['run_id']}-{RUN['run_attempt']}"
+ artifact.mkdir()
+ target = root / "target.json"
+ target.write_text("{}")
+ (artifact / "linked.json").symlink_to(target)
+ with self.assertRaisesRegex(publisher.PublisherError, "symlink"):
+ publisher._archive_download_directory(artifact, root / "artifact.zip")
+
+ def test_offline_caller_metadata_is_validated_before_store_creation(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store_root = root / "store"
+ args = _args(store_root, matrix, artifact)
+ args.run_id = "0"
+ with self.assertRaisesRegex(publisher.PublisherError, "run-id"):
+ publisher.ingest_command(args)
+ self.assertFalse(store_root.exists())
+
+ promote = types.SimpleNamespace(
+ store_root=str(store_root), bundle=["not-a-digest"]
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "bundle IDs"):
+ publisher.promote_command(promote)
+ self.assertFalse(store_root.exists())
+ with self.assertRaisesRegex(publisher.PublisherError, "absolute path"):
+ publisher._store_from_args(types.SimpleNamespace(store_root="relative-store"))
+
+ def test_store_rejects_group_or_world_writable_root(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve() / "unsafe-store"
+ root.mkdir()
+ root.chmod(0o772)
+ with self.assertRaisesRegex(publisher.PublisherError, "group/world writable"):
+ publisher.Store(root)
+
+ def test_retry_ordinals_must_be_contiguous_from_one(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root, (1, 3))
+ with self.assertRaisesRegex(publisher.PublisherError, "contiguous ordinals"):
+ publisher.ingest_command(_args(root / "store", matrix, artifact))
+
+ def test_delivery_rejects_extra_archive_and_non_native_member(self) -> None:
+     # Three tampered deliveries: a surplus archive, a stray member, a swapped reason.
+     with tempfile.TemporaryDirectory() as scratch:
+         base = Path(scratch).resolve()
+         matrix, artifact = _unsupported_delivery(base)
+         surplus = base / f"cxshard-extra-{RUN['run_id']}-{RUN['run_attempt']}"
+         surplus.mkdir()
+         (surplus / "extra.json").write_text("{}")
+         ingest = _args(base / "store-extra", matrix, artifact)
+         ingest.artifact.append(str(surplus))
+         with self.assertRaisesRegex(publisher.PublisherError, "archive set"):
+             publisher.ingest_command(ingest)
+     with tempfile.TemporaryDirectory() as scratch:
+         base = Path(scratch).resolve()
+         matrix, artifact = _unsupported_delivery(base)
+         (artifact / "notes.txt").write_text("not native evidence")
+         with self.assertRaisesRegex(publisher.PublisherError, "unconsumed"):
+             publisher.ingest_command(_args(base / "store-member", matrix, artifact))
+     with tempfile.TemporaryDirectory() as scratch:
+         base = Path(scratch).resolve()
+         matrix, artifact = _unsupported_delivery(base)
+         record = next(artifact.glob("*.json"))
+         terminal = json.loads(record.read_text())
+         recorded = terminal["outcome"]["reason"]
+         others = (item for item in contracts.CAPABILITY_FAILURE_REASONS if item != recorded)
+         terminal["outcome"]["reason"] = next(others)
+         record.write_text(json.dumps(terminal))
+         with self.assertRaisesRegex(publisher.PublisherError, "reason differs"):
+             publisher.ingest_command(_args(base / "store-reason", matrix, artifact))
+
+ def test_rates_invert_latency_and_global_tokens_use_ep_size(self) -> None:
+     # Start from a valid dataset, then show that each single tampering is rejected.
+     def rejected(candidate, pattern):
+         with self.assertRaisesRegex(publisher.PublisherError, pattern):
+             publisher.validate_public_dataset(candidate)
+
+     dataset = _dataset()
+     publisher.validate_public_dataset(dataset)
+     roundtrip = dataset["series"][0]["points"][0]["components"]["roundtrip"]
+     rates = roundtrip["logical_payload_rate_gbps_at_latency_percentile"]
+     self.assertGreater(rates["p50"], rates["p99"])
+
+     tampered = copy.deepcopy(dataset)
+     tampered["series"][0]["points"][0]["global_tokens"] = 128
+     rejected(tampered, "EP size")
+     tampered = copy.deepcopy(dataset)
+     tampered["series"][0]["points"][0]["roundtrip_token_rate_at_latency_percentile"]["p99"] *= 2
+     rejected(tampered, "token throughput")
+     tampered = copy.deepcopy(dataset)
+     tampered["attempts"][0]["evidence"][0]["point_id"] = identity.point_id(
+         series=tampered["series"][0]["series_id"], tokens_per_rank=16
+     )
+     rejected(tampered, "point evidence")
+     tampered = copy.deepcopy(dataset)
+     tampered["attempts"][0]["series_id"] = None
+     rejected(tampered, "present exactly for selected success")
+     tampered = copy.deepcopy(dataset)
+     component = tampered["series"][0]["points"][0]["components"]["roundtrip"]
+     component["logical_bytes"] = None
+     component["logical_payload_rate_gbps_at_latency_percentile"] = None
+     rejected(tampered, "logical bandwidth is missing")
+
+     frozen_edits = (
+         lambda series: series.update({"model": "different-model"}),
+         lambda series: series["workload"].update({"hidden": 4096}),
+         lambda series: series["workload"].update({"top_k": 4}),
+         lambda series: series["workload"].update({"experts": 128}),
+     )
+     for edit in frozen_edits:
+         tampered = copy.deepcopy(dataset)
+         edit(tampered["series"][0])
+         rejected(tampered, "frozen v1")
+
+     tampered = copy.deepcopy(dataset)
+     tampered["series"][0]["eplb"]["mapping_sha256"] = "f" * 64
+     rejected(tampered, "claims a plan")
+
+     tampered = copy.deepcopy(dataset)
+     tampered["series"][0]["backend"].update({
+         "id": "nccl-ep", "label": publisher.BACKEND_LABELS["nccl-ep"],
+         "role": "reference", "generation": "rccl",
+     })
+     tampered["coverage"][0]["backend"] = "nccl-ep"
+     rejected(tampered, "configuration")
+
+ def test_routing_and_eplb_facts_must_match_across_repeats(self) -> None:
+     # Identical repeat facts collapse to one value; any drift must be rejected.
+     raw, _ = _native_fixture()
+     descriptor = publisher._eplb_descriptor(raw)
+     facts = publisher._routing_facts(raw["measurement"]["rows"][0])
+     merged = publisher._exact_repeat_value([descriptor, copy.deepcopy(descriptor)], "EPLB")
+     self.assertEqual(merged, descriptor)
+     merged = publisher._exact_repeat_value([facts, copy.deepcopy(facts)], "routing")
+     self.assertEqual(merged, facts)
+     # One perturbed routing fact is enough to break the repeat check.
+     changed = copy.deepcopy(facts)
+     changed["hotspot_ratio"] += 0.1
+     with self.assertRaisesRegex(publisher.PublisherError, "routing differs"):
+         publisher._exact_repeat_value([facts, changed], "routing")
+
+     dataset = _promoted_dataset()
+     dataset["promotion"]["status"] = "diagnostic"
+     eplb = next(item for item in dataset["series"] if item["eplb"]["enabled"])
+     # The same dataset validates with 280 empty experts but not with 288.
+     eplb["points"][0]["routing"]["empty_expert_count"] = 280
+     publisher.validate_public_dataset(dataset)
+     eplb["points"][0]["routing"]["empty_expert_count"] = 288
+     with self.assertRaisesRegex(publisher.PublisherError, "routing/load facts"):
+         publisher.validate_public_dataset(dataset)
+
+     # Some fields are listed twice, so the value is part of the subTest label.
+     for field, value in (
+         ("mapping_sha256", "0" * 64),
+         ("redundant_experts", 31),
+         ("replicated_experts", 1),
+         ("max_replicas", 2),
+         ("replicated_experts", 257),
+         ("max_replicas", 999),
+         ("imbalance_after", 0.4),
+         ("planner", "different-planner"),
+         ("reference_tokens_per_rank", 1024),
+     ):
+         broken = _promoted_dataset()
+         broken["promotion"]["status"] = "diagnostic"
+         enabled = next(
+             item["eplb"] for item in broken["series"] if item["eplb"]["enabled"]
+         )
+         enabled[field] = value
+         with self.subTest(eplb_field=field, value=value), self.assertRaisesRegex(
+             publisher.PublisherError, "EPLB descriptor"
+         ):
+             publisher.validate_public_dataset(broken)
+
+ def test_publisher_owns_stable_rankings_and_recommendations(self) -> None:
+     # Reference-role series stay out of library rankings but appear in a system cohort.
+     fast, fast_internal = _series("fast", "deepep", decision_grade=True)
+     slow, slow_internal = _series("slow", "uccl", decision_grade=True)
+     reference, reference_internal = _series("reference", "nccl-ep", decision_grade=True)
+     peer, peer_internal = _series("reference-peer", "nccl-ep", decision_grade=True)
+     for item in (reference, peer):
+         item["backend"]["role"] = "reference"
+     peer["system"].update({"sku": "h200-dgxc", "label": "NVIDIA H200"})
+     internals = {
+         fast["series_id"]: fast_internal,
+         slow["series_id"]: slow_internal,
+         reference["series_id"]: reference_internal,
+         peer["series_id"]: peer_internal,
+     }
+     cohorts, rankings, recommendations, _ = publisher.build_decisions(
+         [fast, slow, reference, peer], internals
+     )
+     library = next(item for item in cohorts if item["kind"] == "library")
+     library_rankings = [item for item in rankings if item["cohort_id"] == library["cohort_id"]]
+     p99_latency = next(
+         item for item in library_rankings
+         if item["metric"]["measure"] == "latency_us" and item["metric"]["statistic"] == "p99"
+     )
+     self.assertTrue(library["eligibility"]["decision_grade"])
+     self.assertEqual(p99_latency["entries"][0]["series_id"], fast["series_id"])
+     self.assertTrue(any(item["series_id"] == fast["series_id"] for item in recommendations))
+     # No ranking of the library cohort may list the reference series.
+     ranked_ids = {entry["series_id"] for item in library_rankings for entry in item["entries"]}
+     self.assertNotIn(reference["series_id"], ranked_ids)
+     self.assertTrue(any(
+         item["kind"] == "system" and reference["series_id"] in item["series_ids"]
+         for item in cohorts
+     ))
+
+ def test_routing_evidence_is_experimental_and_not_a_configuration_recommendation(self) -> None:
+     # The routing cohort may be ranked, but every recommendation must be official-tier.
+     dataset = _promoted_dataset()
+     routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+     members = [
+         item for item in dataset["series"]
+         if item["series_id"] in routing["series_ids"]
+     ]
+     variants = {(item["workload"]["routing"], item["workload"]["eplb"]) for item in members}
+     self.assertEqual(variants, {("uniform", False), ("zipf", False), ("zipf", True)})
+     for factor in ("implementation-static-build", "resource"):
+         self.assertIn(factor, routing["controlled_factors"])
+     self.assertEqual(
+         routing["varying_factors"],
+         ["workload.routing", "workload.eplb", "implementation-config"],
+     )
+
+     # Members share one routing control and one resource but differ in implementation contract.
+     controls = {item["build"]["routing_control_sha256"] for item in members}
+     self.assertEqual(len(controls), 1)
+     implementations = {item["build"]["implementation_contract_sha256"] for item in members}
+     self.assertGreater(len(implementations), 1)
+     resources = {json.dumps(item["resource"], sort_keys=True) for item in members}
+     self.assertEqual(len(resources), 1)
+     self.assertEqual(routing["publication_tier"], "comparable-experimental")
+
+     routing_id = routing["cohort_id"]
+     self.assertTrue(any(
+         item["cohort_id"] == routing_id for item in dataset["rankings"]
+     ))
+     self.assertFalse(any(
+         item["cohort_id"] == routing_id for item in dataset["recommendations"]
+     ))
+     self.assertTrue(all(
+         item["publication_tier"] == "official" for item in dataset["recommendations"]
+     ))
+     # No recommendation may point at any comparable-experimental cohort.
+     experimental = {
+         cohort["cohort_id"] for cohort in dataset["cohorts"]
+         if cohort["publication_tier"] == "comparable-experimental"
+     }
+     self.assertFalse(any(item["cohort_id"] in experimental for item in dataset["recommendations"]))
+     # Sensitivities attached to the routing cohort carry the experimental tier.
+     tiers = [
+         item["publication_tier"] for item in dataset["sensitivities"]
+         if item["cohort_id"] == routing_id
+     ]
+     self.assertTrue(all(tier == "comparable-experimental" for tier in tiers))
+
+ def test_routing_implementation_mismatch_blocks_all_decisions(self) -> None:
+     # Forge one routing member's implementation contract and rebuild the decisions.
+     dataset = _promoted_dataset()
+     published = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+     members = [
+         item for item in dataset["series"]
+         if item["series_id"] in published["series_ids"]
+     ]
+     # The member that gets forged is the zipf series that runs without EPLB.
+     plain_zipf = next(
+         item for item in members
+         if item["workload"]["routing"] == "zipf" and not item["workload"]["eplb"]
+     )
+     plain_zipf["build"]["implementation_contract_sha256"] = "f" * 64
+
+     # Synthesize three identical runs per member from its first published point.
+     measures = ("latency_us", "logical_payload_rate_gbps_at_latency_percentile")
+     internals = {}
+     for member in members:
+         point = member["points"][0]
+         roundtrip = point["components"]["roundtrip"]
+         metrics = {
+             measure: {name: roundtrip[measure][name] for name in ("p50", "p99")}
+             for measure in measures
+         }
+         run_metrics = {
+             str(run): {point["tokens_per_rank"]: metrics}
+             for run in range(3)
+         }
+         internals[member["series_id"]] = {"run_metrics": run_metrics}
+
+     # Only the routing members and their synthetic runs are passed back in.
+     cohorts, rankings, recommendations, sensitivities = publisher.build_decisions(
+         members, internals
+     )
+     routing = next(item for item in cohorts if item["kind"] == "routing")
+     self.assertFalse(routing["eligibility"]["decision_grade"])
+     self.assertIn(
+         "implementation-config-mismatch", routing["eligibility"]["reasons"]
+     )
+     # All three decision outputs must come back empty.
+     self.assertEqual((rankings, recommendations, sensitivities), ([], [], []))
+
+ def test_promoted_series_fields_are_bound_to_case_and_series_identities(self) -> None:
+     # Every forged public field must make validation fail with the expected message.
+     def rejected(candidate, pattern):
+         with self.assertRaisesRegex(publisher.PublisherError, pattern):
+             publisher.validate_public_dataset(candidate)
+
+     dataset = _promoted_dataset()
+     changed = copy.deepcopy(dataset)
+     series = next(
+         item for item in changed["series"]
+         if item["system"]["sku"] == "h100-dgxc"
+     )
+     series["system"].update({
+         "sku": "h200-dgxc", "label": "NVIDIA H200",
+         "topology_class": "h200-nvlink-island",
+     })
+     # Mirror the SKU change into the coverage row of each of the series' cases.
+     for case_id in series["case_ids"]:
+         row = next(item for item in changed["coverage"] if item["case_id"] == case_id)
+         row["sku"] = "h200-dgxc"
+     rejected(changed, "configuration|case identity")
+
+     # Build provenance fields, each replaced on the first series in turn.
+     build_forgeries = {
+         "source_sha": "b" * 40,
+         "image_digest": "sha256:" + "4" * 64,
+         "squash_sha256": "5" * 64,
+         "runtime_fingerprint_sha256": "6" * 64,
+         "implementation_contract_sha256": "7" * 64,
+         "public_config_sha256": "9" * 64,
+         "routing_control_sha256": "8" * 64,
+     }
+     for field, value in build_forgeries.items():
+         changed = copy.deepcopy(dataset)
+         changed["series"][0]["build"][field] = value
+         with self.subTest(build_field=field):
+             rejected(changed, "commit")
+
+     changed = copy.deepcopy(dataset)
+     changed["series"][0]["workload"]["workload_id"] = identity.workload_id(
+         {"changed": True}
+     )
+     rejected(changed, "committed factors")
+
+     # (section, forged values, expected message) applied to the first series.
+     projections = (
+         ("backend", {"generation": "fabricated", "version": "fabricated-999"}, "configuration"),
+         ("resource", {"profile": "profile-fabricated", "configured_units": 99}, "configuration"),
+         ("system", {"label": "Fabricated H100"}, "projection"),
+     )
+     for section, forged, message in projections:
+         changed = copy.deepcopy(dataset)
+         changed["series"][0][section].update(forged)
+         rejected(changed, message)
+
+     diagnostic = _dataset()
+     diagnostic["series"][0]["build"]["source_sha"] = "b" * 40
+     rejected(diagnostic, "committed factors")
+
+ def test_all_decision_metrics_require_stable_repeat_ordering(self) -> None:  # baseline decisions first, then perturbed run metrics
+ fast, fast_internal = _series("ordering-fast", "deepep", decision_grade=True)
+ slow, slow_internal = _series("ordering-slow", "uccl", decision_grade=True)
+ internals = {
+ fast["series_id"]: fast_internal,
+ slow["series_id"]: slow_internal,
+ }
+
+ cohorts, rankings, recommendations, _ = publisher.build_decisions(  # baseline: unmodified run metrics
+ [fast, slow], internals
+ )
+ library = next(item for item in cohorts if item["kind"] == "library")
+ self.assertTrue(library["eligibility"]["decision_grade"])
+ self.assertEqual(
+ len([item for item in rankings if item["cohort_id"] == library["cohort_id"]]),
+ 4,  # rankings expected for the library cohort
+ )
+ self.assertEqual(
+ len([
+ item for item in recommendations
+ if item["cohort_id"] == library["cohort_id"]
+ ]),
+ 4,  # recommendations expected for the library cohort
+ )
+
+ for statistic in ("p50", "p99"):  # run "1", key 8: slow rate becomes twice the fast rate
+ slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = (
+ fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] * 2
+ )
+ cohorts, rankings, recommendations, _ = publisher.build_decisions(  # NOTE(review): indentation is lost in this view; confirm this rebuild runs after the loop
+ [fast, slow], internals
+ )
+ library = next(item for item in cohorts if item["kind"] == "library")
+ self.assertFalse(library["eligibility"]["decision_grade"])
+ self.assertIn("unstable-ordering", library["eligibility"]["reasons"])
+ self.assertFalse(any(  # no ranking may reference the library cohort any more
+ item["cohort_id"] == library["cohort_id"] for item in rankings
+ ))
+ self.assertFalse(any(  # and no recommendation either
+ item["cohort_id"] == library["cohort_id"] for item in recommendations
+ ))
+
+ def test_extra_eligibility_reason_blocks_decision_grade(self) -> None:
+     # One extra reason must veto decision grade, and a forged flag must be caught.
+     allocations = [identity.allocation_id({"run": index}) for index in range(3)]
+     record = publisher._eligibility_record(
+         allocations, complete=True, correct=True, measured=True, stable_ordering=True,
+         p50_ratio=1.01, p99_ratio=1.02, extra_reasons=["incomplete-provenance"],
+     )
+     self.assertFalse(record["decision_grade"])
+     self.assertEqual(record["reasons"], ["incomplete-provenance"])
+     self.assertIs(publisher._eligibility(record, "fixture"), record)
+     forged = dict(record, decision_grade=True)
+     with self.assertRaisesRegex(publisher.PublisherError, "promotion gates"):
+         publisher._eligibility(forged, "fixture")
+
+ def test_schema_is_strict_and_channel_target_must_be_complete(self) -> None:
+     # Strict schema and size checks first, then a series of on-disk tampering cases.
+     def refused(pattern):
+         return self.assertRaisesRegex(publisher.PublisherError, pattern)
+     extended = _dataset()
+     extended["unexpected"] = True
+     with self.assertRaises(publisher.PublisherError):
+         publisher.validate_public_dataset(extended)
+     with mock.patch.object(publisher, "MAX_PUBLIC_DATASET_BYTES", 1), refused("serving size limit"):
+         publisher.validate_public_dataset(_dataset())
+     channel = "latest-attempt"
+     with tempfile.TemporaryDirectory() as scratch:
+         store = publisher.Store(Path(scratch).resolve())
+         dataset = _dataset()
+         digest, size = store.install_dataset(dataset)
+         store.update_channel(channel, digest, size, dataset["generated_at"])
+         self.assertEqual(store.verify_channel(channel)["dataset"]["sha256"], digest)
+         # Rewrite the pointer on disk with a different timestamp.
+         channel_path = store.channels / "latest-attempt.json"
+         pointer = publisher.strict_load(channel_path)
+         pointer["generated_at"] = "2099-01-01T00:00:00Z"
+         channel_path.write_bytes(contracts.canonical_json_bytes(pointer))
+         with refused("metadata differs"):
+             store.verify_channel(channel)
+         store.update_channel(channel, digest, size, dataset["generated_at"])
+         with refused("metadata differs"):
+             store.update_channel(channel, digest, size + 1, dataset["generated_at"])
+         with refused("metadata differs"):
+             store.update_channel(channel, digest, size, "2026-07-05T00:00:00Z")
+         # Loosen one mode at a time, restoring it after each refusal.
+         os.chmod(channel_path, 0o666)
+         with refused("regular 644"):
+             store.verify_channel(channel)
+         os.chmod(channel_path, 0o644)
+         dataset_dir = store.datasets / digest
+         os.chmod(dataset_dir, 0o755)
+         with refused("mode differs"):
+             store.verify_channel(channel)
+         os.chmod(dataset_dir, 0o555)
+         os.chmod(dataset_dir / "dataset.json", 0o644)
+         with refused("mode differs"):
+             store.verify_channel(channel)
+         os.chmod(dataset_dir / "dataset.json", 0o444)
+         os.chmod(dataset_dir, 0o755)
+         (dataset_dir / "COMPLETE").unlink()
+         os.chmod(dataset_dir, 0o555)
+         with refused("incomplete"):
+             store.verify_channel(channel)
+
+ def test_store_modes_do_not_depend_on_process_umask(self) -> None:
+     # Create everything under umask 0o077, then compare each mode with its fixed value.
+     with tempfile.TemporaryDirectory() as scratch:
+         saved_umask = os.umask(0o077)
+         try:
+             store = publisher.Store(Path(scratch).resolve())
+             dataset = _dataset()
+             digest, size = store.install_dataset(dataset)
+             store.update_channel(
+                 "latest-attempt", digest, size, dataset["generated_at"]
+             )
+             # NOTE(review): presumably this is what creates publisher.lock — confirm.
+             with store.locked():
+                 pass
+         finally:
+             os.umask(saved_umask)
+
+         expected_modes = (
+             # store root
+             (store.root, 0o750),
+             # channel pointer
+             (store.channels / "latest-attempt.json", 0o644),
+             # installed dataset payload
+             (store.datasets / digest / "dataset.json", 0o444),
+             # publisher lock file
+             (store.locks / "publisher.lock", 0o600),
+         )
+         # Mask to the permission bits before comparing.
+         for path, mode in expected_modes:
+             self.assertEqual(path.stat().st_mode & 0o777, mode)
+
+ def test_verify_requires_bootstrap_but_dev_latest_is_optional(self) -> None:
+     # verify fails before bootstrap, passes once latest-attempt exists, and refuses
+     # a dev-latest pointer copied from the quarantine channel.
+     stamp = "2026-07-04T00:00:00Z"
+     with tempfile.TemporaryDirectory() as scratch:
+         store_root = str(Path(scratch).resolve() / "store")
+         default = types.SimpleNamespace(
+             store_root=store_root, channel=None, bundle=[]
+         )
+         with self.assertRaises(publisher.PublisherError):
+             publisher.verify_command(default)
+         store = publisher.Store(default.store_root)
+         quarantine = publisher._quarantine_dataset("awaiting-v1-runs", stamp)
+         digest, size = store.install_dataset(quarantine)
+         store.update_channel("latest-attempt", digest, size, stamp)
+         report = publisher.verify_command(default)
+         self.assertEqual(set(report["channels"]), {"latest-attempt"})
+         # No dev-latest pointer has been written yet, so requesting it explicitly must fail.
+         explicit = types.SimpleNamespace(
+             store_root=store_root, channel=["dev-latest"], bundle=[]
+         )
+         with self.assertRaises(publisher.PublisherError):
+             publisher.verify_command(explicit)
+
+         # Forge dev-latest by relabelling the verified latest-attempt pointer.
+         forged = copy.deepcopy(store.verify_channel("latest-attempt"))
+         forged["channel"] = "dev-latest"
+         (store.channels / "dev-latest.json").write_bytes(contracts.canonical_json_bytes(forged))
+         with self.assertRaisesRegex(publisher.PublisherError, "non-promoted"):
+             publisher.verify_command(default)
+
+
+if __name__ == "__main__":  # allow running this test module directly
+ unittest.main()
diff --git a/experimental/CollectiveX/tests/test_sampling_contract.py b/experimental/CollectiveX/tests/test_sampling_contract.py
new file mode 100644
index 0000000000..fa4b4005ea
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_sampling_contract.py
@@ -0,0 +1,2287 @@
+#!/usr/bin/env python3
+"""CPU-only behavioral tests for the CollectiveX v1 execution contract."""
+from __future__ import annotations
+
+import argparse
+import ast
+import copy
+import hashlib
+import io
+import json
+import os
+from pathlib import Path
+import re
+import subprocess
+import sys
+import tempfile
+import types
+import unittest
+from unittest import mock
+
+import numpy as np
+
+HERE = Path(__file__).resolve().parent
+ROOT = HERE.parent
+sys.path[:0] = [str(ROOT), str(HERE)]
+
+import artifact_safety # noqa: E402
+import capability # noqa: E402
+import contracts # noqa: E402
+import eplb # noqa: E402
+import ep_harness # noqa: E402
+import identity # noqa: E402
+import run_ep # noqa: E402
+import summarize # noqa: E402
+import sweep_matrix # noqa: E402
+import workload # noqa: E402
+
+
+class SamplingContractTest(unittest.TestCase):
+ def test_identity_and_fixed_sampling_profile(self) -> None:
+     # Pins the identity test vector, the fixed sampling constants, and the v1 case profile.
+     identity.verify_test_vector()
+     self.assertTrue(identity.is_typed_id(identity.IDENTITY_TEST_VECTOR["series_id"], "series"))
+     self.assertEqual(ep_harness.SAMPLING_CONTRACT, "fixed-512-v1")
+
+     # Iterations, trials, samples, and warm-up as exported by the harness.
+     constants = (
+         ep_harness.TIMED_ITERS_PER_TRIAL,
+         ep_harness.TRIALS_PER_POINT,
+         ep_harness.TIMED_SAMPLES_PER_POINT,
+         ep_harness.WARMUP_ITERS_PER_TRIAL,
+     )
+     self.assertEqual(constants, (8, 64, 512, 32))
+
+     # Profile keys and the values they are frozen to.
+     pinned_profile = (
+         ("activation_profile", "canonical-counter-source-v3"),
+         ("activation_generator", "collectivex-activation-counter-v3"),
+         ("sampling_contract", "fixed-512-v1"),
+         ("percentile_method", "nearest-rank"),
+         ("rank_reduction", "cross-rank-max-per-iteration"),
+         ("oracle_contract", "expert-specific-transform-v1"),
+     )
+     for key, expected in pinned_profile:
+         self.assertEqual(identity.V1_CASE_PROFILE[key], expected)
+
+     # CLI defaults for iters, trials, and warmup.
+     parser = argparse.ArgumentParser()
+     ep_harness.add_common_args(parser)
+     argv = ["--runner", "test", "--topology-class", "test", "--out", "result.json"]
+     args = parser.parse_args(argv)
+     self.assertEqual((args.iters, args.trials, args.warmup), (8, 64, 32))
+
+     # Of the triples tried here, only the first may pass sampling_contract_error.
+     accepted = (8, 64, 32)
+     for profile in (accepted, (128, 4, 32), (8, 1, 4), (0, 64, 32)):
+         with self.subTest(profile=profile):
+             verdict = ep_harness.sampling_contract_error(*profile)
+             self.assertEqual(verdict is None, profile == accepted)
+
+ def test_nearest_rank_percentiles_use_all_512_samples(self) -> None:
+     # With samples 1..512 the expected values are 256 (p50) and 507 (p99).
+     for rank, value in ((50, 256), (99, 507)):
+         self.assertEqual(ep_harness.percentile(list(range(1, 513)), rank), value)
+
+ def test_terminal_summary_uses_bound_sku_and_route(self) -> None:
+     # The summary identity tuple takes its SKU from identity.case_factors.
+     case = {
+         "backend": "deepep", "phase": "prefill", "ep": 8,
+         "suite": "ep-routing-v1", "routing": "zipf", "eplb": True,
+         "required_publication": "comparable-experimental",
+     }
+     terminal = {
+         "format": contracts.TERMINAL_FORMAT,
+         "case": case,
+         "identity": {"case_factors": {"sku": "h100-dgxc"}},
+     }
+     expected = (
+         "h100-dgxc", "ep-routing-v1", "zipf", "prefill", True,
+         "comparable-experimental", 8,
+     )
+     self.assertEqual(summarize._identity(terminal), expected)
+
+ def test_matrix_cases_and_shards_are_identity_bound(self) -> None:
+     # Resolve the full matrix once, check its totals and identity bindings, then
+     # confirm that tampered matrix and shard documents are refused.
+     matrix = sweep_matrix.validate_matrix_document(
+         sweep_matrix.resolve_matrix(backends="all")
+     )
+     entries = matrix["requested_cases"]
+     requested = {item["case"]["case_id"]: item for item in entries}
+     assigned = [case_id for shard in matrix["include"] for case_id in shard["case_ids"]]
+     runnable = {
+         key for key, item in requested.items()
+         if item["disposition"] == "runnable"
+     }
+
+     # Frozen totals: shard count, requested cases, and ladder points.
+     ladder_points = sum(
+         len(item["case"]["ladder"].split()) for item in entries
+     )
+     self.assertEqual(
+         (len(matrix["include"]), len(entries), ladder_points),
+         (38, 360, 840),
+     )
+
+     routing_points = {}
+     for phase in ("decode", "prefill"):
+         routing_points[phase] = {
+             int(point)
+             for item in entries
+             if item["case"]["suite"] == "ep-routing-v1" and item["case"]["phase"] == phase
+             for point in item["case"]["ladder"].split()
+         }
+     self.assertEqual(routing_points, {"decode": {128}, "prefill": {512}})
+
+     # The leading shards must carry each SKU once, in sorted order.
+     skus = sorted({shard["sku"] for shard in matrix["include"]})
+     leading = matrix["include"][:len(skus)]
+     self.assertEqual([shard["sku"] for shard in leading], skus)
+     self.assertEqual(set(assigned), runnable)
+     self.assertEqual(len(assigned), len(set(assigned)))
+
+     # Capability resolution must refuse these backend/SKU pairs.
+     excluded = {
+         "uccl": {"b200-dgxc", "b300"},
+     }
+     for backend, blocked in excluded.items():
+         for sku in blocked:
+             with self.subTest(backend=backend, sku=sku):
+                 self.assertFalse(capability.resolve(sku, backend)[0])
+
+     # Every case ID must be re-derivable from its SKU, the v1 profile, and its factors.
+     for case_id, item in requested.items():
+         case = {key: value for key, value in item["case"].items() if key != "case_id"}
+         derived = identity.case_id(
+             sku=item["sku"], profile=identity.V1_CASE_PROFILE, case=case
+         )
+         self.assertEqual(case_id, derived)
+         self.assertEqual(case["timing"], "8:64:32")
+         self.assertEqual(case["samples_per_point"], 512)
+
+     bad_matrix = copy.deepcopy(matrix)
+     bad_matrix["schema_version"] = True
+     with self.assertRaises(sweep_matrix.MatrixError):
+         sweep_matrix.validate_matrix_document(bad_matrix)
+
+     # Change a runnable case's hidden size, re-derive its ID, and patch the shard
+     # lists to match; validation must still fail with "frozen v1".
+     bad_catalog = copy.deepcopy(matrix)
+     wrapper = next(
+         item for item in bad_catalog["requested_cases"] if item["disposition"] == "runnable"
+     )
+     old_id = wrapper["case"]["case_id"]
+     wrapper["case"]["hidden"] = 1
+     factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"}
+     new_id = identity.case_id(
+         sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors
+     )
+     wrapper["case"]["case_id"] = new_id
+     for shard in bad_catalog["include"]:
+         shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]]
+     with self.assertRaisesRegex(sweep_matrix.MatrixError, "frozen v1"):
+         sweep_matrix.validate_matrix_document(bad_catalog)
+
+     # A shard control whose schema_version is True must be refused as well.
+     shard_meta = matrix["include"][0]
+     cases_by_id = {item["case"]["case_id"]: item["case"] for item in entries}
+     shard = {"schema_version": True}
+     shard.update({key: shard_meta[key] for key in ("id", "sku", "backend", "nodes", "n")})
+     shard["cases"] = [cases_by_id[value] for value in shard_meta["case_ids"]]
+     with self.assertRaises(sweep_matrix.MatrixError):
+         sweep_matrix.validate_shard_control(
+             shard, sku=shard_meta["sku"], backend=shard_meta["backend"],
+             nodes=shard_meta["nodes"],
+         )
+
+ def test_matrix_yaml_and_config_validation_are_strict(self) -> None:
+     # Each corruption of the shipped configs must abort validation with SystemExit,
+     # and the YAML loader itself must refuse a duplicated key.
+     suites = sweep_matrix._load("suites.yaml")
+     workloads = sweep_matrix._load("workloads.yaml")
+
+     def core(document):
+         return document["suites"]["ep-core-v1"]
+
+     def routing(document):
+         return document["suites"]["ep-routing-v1"]
+
+     def model(document):
+         return document["model_derived"]["deepseek-v3-v1"]
+
+     corruptions = {
+         "unknown top": lambda doc, _: doc.update({"typo": True}),
+         "unknown suite field": (
+             lambda doc, _: core(doc).update({"modes": ["normal"]})
+         ),
+         "unknown workload field": (
+             lambda _, doc: model(doc).update({"unused": 1})
+         ),
+         "string phases": lambda doc, _: core(doc).update({"phases": "decode"}),
+         "unknown routing": (
+             lambda doc, _: core(doc).update({"routings": ["random"]})
+         ),
+         "integer EPLB": (
+             lambda doc, _: routing(doc).update({"eplb": [0, 1]})
+         ),
+         "duplicate platform": (
+             lambda doc, _: core(doc)["platforms"].append("h100-dgxc")
+         ),
+         "missing top field": lambda doc, _: doc.pop("schema_version"),
+         "string dimension": (
+             lambda _, doc: model(doc).update({"hidden": "7168"})
+         ),
+         "unreachable phase ladder": (
+             lambda doc, _: routing(doc).update({"phases": ["prefill"]})
+         ),
+     }
+     for label, corrupt in corruptions.items():
+         with self.subTest(label=label), self.assertRaises(SystemExit):
+             bad_suites, bad_workloads = copy.deepcopy(suites), copy.deepcopy(workloads)
+             corrupt(bad_suites, bad_workloads)
+             sweep_matrix.validate_config_documents(bad_suites, bad_workloads)
+
+     with tempfile.TemporaryDirectory() as scratch:
+         config_root = Path(scratch)
+         (config_root / "configs").mkdir()
+         duplicate = config_root / "configs" / "duplicate.yaml"
+         duplicate.write_text("schema_version: 1\nsuites:\n same: 1\n same: 2\n")
+         relocated = mock.patch.object(sweep_matrix, "HERE", config_root)
+         with relocated, self.assertRaisesRegex(SystemExit, "duplicate YAML key"):
+             sweep_matrix._load("duplicate.yaml")
+
+ def test_semantically_duplicate_suite_points_are_rejected(self) -> None:
+     # Cloning ep-core-v1 under a second suite name must abort matrix resolution.
+     suites = sweep_matrix._load("suites.yaml")
+     workloads = sweep_matrix._load("workloads.yaml")
+     suites["suites"]["ep-core-copy-v1"] = copy.deepcopy(suites["suites"]["ep-core-v1"])
+
+     def load(name: str) -> dict[str, object]:
+         if name == "workloads.yaml":
+             return workloads
+         return suites
+
+     patched = mock.patch.object(sweep_matrix, "_load", side_effect=load)
+     with patched, self.assertRaisesRegex(SystemExit, "duplicate semantic point"):
+         sweep_matrix.resolve_matrix()
+
+ def test_only_three_shared_launchers_are_registered(self) -> None:
+     # Exactly three launcher scripts exist, every platform uses one of them, and each
+     # script contains its setup calls in the required textual order.
+     launcher_dir = ROOT / "launchers"
+     scripts = {path.name for path in launcher_dir.glob("launch_*.sh")}
+     self.assertEqual(
+         scripts, {"launch_single-slurm.sh", "launch_gb-nv.sh", "launch_mi-amds.sh"}
+     )
+     registered = {platform["launcher"] for platform in capability.PLATFORMS.values()}
+     self.assertEqual(registered, {"single-slurm", "gb-nv", "mi-amds"})
+
+     lock_environment = 'cx_lock_canonical_gha_env "$RUNNER"'
+     validate = 'cx_validate_shard_control "$CX_DIR"'
+     stage = 'MOUNT_SRC="$(cx_stage_repo '
+     for platform in capability.PLATFORMS.values():
+         launcher = launcher_dir / f"launch_{platform['launcher']}.sh"
+         self.assertTrue(launcher.is_file())
+         source = launcher.read_text()
+         self.assertNotIn("RUNNER_NAME", source)
+         self.assertIn("cx_preflight_allocation", source)
+         self.assertIn(lock_environment, source)
+         # The operator-config load must appear before the environment lock.
+         self.assertLess(source.index("cx_load_operator_config"), source.index(lock_environment))
+         self.assertIn(validate, source)
+         # Shard validation must appear before both staging and the variable check.
+         for later in (stage, "cx_require_vars"):
+             self.assertLess(source.index(validate), source.index(later))
+
+     common = (ROOT / "runtime" / "common.sh").read_text()
+     workflow_path = ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml"
+     workflow = workflow_path.read_text()
+     self.assertNotIn("RUNNER_NAME", common)
+     self.assertNotIn("RUNNER_NAME:", workflow)
+     self.assertNotIn("flashinfer", capability.BACKENDS)
+     self.assertFalse((HERE / "ep_flashinfer.py").exists())
+
+ def test_image_pinned_deepep_and_input_integrity_order_are_explicit(self) -> None:
+     # Text-level checks on the DeepEP probe in the runtime script and on harness pass 1.
+     runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+     probe_start = runtime.index("cx_probe_deepep()")
+     probe = runtime[probe_start:runtime.index("cx_activate_deepep_v2()")]
+     self.assertIn('expected_version="1.2.1"', probe)
+     self.assertIn('expected_version="1.1.0+814e508"', probe)
+     for forbidden in ("pip install", "cx_fetch_revision"):
+         self.assertNotIn(forbidden, probe)
+     self.assertIn("Path(deep_ep.__file__).resolve() in recorded_files", probe)
+     self.assertIn("Path(buffer_module.__file__).resolve() in recorded_files", probe)
+
+     harness = (HERE / "ep_harness.py").read_text()
+     pass_one = harness[harness.index("# ---- Pass 1"):harness.index("# ---- Pass 2")]
+     # Within pass 1 the input snapshot must appear before the oracle call.
+     snapshot_at = pass_one.index("input_snapshots[T] =")
+     oracle_at = pass_one.index("oracle = _run_expert_oracle")
+     self.assertLess(snapshot_at, oracle_at)
+     self.assertIn("pre_input_unchanged", pass_one)
+     self.assertIn("hh = prep()\n torch.cuda.synchronize()", harness)
+
+ def test_squash_imports_are_reproducible_and_use_a_fresh_cache_key(self) -> None:
+     # Scan the shell sources for the squash settings, then run cx_squash_path in bash
+     # for two platforms and compare the printed cache paths.
+     common_path = ROOT / "runtime" / "common.sh"
+     common = common_path.read_text()
+     amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+     self.assertIn('CX_SQUASH_FORMAT_VERSION="repro-v1"', common)
+     self.assertIn("SOURCE_DATE_EPOCH=\"$CX_SQUASH_SOURCE_DATE_EPOCH\"", common)
+     self.assertIn("${COLLECTIVEX_IMAGE_DIGEST#sha256:}", common)
+     self.assertIn("cx_ensure_squash_on_job", amd)
+     self.assertIn('"${CX_LOCK_DIR:-}"', amd)
+     self.assertNotIn('"${CX_LOCK_DIR:-/tmp}"', amd)
+     self.assertIn('[ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"', common)
+     for text in (common, amd):
+         self.assertGreaterEqual(text.count("--chdir=/tmp"), 2)
+     self.assertIn('ENROOT_CACHE_PATH="$compute_home/enroot-cache"', common)
+     self.assertIn('ENROOT_RUNTIME_PATH="$compute_home/enroot-run"', common)
+     self.assertEqual(common.count('cx_reverify_registry_image "$image"'), 2)
+
+     # Same image and digest, two platforms: amd64 first, then arm64.
+     script = (
+         f'source "{common_path}"; '
+         'COLLECTIVEX_IMAGE_DIGEST="sha256:$(printf b%.0s {1..64})"; '
+         'CX_IMAGE_PLATFORM=linux/amd64; cx_squash_path /cache repo/image:tag; '
+         'printf "\\n"; CX_IMAGE_PLATFORM=linux/arm64; '
+         'cx_squash_path /cache repo/image:tag'
+     )
+     result = subprocess.run(
+         ["bash", "-c", script], text=True, capture_output=True
+     )
+     self.assertEqual(result.returncode, 0, result.stderr)
+     digest = "b" * 64
+     expected = [
+         f"/cache/repro-v1_{digest}_repo_image_tag.sqsh",
+         f"/cache/repro-v1_linux_arm64_{digest}_repo_image_tag.sqsh",
+     ]
+     self.assertEqual(result.stdout.splitlines(), expected)
+
+    def test_launchers_preserve_platform_specific_runtime_requirements(self) -> None:  # literal pins in launcher scripts
+        single = (ROOT / "launchers" / "launch_single-slurm.sh").read_text()
+        gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text()
+        amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+        common = (ROOT / "runtime" / "common.sh").read_text()
+        self.assertIn("ALLOC_EXTRA=(--mem=0)", single)
+        self.assertIn("ALLOC_EXTRA=(-N 1 --mem=0)", single)
+        self.assertIn("SRUN_EXTRA=(--mpi=none --container-remap-root)", single)
+        self.assertIn("CX_ENROOT_LOCAL_IMPORT=1", single)
+        self.assertIn('PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-', gb)
+        self.assertIn("cx_ensure_squash_on_job", gb)
+        self.assertIn("--mem=0 --cpus-per-task=72", gb)
+        self.assertIn("--mem=0 --cpus-per-task=35", gb)
+        self.assertIn("--container-writable", gb)
+        self.assertIn("--container-remap-root", gb)
+        workload_stage = gb[  # text from "workload_args=(" up to the next "workload_log="
+            gb.index("workload_args=("):gb.index("workload_log=", gb.index("workload_args=("))
+        ]
+        self.assertNotIn("--workload", workload_stage)
+        self.assertIn("mi325x) CPUS_PER_TASK=256", amd)
+        self.assertIn("/dev/kfd:/dev/kfd,/dev/dri:/dev/dri", amd)
+        collect = common[common.index("cx_collect_results()"):  # span between the two function names
+                         common.index("cx_cleanup_stage()")]
+        cleanup = common[common.index("cx_launcher_cleanup()"):  # span between the two function names
+                         common.index("cx_install_launcher_fail_safe()")]
+        self.assertNotIn("cx_cleanup_stage", collect)  # the collect span must not mention stage cleanup
+        self.assertLess(cleanup.index("cx_cancel_job"), cleanup.index("cx_cleanup_stage"))  # cancel is named first
+        runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+        self.assertIn('distribution.read_text("direct_url.json")', runtime)
+        self.assertIn("6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac", runtime)  # NOTE(review): pinned hex
+        self.assertIn("2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882", runtime)  # digests; meaning not shown here
+
+ def test_deferred_backend_provenance_resolves_before_measurement(self) -> None:
+ harness = (ROOT / "tests" / "ep_harness.py").read_text()
+ conditioning = harness.index("for wt in conditioning_ladder")
+ provenance = harness.index("# Setup may materialize deferred provenance")
+ measurement = harness.index("# ---- Pass 1: build each deterministic problem")
+ self.assertLess(conditioning, provenance)
+ self.assertLess(provenance, measurement)
+
+    def test_backend_specific_routing_contracts_are_explicit(self) -> None:  # literal pins in backend test modules
+        hybrid = (ROOT / "tests" / "ep_deepep_hybrid.py").read_text()
+        self.assertIn("self.domain_rank = int(self.buffer.local_rank)", hybrid)
+        self.assertIn(
+            "probability_columns = self.domain_rank * self.local_experts + local_expert_ids",
+            hybrid,
+        )
+        self.assertIn("h.recv_probs[:count][rows, probability_columns]", hybrid)
+
+        mori = (ROOT / "tests" / "ep_mori.py").read_text()
+        self.assertIn("topk_idx=indices", mori)
+        self.assertIn("indices=indices", mori)
+        self.assertIn(
+            "combine_indices = p.indices if self._async_ll else h.dispatch_indices",
+            mori,
+        )
+        self.assertIn("h.combine_input,\n None,\n combine_indices", mori)  # spans three source lines
+        self.assertIn('"use_external_inp_buf": self._async_ll', mori)
+        self.assertIn("self.block_num = self._block_target = 64", mori)
+        self.assertIn('config_kwargs["block_num"] = self.block_num', mori)
+        self.assertIn(
+            'config_kwargs["warp_num_per_block"] = self.dispatch_warps', mori
+        )
+        self.assertIn("count > tensor.size(0)", mori)
+        self.assertIn("return combined[:p.T]", mori)
+        self.assertNotIn("return combined\n", mori)  # no bare, unsliced return of combined
+        self.assertIn(
+            "raw_expert_ids < local_start + experts_per_rank",
+            mori,
+        )
+        self.assertNotIn("MoRI returned a non-local expert", mori)  # this message text must not appear
+        harness = (ROOT / "tests" / "ep_harness.py").read_text()
+        self.assertIn("problem.recv_tokens = backend.recv_tokens(handle)", harness)
+
+    def test_mori_masks_global_topk_metadata_to_the_local_rank(self) -> None:  # also covers _mori_source_commit + profile
+        path = HERE / "ep_mori.py"
+        tree = ast.parse(path.read_text(), str(path))  # parsed only; the module itself is never imported here
+        helper = next(
+            node
+            for node in tree.body
+            if isinstance(node, ast.FunctionDef) and node.name == "_project_local_metadata"
+        )
+        namespace: dict[str, object] = {}  # receives the helper when its def is exec'd on the next line
+        exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace)
+        raw_ids = np.array([[0, 32, 63, -1], [64, 95, 7, 96]], dtype=np.int64)
+        raw_weights = np.arange(8, dtype=np.float32).reshape(2, 4)
+        torch_module = types.SimpleNamespace(  # numpy functions stand in for the torch module argument
+            where=np.where,
+            full_like=np.full_like,
+            zeros_like=np.zeros_like,
+        )
+        ids, weights, local_ids = namespace["_project_local_metadata"](
+            torch_module, raw_ids, raw_weights, 1, 32  # NOTE(review): 1 and 32 look like rank / experts per rank — confirm
+        )
+        np.testing.assert_array_equal(
+            ids,
+            np.array([[-1, 32, 63, -1], [-1, -1, -1, -1]], dtype=np.int64),  # only 32 and 63 survive; the rest are -1
+        )
+        np.testing.assert_array_equal(
+            weights,
+            np.array([[0, 1, 2, 0], [0, 0, 0, 0]], dtype=np.float32),  # weights of masked slots are 0
+        )
+        counts = np.bincount(local_ids, minlength=32)
+        self.assertEqual((counts[0], counts[31], int(counts.sum())), (1, 1, 2))  # local ids 0 and 31, once each
+        commit_helper = next(
+            node for node in tree.body
+            if isinstance(node, ast.FunctionDef) and node.name == "_mori_source_commit"
+        )
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            module = root / "python" / "mori" / "__init__.py"  # fake package file inside a fake checkout
+            module.parent.mkdir(parents=True)
+            module.touch()
+            git = root / ".git"
+            git.mkdir()
+            (git / "HEAD").write_text("a" * 40 + "\n")  # HEAD holds a bare 40-character value
+            commit_namespace = {  # globals the extracted helper is executed with
+                "Path": Path,
+                "re": re,
+                "mori": types.SimpleNamespace(__file__=str(module)),
+            }
+            exec(
+                compile(ast.Module(body=[commit_helper], type_ignores=[]), str(path), "exec"),
+                commit_namespace,
+            )
+            self.assertEqual(commit_namespace["_mori_source_commit"](), "a" * 40)
+            (git / "HEAD").write_text("ref: refs/heads/main\n")  # a symbolic ref must be rejected
+            with self.assertRaisesRegex(RuntimeError, "detached commit"):
+                commit_namespace["_mori_source_commit"]()
+
+        profile = contracts.project_resource_profile(
+            {
+                "block_num": 64,
+                "device_cus": 304,
+                "kernel_type": "AsyncLL",
+                "tuned_source": "upstream-asyncll-64x8-external-input",
+            }
+        )
+        self.assertIsNone(profile["comm_units_kind"])  # both unit fields are None for this input
+        self.assertIsNone(profile["configured_units"])
+
+    def test_squash_identity_rehashes_instead_of_trusting_a_sidecar(self) -> None:  # a stale .sha256 sidecar is ignored
+        with tempfile.TemporaryDirectory() as temporary:
+            image = Path(temporary) / "image.sqsh"
+            image.write_bytes(b"current squash bytes")
+            sidecar = Path(f"{image}.sha256")
+            sidecar.write_text("a" * 64)  # deliberately not the digest of the image bytes
+            os.utime(sidecar, (image.stat().st_mtime + 10, image.stat().st_mtime + 10))  # sidecar 10 s newer than image
+            result = subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; COLLECTIVEX_EXECUTION_ID="squash-hash-$$"; '
+                    'cx_export_squash_identity "$2"; cx_cleanup_private_logs 0; '
+                    'printf "%s" "$COLLECTIVEX_SQUASH_SHA256"',  # the only stdout the assertion below expects
+                    "_", str(ROOT / "runtime" / "common.sh"), str(image),  # "_" is $0; the paths become $1 and $2
+                ],
+                text=True,
+                capture_output=True,
+            )
+        self.assertEqual(result.returncode, 0, result.stderr)
+        self.assertEqual(result.stdout, hashlib.sha256(image.read_bytes()).hexdigest())  # digest of the real bytes
+
+    def test_salloc_job_id_parser_uses_the_portable_grant_message(self) -> None:  # cx_salloc_jobid against a stub salloc
+        with tempfile.TemporaryDirectory() as temporary:
+            directory = Path(temporary)
+            arguments = directory / "arguments"
+            salloc = directory / "salloc"
+            salloc.write_text(  # stub: records its arguments one per line, prints the grant line on stderr
+                "#!/usr/bin/env bash\n"
+                "printf '%s\\n' \"$@\" > \"$CX_TEST_SALLOC_ARGUMENTS\"\n"
+                "printf 'salloc: Granted job allocation 4242\\n' >&2\n"
+            )
+            salloc.chmod(0o700)
+            result = subprocess.run(
+                [
+                    "bash",
+                    "-c",
+                    f'source "{ROOT / "runtime" / "common.sh"}"; '
+                    'COLLECTIVEX_EXECUTION_ID="scheduler-parser-$$"; '
+                    'JOB_ID=""; cx_salloc_jobid --partition=compute; '
+                    'cx_cleanup_private_logs 0; printf "%s:%s" "$JOB_ID" "$CX_ALLOCATION_REQUESTED"',
+                ],
+                text=True,
+                capture_output=True,
+                env={
+                    **os.environ,
+                    "PATH": f"{directory}:{os.environ['PATH']}",  # stub directory is searched first
+                    "CX_TEST_SALLOC_ARGUMENTS": str(arguments),
+                },
+            )
+            self.assertEqual(result.returncode, 0, result.stderr)
+            self.assertEqual(result.stdout, "4242:1")  # JOB_ID taken from the grant line; request flag is 1
+            self.assertEqual(
+                arguments.read_text().splitlines(),
+                ["--partition=compute", "--no-shell"],  # the caller's flag plus --no-shell reached the stub
+            )
+
+    def test_allocation_cleanup_fails_closed_when_scheduler_queries_fail(self) -> None:  # cx_cancel_job + workflow pins
+        with tempfile.TemporaryDirectory() as temporary:
+            directory = Path(temporary)
+            for name, body in {  # stub commands placed first on PATH below
+                "scancel": "exit 0",
+                "squeue": "exit 2",  # the queue query always fails
+                "sleep": "exit 0",  # stub returns immediately
+            }.items():
+                command = directory / name
+                command.write_text(f"#!/usr/bin/env bash\n{body}\n")
+                command.chmod(0o700)
+            result = subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; cx_cancel_job 4242',
+                    "_", str(ROOT / "runtime" / "common.sh"),
+                ],
+                text=True,
+                capture_output=True,
+                env={**os.environ, "PATH": f"{directory}:{os.environ['PATH']}"},
+            )
+        self.assertNotEqual(result.returncode, 0)  # a failing query must not be reported as a clean cancel
+        self.assertIn("did not terminate", result.stderr)
+
+        workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text()
+        self.assertIn("cleanup-unsafe", workflow)
+        self.assertIn("cleanup-safe", workflow)
+        self.assertIn("Confirm allocation cleanup", workflow)
+        self.assertIn("Prepare pinned backend source archive", workflow)
+        self.assertIn("Install pinned backend source seed", workflow)
+        self.assertIn("CX_BACKEND_SOURCE_SEED_ROOT", workflow)
+        self.assertIn("steps.gen.outputs.source_backends", workflow)
+        self.assertIn("with tarfile.open", workflow)
+        artifact_validation = workflow[workflow.index("- name: Validate shard artifact safety"):]  # step to end of file
+        self.assertIn("steps.allocation_cleanup.outcome == 'success'", artifact_validation)
+        sweep_workflow = workflow[workflow.index(" sweep:"):]  # from the sweep job key to end of file
+        self.assertNotIn("GITHUB_WORKSPACE", sweep_workflow)
+        self.assertNotIn("RUNNER_WORKSPACE", sweep_workflow)
+        self.assertIn('CX_SOURCE_ROOT: /tmp/inferencex-collectivex-', sweep_workflow)
+        source_step = sweep_workflow[:sweep_workflow.index("- uses: actions/download-artifact")]  # text before download
+        self.assertNotIn("unsafe_guards=", source_step)
+        self.assertIn("cutoff = time.time() - 86400", source_step)
+        self.assertIn("stat.S_IMODE(metadata.st_mode) != 0o700", source_step)
+        self.assertIn('for marker_name in ("cleanup-safe", "cleanup-unsafe")', source_step)
+        self.assertIn("stat.S_IMODE(marker.st_mode) == 0o600", source_step)
+        self.assertIn("shutil.rmtree(entry.path)", source_step)
+        self.assertLess(  # "rev-parse HEAD" must appear before the prepared=true echo
+            source_step.index('rev-parse HEAD'),
+            source_step.index("echo 'prepared=true'"),
+        )
+        upload = workflow[workflow.index("- name: Stage shard artifact"):]
+        self.assertIn("id: stage_artifact", upload)
+        self.assertIn("id: upload_artifact", upload)
+        self.assertIn("steps.stage_artifact.outcome == 'success'", upload)
+        cleanup = workflow[workflow.index("- name: Cleanup isolated workspace"):]
+        for step in (  # the cleanup step must reference the outcome of each of these step ids
+            "sweep_shard", "allocation_cleanup", "artifact_safety",
+            "delivery_contracts", "stage_artifact", "upload_artifact",
+        ):
+            self.assertIn(f"steps.{step}.outcome", cleanup)
+        self.assertLess(  # the cleanup-safe check must appear before the rm -rf of the job root
+            cleanup.index('cleanup-safe" ]'),
+            cleanup.index('rm -rf -- "$CX_JOB_ROOT"'),
+        )
+
+    def test_runtime_identity_and_realized_placement_are_behavioral(self) -> None:  # calls the helpers, no text pins
+        self.assertFalse(capability.runtime_identity_issues(  # mi325x with an MI325X device: result is falsy
+            "mi325x", vendor="amd", arch="gfx942", machine="amd64",
+            device_name="AMD Instinct MI325X", device_count=8, world_size=8,
+        ))
+        self.assertTrue(capability.runtime_identity_issues(  # same device under "mi355x": result is truthy
+            "mi355x", vendor="amd", arch="gfx942", machine="amd64",
+            device_name="AMD Instinct MI325X", device_count=8, world_size=8,
+        ))
+        records = [("private-a", rank) for rank in range(4)] + [  # two host labels, second element 0..3 on each
+            ("private-b", rank) for rank in range(4)
+        ]
+        self.assertEqual(
+            run_ep._summarize_realized_placement(
+                records, expected_nodes=2, expected_gpus_per_node=4, expected_world_size=8
+            ),
+            {
+                "gpus_per_node": 4,
+                "nodes": 2,
+                "ranks_per_node": 4,
+                "unique_local_ranks": True,
+                "valid": True,
+            },
+        )
+        with self.assertRaises(ValueError):
+            run_ep._summarize_realized_placement(
+                records[:-1] + [("private-b", 2)],  # ("private-b", 2) now appears twice, ("private-b", 3) is gone
+                expected_nodes=2,
+                expected_gpus_per_node=4,
+                expected_world_size=8,
+            )
+
+    def test_collective_version_and_rccl_fingerprint_are_normalized(self) -> None:  # version formatting + fingerprint
+        self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4")  # integer code with a two-digit minor
+        self.assertEqual(ep_harness.format_collective_version(21805), "2.18.5")
+        self.assertEqual(ep_harness.format_collective_version((2, 21, 5)), "2.21.5")  # tuple input is accepted too
+
+        properties = types.SimpleNamespace(
+            multi_processor_count=304, total_memory=1024, warp_size=64
+        )
+        fake = types.SimpleNamespace(  # minimal stand-in passed where the torch module is expected
+            __version__="2.9.0",
+            version=types.SimpleNamespace(cuda=None, hip="7.2"),
+            cuda=types.SimpleNamespace(
+                get_device_properties=lambda _device: properties,
+                get_device_name=lambda _device: "AMD Instinct MI325X",
+                nccl=types.SimpleNamespace(version=lambda: 21805),
+            ),
+        )
+        with mock.patch.object(
+            run_ep, "_loaded_collective_version", return_value="2.18.5"  # skip the real loaded-library probe
+        ):
+            fingerprint = run_ep._runtime_fingerprint(
+                fake, "device", machine="amd64", vendor="amd", arch="gfx942"
+            )
+        self.assertEqual(fingerprint["collective_library"], {"kind": "rccl", "version": "2.18.5"})
+        self.assertEqual(fingerprint["accelerator_runtime"], {"kind": "hip", "version": "7.2"})
+
+        class FakeCollective:  # returned by the patched ctypes.CDLL below
+            @staticmethod
+            def ncclGetVersion(pointer) -> int:
+                pointer._obj.value = 23004  # NOTE(review): assumes the caller passes a ctypes.byref(...) object — confirm
+                return 0
+
+        maps = "0-1 r-xp 0 00:00 0 /runtime/libnccl.so.2\n"  # content served by the patched open()
+        with (
+            mock.patch("builtins.open", return_value=io.StringIO(maps)),
+            mock.patch.object(run_ep.os.path, "isfile", return_value=True),
+            mock.patch.object(
+                run_ep.os.path, "realpath", return_value="/runtime/libnccl.so.2"
+            ),
+            mock.patch.object(run_ep.ctypes, "CDLL", return_value=FakeCollective()),
+        ):
+            self.assertEqual(run_ep._loaded_collective_version(), "2.30.4")  # 23004 reported as 2.30.4
+
+        path = HERE / "ep_nccl.py"
+        tree = ast.parse(path.read_text(), str(path))  # parsed only; ep_nccl.py is not imported here
+        helper = next(
+            node for node in tree.body
+            if isinstance(node, ast.FunctionDef) and node.name == "_runtime_collective"
+        )
+        namespace = {"re": re}  # globals for the extracted helper; it also receives the helper itself
+        exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace)
+        args = types.SimpleNamespace(
+            runtime_fingerprint={
+                "collective_library": {"kind": "nccl", "version": "2.30.4"}
+            }
+        )
+        cuda = types.SimpleNamespace(version=types.SimpleNamespace(hip=None))
+        self.assertEqual(namespace["_runtime_collective"](args, cuda), ("nccl", "2.30.4"))
+        args.runtime_fingerprint["collective_library"]["version"] = None  # a missing version must raise
+        with self.assertRaisesRegex(RuntimeError, "runtime identity is unavailable"):
+            namespace["_runtime_collective"](args, cuda)
+        self.assertNotIn("torch.cuda.nccl.version", path.read_text())  # this call must not appear in ep_nccl.py
+
+    def test_workloads_bind_generator_activation_and_trace(self) -> None:  # workload id stability and sensitivity
+        args = ("uniform", 7168, 8, 256, 8, 64, 67)
+        first = workload.compute_workload_id(*args)
+        self.assertTrue(identity.is_typed_id(first, "workload"))
+        self.assertEqual(first, workload.compute_workload_id(*args))  # same inputs give the same id
+        self.assertNotEqual(first, workload.compute_workload_id(*args[:-1], 68))  # last positional changed 67 -> 68
+        self.assertNotEqual(
+            first,
+            workload.compute_workload_id(*args, trace_checksum="a" * 64),  # supplying trace_checksum changes the id
+        )
+        _, _, manifest = workload.build_workload(8, 2, 4, "uniform", 4, 67, 2)
+        member, checksums, _, _ = workload.canonical_member(  # NOTE(review): argument order differs from build_workload
+            "uniform", 8, 2, 4, 2, 2, 67
+        )
+        self.assertEqual(member, manifest["workload_id"])  # both entry points must agree on id and checksums
+        self.assertEqual(checksums, manifest["checksums"])
+
+    def test_canonical_members_are_bound_to_each_scheduled_row(self) -> None:  # valid proof passes, mutations fail
+        case = {
+            "routing": "uniform", "hidden": 8, "topk": 2, "experts": 4, "ep": 2,
+        }
+        eplb_record = {
+            "enabled": False, "mapping_hash": None, "num_physical_experts": 4,
+        }
+
+        def expected(  # first three of the five values from contracts._expected_canonical_trace
+            *, tokens: int = 1, hidden: int = 8
+        ) -> tuple[str, dict[str, str], str]:
+            member, checksums, row_hash, _, _ = contracts._expected_canonical_trace(
+                "uniform", hidden, 2, 4, 4, 2, tokens, 67, False, 2048
+            )
+            return member, checksums, row_hash
+
+        member, checksums, row_hash = expected()
+        rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}]
+        proof = {
+            "manifest_checksums": {member: checksums},
+            "members": [member],
+            "workload_id": identity.workload_id({
+                "members": [{"checksums": checksums, "workload_id": member}]
+            }),
+        }
+        contracts._validate_canonical_workload(proof, case, rows, eplb_record)  # baseline: must not raise
+
+        def replace_member(document: dict, replacement: tuple[str, dict[str, str], str]) -> None:  # mutates document
+            replacement_id, replacement_checksums, _ = replacement
+            document["members"] = [replacement_id]
+            document["manifest_checksums"] = {replacement_id: replacement_checksums}
+            document["workload_id"] = identity.workload_id({  # recomputed for the replacement member
+                "members": [{
+                    "checksums": replacement_checksums,
+                    "workload_id": replacement_id,
+                }]
+            })
+
+        mutations = {  # label -> callable(proof copy, rows copy); every one must be rejected
+            "wrong member token": lambda document, mutated_rows: replace_member(
+                document, expected(tokens=2)
+            ),
+            "wrong member dimensions": lambda document, mutated_rows: replace_member(
+                document, expected(hidden=16)
+            ),
+            "wrong member checksum": lambda document, mutated_rows: replace_member(
+                document,
+                (
+                    member,
+                    {**checksums, "topk_idx": "0" * 64},
+                    row_hash,
+                ),
+            ),
+            "row hash unrelated to member": lambda document, mutated_rows: mutated_rows[0][
+                "routing"
+            ].update({"hash": "f" * 64}),
+        }
+        for label, mutate in mutations.items():
+            with self.subTest(label=label), self.assertRaises(contracts.ContractError):
+                bad_proof, bad_rows = copy.deepcopy(proof), copy.deepcopy(rows)  # fresh copies per mutation
+                mutate(bad_proof, bad_rows)
+                contracts._validate_canonical_workload(
+                    bad_proof, case, bad_rows, eplb_record
+                )
+
+    def test_eplb_row_hash_is_bound_to_the_frozen_remap(self) -> None:  # EPLB-enabled proof and its mapping hash
+        case = {"routing": "zipf", "hidden": 8, "topk": 2, "experts": 4, "ep": 2}
+        physical = eplb.physical_count(4, 32, 2)
+        plan = contracts._expected_eplb_plan("zipf", 2, 4, physical, 2, 67, 2048)
+        eplb_record = {
+            "enabled": True,
+            "mapping_hash": eplb.mapping_hash(plan),
+            "num_physical_experts": physical,
+        }
+        member, checksums, row_hash, _, _ = contracts._expected_canonical_trace(
+            "zipf", 8, 2, 4, physical, 2, 1, 67, True, 2048
+        )
+        self.assertNotEqual(row_hash, checksums["trace"])  # in this EPLB-enabled case the two hashes must differ
+        workload_proof = {
+            "manifest_checksums": {member: checksums},
+            "members": [member],
+            "workload_id": identity.workload_id({
+                "members": [{"checksums": checksums, "workload_id": member}]
+            }),
+        }
+        rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}]
+        contracts._validate_canonical_workload(workload_proof, case, rows, eplb_record)  # baseline: must not raise
+        with self.assertRaisesRegex(contracts.ContractError, "EPLB mapping"):
+            contracts._validate_canonical_workload(
+                workload_proof, case, rows, {**eplb_record, "mapping_hash": "0" * 64}  # only the mapping hash differs
+            )
+
+    def test_oracle_pass_cannot_ignore_combined_value_failure(self) -> None:  # _validate_oracle accept/reject cases
+        oracle = {  # baseline document that validates as-is
+            "atol": ep_harness.ORACLE_ATOL,
+            "checks": {
+                "combine_values": True,
+                "counts": True,
+                "metadata": True,
+                "multiplicity": True,
+                "payload": True,
+                "source_set": True,
+                "weights": True,
+            },
+            "combine_weight_semantics": "unweighted-rank-sum",
+            "contract": ep_harness.ORACLE_CONTRACT,
+            "dispatch_sha256": "a" * 64,
+            "max_absolute_error": 0.0,
+            "max_elementwise_relative_error": 0.0,
+            "max_relative_error": 0.0,
+            "max_weight_error": 0.0,
+            "order_sha256": "b" * 64,
+            "ordering_contract": "stable-v1",
+            "passed": True,
+            "receive_count": 1,
+            "rtol": ep_harness.ORACLE_RTOL,
+        }
+        contracts._validate_oracle(oracle, "oracle")  # must not raise
+        weighted = copy.deepcopy(oracle)
+        weighted["combine_weight_semantics"] = "native-gate-weighted"  # a different semantics label is rejected
+        with self.assertRaisesRegex(contracts.ContractError, "differs from v1"):
+            contracts._validate_oracle(weighted, "oracle")
+        tampered = copy.deepcopy(oracle)
+        tampered["checks"]["combine_values"] = False  # "passed" stays True while one check is False
+        with self.assertRaises(contracts.ContractError):
+            contracts._validate_oracle(tampered, "oracle")
+
+ def test_oracle_stability_canonicalizes_native_receive_order(self) -> None:
+ source = (HERE / "ep_harness.py").read_text()
+ canonical = source[source.index("canonical_order = torch.argsort"):
+ source.index("problem.recv_tokens = receive_count")]
+ self.assertIn("canonical_sources", canonical)
+ self.assertIn("canonical_ids", canonical)
+ self.assertIn("canonical_weights", canonical)
+ self.assertNotIn("_tensor_sha256(source_ids", canonical)
+ mori = (HERE / "ep_mori.py").read_text()
+ self.assertIn(
+ 'self.kernel_generation = "async-ll" if self._async_ll else "intranode"',
+ mori,
+ )
+ backend = types.SimpleNamespace(name="mori", kernel_generation="async-ll")
+ self.assertEqual(ep_harness.kernel_generation(backend), "async-ll")
+
+    def test_terminal_fail_safe_fills_only_missing_shard_cases(self) -> None:  # cx_emit_setup_failures on a 2-case shard
+        matrix = sweep_matrix.resolve_matrix(backends="all", max_cases=128)
+        shard = next(item for item in matrix["include"] if item["n"] >= 2)  # first shard with at least two cases
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            matrix_path = root / "matrix.json"
+            control_path = root / "control.json"
+            out_dir = root / "results"
+            matrix_path.write_text(json.dumps(matrix))
+            control = sweep_matrix.extract_shard(
+                matrix_path, shard["id"], control_path,
+                sku=shard["sku"], backend=shard["backend"], nodes=shard["nodes"],
+            )
+            control["cases"] = control["cases"][:2]  # keep exactly two cases
+            control["n"] = 2
+            control_path.write_text(json.dumps(control))  # rewritten, so control_sha256 below hashes the trimmed file
+            first = {key: value for key, value in control["cases"][0].items() if key != "case_id"}
+            git_run = {
+                "artifact": "artifact", "job": "job", "ref": "collectivex",
+                "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1",
+                "run_id": "123", "source_sha": "a" * 40,
+            }
+            allocation = {
+                "artifact": "artifact", "execution_id": "execution", "job": "job",
+                "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", "run_id": "123",
+                "runner": shard["sku"], "source_sha": "a" * 40,
+            }
+            out_dir.mkdir()
+            existing = contracts.make_terminal_document(  # terminal record already present for case 0
+                allocation_factors=allocation, attempt_ordinal=1, case=first,
+                case_factors={"case": first, "profile": identity.V1_CASE_PROFILE, "sku": shard["sku"]},
+                control_sha256=hashlib.sha256(control_path.read_bytes()).hexdigest(),
+                failure_mode="setup", generated_at="2026-07-04T00:00:00Z", git_run=git_run,
+                reason="launcher-setup-failed", return_code=7, source="runtime-emitter",
+                status="failed",
+                expected_case_id=control["cases"][0]["case_id"],
+            )
+            (out_dir / "existing.json").write_text(json.dumps(existing))
+            (out_dir / "partial.json").write_text(json.dumps({  # case 1: raw record pointing at a samples file
+                "format": contracts.RAW_FORMAT,
+                "identity": {"case_id": control["cases"][1]["case_id"]},
+                "sample_artifact": {"path": "partial.samples.json"},
+            }))
+            (out_dir / "partial.samples.json").write_text("{broken")  # not valid JSON
+            environment = {
+                **os.environ,
+                "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                "CX_SHARD_FILE": str(control_path),
+                "CX_SHARD_SKU": shard["sku"],
+                "CX_RUNNER": shard["sku"],
+                "CX_BENCH": shard["backend"],
+                "CX_NODES": str(shard["nodes"]),
+                "COLLECTIVEX_EXECUTION_ID": "execution",
+                "COLLECTIVEX_ARTIFACT_NAME": "artifact",
+                "GITHUB_JOB": "job", "GITHUB_REF_NAME": "collectivex",
+                "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX",
+                "GITHUB_RUN_ATTEMPT": "1", "GITHUB_RUN_ID": "123",
+                "GITHUB_SHA": "a" * 40,
+            }
+            subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; cx_emit_setup_failures "$2" "$3" "$4" 7',
+                    "_", str(ROOT / "runtime" / "common.sh"), str(ROOT),  # $1 = common.sh, $2 = ROOT
+                    str(out_dir), shard["backend"],  # $3 = results directory, $4 = backend
+                ],
+                check=True,
+                env=environment,
+            )
+            attempts = [contracts.strict_load(path) for path in out_dir.glob("*.json")]
+            self.assertEqual(len(attempts), 2)  # one per case; *.quarantine files do not match *.json
+            self.assertEqual(
+                contracts.validate_attempt_paths([str(path) for path in out_dir.glob("*.json")]),
+                2,
+            )
+            delivery = [str(path) for path in out_dir.glob("*.json")]
+            self.assertEqual(contracts.validate_delivery(delivery, str(control_path)), 2)
+            with self.assertRaises(contracts.ContractError):
+                contracts.validate_delivery(delivery[:1], str(control_path))  # one of two files is not a delivery
+            self.assertEqual(
+                {attempt["identity"]["case_id"] for attempt in attempts},
+                {case["case_id"] for case in control["cases"]},
+            )
+            self.assertTrue((out_dir / "partial.json.quarantine").is_file())  # both partial files were renamed
+            self.assertTrue((out_dir / "partial.samples.json.quarantine").is_file())
+
+            preallocation = root / "preallocation"  # second scenario: a separate REPO_ROOT
+            preallocation_results = preallocation / "experimental" / "CollectiveX" / "results"
+            preallocation_results.mkdir(parents=True)
+            failed = subprocess.run(  # output is not captured; only the exit status is checked
+                [
+                    "bash", "-c",
+                    'source "$1"; REPO_ROOT="$2"; export REPO_ROOT; '
+                    'cx_install_launcher_fail_safe; cx_load_operator_config',
+                    "_", str(ROOT / "runtime" / "common.sh"), str(preallocation),
+                ],
+                env={**environment, "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1"},  # config path is still /dev/null
+            )
+            self.assertNotEqual(failed.returncode, 0)
+            preallocation_attempts = [
+                contracts.validate_terminal_document(contracts.strict_load(path))
+                for path in preallocation_results.glob("*.json")
+            ]
+            self.assertEqual(  # terminal documents exist for both control cases
+                {attempt["identity"]["case_id"] for attempt in preallocation_attempts},
+                {case["case_id"] for case in control["cases"]},
+            )
+
+    def test_runtime_identity_mismatch_is_failed_not_unsupported(self) -> None:  # make_terminal_from_environment outcomes
+        wrapper = next(  # first requested case whose disposition is "runnable"
+            item for item in sweep_matrix.resolve_matrix()["requested_cases"]
+            if item["disposition"] == "runnable"
+        )
+        case = wrapper["case"]
+        environment = {
+            "CX_RUNNER": wrapper["sku"], "CX_CASE_ID": case["case_id"],
+            "CX_SUITE": case["suite"], "CX_WORKLOAD_NAME": case["workload"],
+            "CX_REQUIRED_PUBLICATION": case["required_publication"],
+            "CX_ROUTING": case["routing"], "CX_EPLB": "1" if case["eplb"] else "",
+            "CX_EP": str(case["ep"]), "CX_NGPUS": str(case["ep"]),
+            "CX_HIDDEN": str(case["hidden"]), "CX_TOPK": str(case["topk"]),
+            "CX_EXPERTS": str(case["experts"]), "CX_NODES": str(case["nodes"]),
+            "CX_GPUS_PER_NODE": str(case["gpus_per_node"]),
+            "CX_SCALE_UP_DOMAIN": str(case["scale_up_domain"]),
+            "CX_TOKENS_LADDER": case["ladder"], "CX_CANONICAL": "1",
+            "CX_ITERS": "8", "CX_TRIALS": "64", "CX_WARMUP": "32",
+            "CX_SAMPLES_PER_POINT": "512", "GITHUB_RUN_ID": "123",
+            "GITHUB_RUN_ATTEMPT": "1", "GITHUB_REF_NAME": "collectivex",
+            "GITHUB_SHA": "a" * 40, "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX",
+            "GITHUB_JOB": "sweep", "COLLECTIVEX_ARTIFACT_NAME": "artifact",
+            "COLLECTIVEX_EXECUTION_ID": "execution",
+        }
+        with mock.patch.dict(os.environ, environment, clear=False):  # overlay; existing variables are kept
+            terminal = contracts.make_terminal_from_environment(
+                backend=case["backend"], phase=case["phase"], return_code=5  # no failure_mode is passed
+            )
+        self.assertEqual(terminal["identity"]["case_id"], case["case_id"])
+        self.assertEqual(
+            terminal["outcome"],
+            {
+                "failure_mode": "runtime-identity",
+                "reason": "runtime-identity-mismatch",
+                "return_code": 5,
+                "status": "failed",  # "failed", matching the test name
+            },
+        )
+        for mode, reason in contracts.RUNTIME_FAILURE_REASONS.items():
+            with self.subTest(mode=mode), mock.patch.dict(os.environ, environment, clear=False):
+                staged = contracts.make_terminal_from_environment(
+                    backend=case["backend"], phase=case["phase"], return_code=1,
+                    failure_mode=mode,
+                )
+                self.assertEqual(staged["outcome"]["reason"], reason)
+                mismatched = copy.deepcopy(staged)
+                mismatched["outcome"]["reason"] = "distributed-command-failed"  # swap in a different reason
+                if reason == "distributed-command-failed":
+                    mismatched["outcome"]["reason"] = "backend-setup-failed"  # keep the swap an actual change
+                with self.assertRaisesRegex(
+                    contracts.ContractError, "source and outcome are not registered"
+                ):
+                    contracts.validate_terminal_document(mismatched)
+        with mock.patch.dict(os.environ, environment, clear=False):
+            with self.assertRaisesRegex(
+                contracts.ContractError, "runtime failure mode is not registered"
+            ) as raised:
+                contracts.make_terminal_from_environment(
+                    backend=case["backend"], phase=case["phase"], return_code=1,
+                    failure_mode="raw-private-error",
+                )
+            self.assertNotIn("raw-private-error", str(raised.exception))  # the rejected value must not be echoed
+        with mock.patch.dict(os.environ, environment, clear=False):
+            generic = contracts.make_terminal_from_environment(
+                backend=case["backend"], phase=case["phase"], return_code=6,  # again without failure_mode
+            )
+        self.assertEqual(
+            generic["outcome"],
+            {
+                "failure_mode": "execution",
+                "reason": "distributed-command-failed",
+                "return_code": 6,
+                "status": "failed",
+            },
+        )
+
+    def test_launchers_use_private_logs_and_allowlisted_failure_stages(self) -> None:  # stage names + export allowlist
+        expected = {  # launcher file name -> stage names it must pass to cx_set_failure_stage
+            "launch_single-slurm.sh": {
+                "setup", "registry-verification", "container-import", "container-hash",
+                "repository-stage", "scheduler-allocation", "container-launch",
+                "artifact-collection",
+            },
+            "launch_gb-nv.sh": {
+                "setup", "registry-verification", "container-import", "container-hash",
+                "repository-stage", "scheduler-allocation", "container-launch", "backend-setup",
+                "execution", "artifact-collection",
+            },
+            "launch_mi-amds.sh": {
+                "setup", "repository-stage", "registry-verification", "scheduler-allocation",
+                "container-import", "container-hash", "container-launch", "artifact-collection",
+            },
+        }
+        for name, stages in expected.items():
+            launcher = (ROOT / "launchers" / name).read_text()
+            self.assertNotIn("--export=ALL", launcher)  # no launcher may contain this flag
+            self.assertIn("cx_container_exports", launcher)
+            self.assertIn("collect_rc=0", launcher)
+            for stage in stages:
+                with self.subTest(launcher=name, stage=stage):
+                    self.assertIn(f"cx_set_failure_stage {stage}", launcher)
+        amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+        self.assertIn("cx_ensure_squash_on_job", amd)
+        self.assertIn("cx_fail_stage container-hash", amd)
+        self.assertNotIn('cat "$import_log"', amd)  # the import log must not be cat'ed by the launcher
+        common = (ROOT / "runtime" / "common.sh").read_text()
+        self.assertIn('bash -s -- "$sq" "$lock" "$image"', common)
+        self.assertIn("> \"$log\" 2>&1 <<'BASH'", common)
+        self.assertIn("cx_fail_stage container-import", common)
+        runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+        export_start = common.index("\ncx_container_exports() {")
+        exports = common[export_start:common.index("\n}", export_start)]  # up to the first "\n}" after the start
+        export_names = {  # names from the comma-separated payloads of printf '%s' '<payload>'
+            name
+            for payload in re.findall(r"printf '%s' '([^']*)'", exports)
+            for name in payload.split(",") if name
+        }
+        for private_name in (  # none of these may be in the export list
+            "COLLECTIVEX_OPERATOR_CONFIG", "GITHUB_TOKEN", "GITHUB_WORKSPACE", "HOME",
+            "CX_PARTITION", "CX_ACCOUNT", "CX_SQUASH_DIR", "CX_STAGE_DIR",
+        ):
+            self.assertNotIn(private_name, export_names)
+        self.assertIn("CX_BACKEND_CACHE_ROOT", export_names)
+        self.assertIn("CX_BACKEND_CACHE_SENTINEL_SHA256", export_names)
+        self.assertNotIn("CX_PREPARED_BACKEND_CACHE", export_names)
+        self.assertIn("MORI_COMMIT", export_names)
+        self.assertIn("cx_write_runtime_stage backend-setup", runtime)
+        self.assertIn("cx_write_runtime_stage execution", runtime)
+        gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text()
+        self.assertIn("cx_private_log_path shard-summary", gb)
+        self.assertIn("cx_fail_stage execution", gb)
+
+ def test_case_failure_diagnostic_precedes_normal_srun_footer(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ log = Path(temporary) / "runtime.log"
+ log.write_text(
+ "WARN: deepep decode run failed rc=1 (CX_RUN_TIMEOUT=900s)\n"
+ "SHARD done: 6/6 case(s) failed\n"
+ "srun: error: task exited 1\n"
+ )
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 1)
+ self.assertIn("diagnostic=benchmark-case-failure", result.stderr)
+
+ def test_non_timeout_failure_warning_is_classified_as_case_failure(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ log = Path(temporary) / "runtime.log"
+ log.write_text("WARN: deepep decode run failed rc=1\nsrun: task exited 1\n")
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 1)
+ self.assertNotIn("diagnostic=network-or-timeout", result.stderr)
+ self.assertIn("diagnostic=benchmark-case-failure", result.stderr)
+
+    def test_private_runtime_failure_signatures_override_case_footer(self) -> None:
+        # Every known failure signature must be reported with its own
+        # diagnostic even though the generic "case(s) failed" footer follows it.
+        # Each signature runs in its own subTest so one failure neither hides
+        # the remaining signatures nor leaves the failing one unidentified.
+        signatures = {
+            "DeepEP V2 no-GIN run is outside one realized LSA domain":
+                "accelerator-topology",
+            "CUDA error: call requires newer driver": "accelerator-driver",
+            "NCCL failure in ncclCommWindowRegister": "nccl-device-api",
+            "NVCC compilation failed": "jit-toolchain",
+            "CUDA out of memory": "accelerator-memory",
+            "torch rendezvous timed out": "network-or-timeout",
+        }
+        script = 'source "$1"; cx_fail_stage execution "$2"'
+        with tempfile.TemporaryDirectory() as temporary:
+            log = Path(temporary) / "runtime.log"
+            argv = ["bash", "-c", script, "_"]
+            argv += [str(ROOT / "runtime" / "common.sh"), str(log)]
+            for signature, diagnostic in signatures.items():
+                with self.subTest(signature=signature):
+                    log.write_text(f"{signature}\nSHARD done: 6/6 case(s) failed\n")
+                    result = subprocess.run(argv, text=True, capture_output=True)
+                    self.assertEqual(result.returncode, 1)
+                    self.assertIn(f"diagnostic={diagnostic}", result.stderr)
+
+    def test_runtime_stage_marker_distinguishes_launch_from_execution(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:  # each written stage must be adopted into CX_FAILSAFE_MODE
+            mount = Path(temporary)  # scratch directory passed to the script as the mount root ($2)
+            root = mount / "experimental" / "CollectiveX"
+            root.mkdir(parents=True)  # directory the script cd's into before writing a stage
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                export COLLECTIVEX_EXECUTION_ID=test_1_shard CX_TS=test
+                cx_set_failure_stage container-launch
+                cx_prepare_runtime_marker "$2"
+                (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage backend-setup)
+                cx_adopt_runtime_stage "$2"
+                test "$CX_FAILSAFE_MODE" = backend-setup
+                (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage execution)
+                cx_adopt_runtime_stage "$2"
+                test "$CX_FAILSAFE_MODE" = execution
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+                 str(mount)],  # $1 = common.sh, $2 = mount root
+                check=True,  # a failed `test` aborts the script (set -e) and fails this test
+            )
+
+    def test_canonical_gha_environment_is_locked_but_manual_overrides_survive(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_lock_canonical_gha_env
+        command = r'''
+            set -euo pipefail
+            source "$1"
+            export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+            export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+            export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+            export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+            export CX_NODES=1 CX_GPUS_PER_NODE=8
+            export CX_IMAGE=untrusted CX_IMAGE_DIGEST=untrusted CX_NGPUS=99
+            export CX_NCCL_HOME=/untrusted CX_LOCK_DIR=/tmp CX_SQUASH_DIR=/shared/containers
+            export CX_STAGE_DIR=/private/stale-stage
+            export CX_MORI_KERNEL_TYPE=intranode MORI_ENABLE_SDMA=0
+            export NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 CX_DRYRUN=1
+            export CX_BACKEND_CACHE_ROOT=/untrusted CX_BACKEND_CACHE_SENTINEL_SHA256=bad
+            export CX_PREPARED_BACKEND_CACHE=/untrusted CX_BACKEND_SOURCE_ROOT=/untrusted
+            cx_lock_canonical_gha_env mi325x
+            test "$CX_IMAGE" = "$CX_IMAGE_AMD_MORI_MI325"
+            test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_AMD_MORI_MI325_DIGEST"
+            test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:1800
+            test "$CX_MORI_KERNEL_TYPE:$MORI_DISABLE_AUTO_XGMI:$MORI_ENABLE_SDMA" = asyncll:0:1
+            test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI325"
+            test "$MORI_APP_LOG_LEVEL:$MORI_SHMEM_LOG_LEVEL:$MORI_IO_LOG_LEVEL" = info:info:info
+            test "$CX_STAGE_DIR" = "$GITHUB_WORKSPACE"
+            test -z "${CX_NCCL_HOME+x}${CX_LOCK_DIR+x}${NCCL_MNNVL_ENABLE+x}${MC_FORCE_MNNVL+x}"
+            test -z "${CX_BACKEND_CACHE_ROOT+x}${CX_BACKEND_CACHE_SENTINEL_SHA256+x}"
+            test -z "${CX_PREPARED_BACKEND_CACHE+x}${CX_BACKEND_SOURCE_ROOT+x}"
+            test -z "${CX_DRYRUN+x}"
+
+            unset CX_STAGE_DIR
+            export CX_SHARD_SKU=gb300 CX_NODES=2 CX_GPUS_PER_NODE=4
+            export CX_IMAGE=untrusted CX_NGPUS=1 CX_MORI_KERNEL_TYPE=untrusted
+            export MORI_ENABLE_SDMA=0 CX_NCCL_HOME=/untrusted CX_MASTER_PORT=1
+            cx_lock_canonical_gha_env gb300
+            test "$CX_IMAGE" = "$CX_IMAGE_MULTIARCH"
+            test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST"
+            test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:900
+            test "$CX_NCCL_HOME:$CX_MASTER_PORT" = /usr:29551
+            test "$CX_STAGE_DIR" = /shared/containers/.stage
+            test -z "${CX_MORI_KERNEL_TYPE+x}${MORI_ENABLE_SDMA+x}"
+
+            export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$
+            export CX_SHARD_SKU=mi355x CX_NODES=1 CX_GPUS_PER_NODE=8
+            export CX_LOCK_DIR=/validated/amd-locks
+            cx_lock_canonical_gha_env mi355x
+            test "$CX_LOCK_DIR" = /validated/amd-locks
+            test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI355"
+
+            unset COLLECTIVEX_CANONICAL_GHA
+            unset COLLECTIVEX_OPERATOR_CONFIG_LOADED
+            CX_IMAGE=manual CX_IMAGE_DIGEST=manual CX_NGPUS=3
+            CX_MORI_KERNEL_TYPE=manual
+            cx_lock_canonical_gha_env mi355x
+            test "$CX_IMAGE:$CX_IMAGE_DIGEST:$CX_NGPUS:$CX_MORI_KERNEL_TYPE" = manual:manual:3:manual
+        '''
+        with tempfile.TemporaryDirectory(dir=Path.home()) as workspace:  # exported below as GITHUB_WORKSPACE
+            Path(workspace).chmod(0o720)  # owner rwx, group write-only, nothing for others
+            subprocess.run(
+                ["bash", "-c", command, "_", str(common)],
+                check=True,  # every shell `test` above must hold (the script runs under set -e)
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",  # NOTE(review): presumably neutralises operator config; confirm in common.sh
+                    "GITHUB_WORKSPACE": workspace,
+                },
+            )
+            self.assertEqual(list(Path(workspace).iterdir()), [])  # locking must leave the workspace empty
+
+    def test_canonical_amd_stage_rejects_a_world_writable_workspace(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_lock_canonical_gha_env
+        command = r'''
+            source "$1"
+            export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+            export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+            export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+            export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+            export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers
+            cx_lock_canonical_gha_env mi325x
+        '''
+        with tempfile.TemporaryDirectory(dir=Path.home()) as workspace:  # exported below as GITHUB_WORKSPACE
+            Path(workspace).chmod(0o702)  # the "other" class gets write permission: the unsafe condition
+            result = subprocess.run(
+                ["bash", "-c", command, "_", str(common)],
+                text=True,
+                capture_output=True,  # stderr is inspected below
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                    "GITHUB_WORKSPACE": workspace,
+                },
+            )
+        self.assertNotEqual(result.returncode, 0)  # locking must fail
+        self.assertIn("canonical AMD staging workspace is unsafe", result.stderr)
+        self.assertNotIn(workspace, result.stderr)  # the error must not echo the workspace path
+
+    def test_canonical_amd_stage_rejects_a_symlinked_workspace(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_lock_canonical_gha_env
+        command = r'''
+            source "$1"
+            export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+            export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+            export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+            export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+            export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers
+            cx_lock_canonical_gha_env mi325x
+        '''
+        with tempfile.TemporaryDirectory(dir=Path.home()) as temporary:
+            root = Path(temporary)
+            real = root / "real"  # genuine directory the symlink points at
+            real.mkdir()
+            link = root / "workspace"
+            link.symlink_to(real, target_is_directory=True)  # GITHUB_WORKSPACE will be this symlink
+            result = subprocess.run(
+                ["bash", "-c", command, "_", str(common)],
+                text=True,
+                capture_output=True,  # stderr is inspected below
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                    "GITHUB_WORKSPACE": str(link),
+                },
+            )
+        self.assertNotEqual(result.returncode, 0)  # locking must fail
+        self.assertIn("canonical AMD staging workspace is unsafe", result.stderr)
+        self.assertNotIn(str(root), result.stderr)  # neither the link nor its target path may be echoed
+
+    def test_image_selection_and_registry_verification_are_fail_closed(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script under test; curl is replaced by shell functions below
+        command = r'''
+            source "$1"
+            test "$(cx_default_image mi325x)" = "$CX_IMAGE_AMD_MORI_MI325" || exit 1
+            test "$(cx_default_image mi355x)" = "$CX_IMAGE_AMD_MORI" || exit 1
+            pinned="sha256:$(printf 'a%.0s' {1..64})"
+            curl() {
+                case "$*" in
+                    *auth.docker.io*) printf '{"token":"test"}' ;;
+                    *) printf 'Docker-Content-Digest: %s\r\n' "$pinned" ;;
+                esac
+            }
+            test "$(cx_resolve_registry_digest ubuntu:latest)" = "$pinned" || exit 1
+            test "$(cx_resolve_registry_digest docker.io/library/ubuntu:latest)" = "$pinned" || exit 1
+            ! (cx_resolve_registry_digest "ubuntu@$pinned") || exit 1
+            ! (cx_resolve_registry_digest ghcr.io/example/image:tag) || exit 1
+            ! (cx_resolve_registry_digest 'ubuntu@sha256:bad') || exit 1
+            curl() {
+                case "$*" in *auth.docker.io*) printf '{"token":"test"}';; esac
+            }
+            ! (cx_resolve_registry_digest ubuntu:latest) || exit 1
+            cx_resolve_registry_digest() { printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST"; }
+            cx_verify_registry_image "$CX_IMAGE_MULTIARCH" || exit 1
+            test "$COLLECTIVEX_IMAGE_DIGEST_VERIFIED" = 1 || exit 1
+            test "$COLLECTIVEX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST" || exit 1
+            cx_reverify_registry_image "$CX_IMAGE_MULTIARCH" || exit 1
+            cx_resolve_registry_digest() { printf 'sha256:%064d' 0; }
+            ! (cx_reverify_registry_image "$CX_IMAGE_MULTIARCH") || exit 1
+            ! (cx_verify_registry_image "$CX_IMAGE_MULTIARCH")
+        '''
+        subprocess.run(  # "|| exit 1" enforces each check: this script sets no errexit, and errexit would skip "!" anyway
+            ["bash", "-c", command, "_", str(common)],
+            check=True,  # the script's exit status is the test verdict
+            env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+        )
+
+    def test_canonical_gha_requires_compute_visible_staging(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:  # canonical staging with no stage dir must fail; manual must work
+            repo = Path(temporary) / "repo"
+            squash = Path(temporary) / "squash"  # exported as CX_SQUASH_DIR
+            source = repo / "experimental" / "CollectiveX"
+            source.mkdir(parents=True)
+            squash.mkdir()
+            (source / "public.py").write_text("public\n")  # expected to be staged
+            (source / "private-infra.md").write_text("private\n")  # expected to be left out of the stage
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                unset CX_SHARD_FILE CX_STAGE_DIR
+                ! (COLLECTIVEX_CANONICAL_GHA=1; cx_stage_repo "$2" "") || exit 1
+                staged="$(COLLECTIVEX_CANONICAL_GHA=0; cx_stage_repo "$2" "")"
+                test "$staged" != "$2"
+                test -f "$staged/experimental/CollectiveX/public.py"
+                test ! -e "$staged/experimental/CollectiveX/private-infra.md"
+                cx_cleanup_stage "$staged" "$2"
+                test ! -e "$staged"
+            '''
+            subprocess.run(  # the negated check carries "|| exit 1" because errexit never fires on "!" pipelines
+                ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+                 str(repo)],  # $1 = common.sh, $2 = repository root
+                check=True,
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                    "CX_SQUASH_DIR": str(squash),
+                },
+            )
+            self.assertEqual(list(squash.iterdir()), [])  # nothing may remain in the squash directory
+
+    def test_manual_stage_does_not_write_to_checkout_parent(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:  # staging must land under CX_SQUASH_DIR, not beside the checkout
+            parent = Path(temporary) / "readonly-parent"
+            repo = parent / "repo"
+            squash = parent / "squash"
+            source = repo / "experimental" / "CollectiveX"
+            source.mkdir(parents=True)
+            squash.mkdir(mode=0o700)
+            (source / "public.py").write_text("public\n")
+            original_mode = parent.stat().st_mode & 0o777  # remembered so the finally clause can restore it
+            parent.chmod(0o555)  # read/execute only: new entries cannot be created in the parent
+            try:
+                command = r'''
+                    set -euo pipefail
+                    source "$1"
+                    unset CX_STAGE_DIR
+                    staged="$(cx_stage_repo "$2" "")"
+                    case "$staged" in "$3"/.collectivex-stage-*) ;; *) exit 1 ;; esac
+                    test -f "$staged/experimental/CollectiveX/public.py"
+                    test ! -e "$4/.collectivex-stage"
+                    cx_cleanup_stage "$staged" "$2"
+                    test ! -e "$staged"
+                '''
+                subprocess.run(
+                    [
+                        "bash", "-c", command, "_",
+                        str(ROOT / "runtime" / "common.sh"), str(repo),
+                        str(squash), str(parent),  # $3 = squash dir, $4 = checkout parent
+                    ],
+                    check=True,
+                    env={
+                        **os.environ,
+                        "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                        "CX_SQUASH_DIR": str(squash),
+                    },
+                )
+            finally:
+                parent.chmod(original_mode)  # restore the mode so the temporary tree can be removed
+            self.assertEqual(
+                sorted(path.name for path in parent.iterdir()),
+                ["repo", "squash"],  # no extra entry may have appeared in the parent
+            )
+            self.assertEqual(list(squash.iterdir()), [])  # the stage must be gone after cleanup
+
+    def test_stage_refuses_to_reuse_an_execution_child(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:  # staging into an existing child must fail and leave it intact
+            root = Path(temporary)
+            repo = root / "repo"
+            source = repo / "experimental" / "CollectiveX"
+            source.mkdir(parents=True)
+            (source / "public.py").write_text("public\n")
+            base = root / "stage"  # exported as CX_STAGE_DIR and passed as $3
+            child = base / "job_collision"  # NOTE(review): presumably the child name derived from execution id "collision"; confirm
+            child.mkdir(parents=True, mode=0o700)
+            sentinel = child / "keep"
+            sentinel.write_text("keep")  # must survive the refused staging attempt
+            command = r'''
+                source "$1"
+                ! (cx_stage_repo "$2" "$3")
+            '''
+            subprocess.run(
+                [
+                    "bash", "-c", command, "_",
+                    str(ROOT / "runtime" / "common.sh"), str(repo), str(base),
+                ],
+                check=True,  # exit status 0 means cx_stage_repo failed inside the subshell
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                    "COLLECTIVEX_CANONICAL_GHA": "1",
+                    "COLLECTIVEX_EXECUTION_ID": "collision",
+                    "CX_STAGE_DIR": str(base),
+                },
+            )
+            self.assertEqual(sentinel.read_text(), "keep")  # existing content was neither removed nor rewritten
+
+    def test_stage_removes_its_execution_child_when_rsync_fails(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:  # a failed copy must not leave a partial stage behind
+            root = Path(temporary)
+            repo = root / "repo"
+            source = repo / "experimental" / "CollectiveX"
+            source.mkdir(parents=True)
+            (source / "public.py").write_text("public\n")
+            base = root / "stage"  # exported as CX_STAGE_DIR and passed as $3
+            sentinel = root / "rsync-called"  # created by the rsync stub to prove it was invoked
+            command = r'''
+                source "$1"
+                rsync() { : > "$RSYNC_CALLED"; return 1; }
+                ! cx_stage_repo "$2" "$3"
+            '''
+            subprocess.run(
+                [
+                    "bash", "-c", command, "_",
+                    str(ROOT / "runtime" / "common.sh"), str(repo), str(base),
+                ],
+                check=True,  # exit status 0 means cx_stage_repo reported the failure
+                env={
+                    **os.environ,
+                    "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+                    "COLLECTIVEX_CANONICAL_GHA": "1",
+                    "CX_STAGE_DIR": str(base),
+                    "RSYNC_CALLED": str(sentinel),
+                },
+            )
+            self.assertTrue(sentinel.is_file())  # the stubbed rsync really ran
+            self.assertEqual(list(base.iterdir()), [])  # the base exists but holds no leftover child
+
+    def test_backend_cache_reuses_v3_and_falls_back_once_without_repair(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides the backend-cache helpers
+        with tempfile.TemporaryDirectory() as temporary:
+            parent = Path(temporary) / "stage"  # directory in which the cache directory is expected to appear
+            parent.mkdir(mode=0o700)
+            concurrent = Path(temporary) / "concurrent"  # one result file per background worker
+            concurrent.mkdir(mode=0o700)
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                for worker in 1 2 3; do
+                    (
+                        cx_prepare_backend_cache "$2"
+                        printf '%s %s\n' "$CX_BACKEND_CACHE_SENTINEL_SHA256" \
+                            "$CX_PREPARED_BACKEND_CACHE" > "$3/$worker"
+                    ) &
+                done
+                wait
+                cmp "$3/1" "$3/2"
+                cmp "$3/1" "$3/3"
+                cx_prepare_backend_cache "$2"
+                first="$CX_PREPARED_BACKEND_CACHE"
+                first_digest="$CX_BACKEND_CACHE_SENTINEL_SHA256"
+                chmod 2700 "$first"
+                cx_prepare_backend_cache "$2"
+                second="$CX_PREPARED_BACKEND_CACHE"
+                test "$first" = "$second"
+                test "$first_digest" = "$CX_BACKEND_CACHE_SENTINEL_SHA256"
+                test "$first" = "$(cd "$2" && pwd -P)/.collectivex-backend-cache-v3-$(id -u)"
+                export CX_BACKEND_CACHE_ROOT="$first"
+                cx_verify_backend_cache_mount
+                export CX_BACKEND_CACHE_SENTINEL_SHA256="$(printf '0%.0s' {1..64})"
+                ! cx_verify_backend_cache_mount
+            '''
+            subprocess.run(
+                [
+                    "bash", "-c", command, "_", str(common), str(parent),
+                    str(concurrent),
+                ],
+                check=True,  # three concurrent workers must agree and the repeated call must reuse the cache
+            )
+            cache = parent / f".collectivex-backend-cache-v3-{os.getuid()}"
+            self.assertTrue(cache.is_dir())
+            self.assertEqual(cache.stat().st_mode & 0o777, 0o700)  # only the low nine permission bits are compared
+            self.assertEqual(
+                list(cache.glob(".collectivex-mount-sentinel-v1.tmp.*")), []  # no temporary sentinel file may remain
+            )
+            alias = Path(temporary) / "stage-alias"
+            alias.symlink_to(parent, target_is_directory=True)  # same parent reached through a symlink
+            canonical = subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; cx_prepare_backend_cache "$2"; '
+                    'printf "%s\\n%s\\n" "$CX_PREPARED_BACKEND_CACHE" '
+                    '"$CX_BACKEND_CACHE_SENTINEL_SHA256"',
+                    "_", str(common), str(alias),
+                ],
+                text=True,
+                capture_output=True,
+                check=True,
+            )
+            cache_path, digest = canonical.stdout.splitlines()
+            self.assertEqual(cache_path, str(cache.resolve()))  # the reported path is the resolved one, not the alias
+            self.assertRegex(digest, r"^[0-9a-f]{64}$")
+            saved = parent / "saved-cache"
+            cache.rename(saved)  # set the real cache aside and plant a look-alike in its place
+            cache.mkdir(mode=0o700)
+            replacement = cache / ".collectivex-mount-sentinel-v1"
+            replacement.write_bytes(b"replacement".ljust(32, b"!"))  # 32-byte sentinel with different content
+            replacement.chmod(0o600)
+            replaced = subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; export CX_BACKEND_CACHE_ROOT="$2" '
+                    'CX_BACKEND_CACHE_SENTINEL_SHA256="$3"; '
+                    'cx_verify_backend_cache_mount',
+                    "_", str(common), str(cache), digest,
+                ]
+            )
+            self.assertNotEqual(replaced.returncode, 0)  # the old digest must not verify against the replaced sentinel
+            replacement.unlink()
+            cache.rmdir()
+            saved.rename(cache)  # restore the real cache ...
+            (cache / ".collectivex-mount-sentinel-v1").unlink()
+            cache.rmdir()  # ... then remove it so a symlink can take over the v3 path
+            target = Path(temporary) / "target"
+            target.mkdir(mode=0o700)
+            cache.symlink_to(target, target_is_directory=True)
+            fallback = subprocess.run(
+                [
+                    "bash", "-c",
+                    'source "$1"; cx_prepare_backend_cache "$2"; '
+                    'printf "%s\\n" "$CX_PREPARED_BACKEND_CACHE"',
+                    "_", str(common), str(parent),
+                ],
+                text=True,
+                capture_output=True,
+                check=True,
+            )
+            v4 = parent / f".collectivex-backend-cache-v4-{os.getuid()}"
+            self.assertEqual(fallback.stdout.strip(), str(v4.resolve()))  # with v3 a symlink, the v4 path is reported
+            self.assertTrue(cache.is_symlink())  # the symlinked v3 entry was left untouched
+            self.assertTrue(v4.is_dir())
+            (v4 / ".collectivex-mount-sentinel-v1").unlink()
+            v4.rmdir()
+            v4.symlink_to(target, target_is_directory=True)  # now both v3 and v4 are symlinks
+            result = subprocess.run(
+                [
+                    "bash", "-c", 'source "$1"; cx_prepare_backend_cache "$2"',
+                    "_", str(common), str(parent),
+                ],
+                text=True,
+                capture_output=True,
+            )
+            self.assertNotEqual(result.returncode, 0)  # no further fallback: preparation must fail
+            self.assertNotIn(str(parent), result.stderr)  # the error must not echo the parent path
+            self.assertTrue(cache.is_symlink())
+            self.assertTrue(v4.is_symlink())
+
+        source = common.read_text().split("cx_prepare_backend_cache() {", 1)[1]  # text after the function header
+        program = source.split("<<'PY'\n", 1)[1].split("\nPY\n", 1)[0]  # body of the first <<'PY' heredoc after it
+        with tempfile.TemporaryDirectory() as temporary:
+            parent = Path(temporary) / "stage"
+            parent.mkdir(mode=0o700)
+            fake_os = types.ModuleType("os")
+            fake_os.__dict__.update(os.__dict__)  # copy of the os module ...
+            fake_os.fsync = mock.Mock(side_effect=OSError("forced fsync failure"))  # ... whose fsync always raises
+            with (
+                mock.patch.dict(sys.modules, {"os": fake_os}),  # `import os` inside the program resolves to the fake
+                mock.patch.object(sys, "argv", ["-", str(parent)]),
+                mock.patch.object(sys, "stdout", io.StringIO()),
+                self.assertRaises(SystemExit) as failure,
+            ):
+                exec(compile(program, "", "exec"), {})  # run the extracted program in-process with fresh globals
+            self.assertEqual(failure.exception.code, 1)
+            self.assertEqual(
+                list(parent.rglob(".collectivex-mount-sentinel-v1.tmp.*")), []  # the failed write left no temp file
+            )
+
+    def test_nvidia_namespace_package_roots_come_from_distribution_files(self) -> None:
+        runtime = ROOT / "runtime" / "run_in_container.sh"  # only cx_nvidia_package_root is extracted from it
+        with tempfile.TemporaryDirectory() as temporary:
+            site = Path(temporary) / "site"  # fake site-packages directory, exported via PYTHONPATH
+            package = site / "nvidia" / "nccl"
+            (package / "include").mkdir(parents=True)
+            (package / "lib").mkdir()
+            (package / "include" / "nccl.h").write_text("header\n")
+            (package / "lib" / "libnccl.so.2").write_text("library\n")
+            info = site / "nvidia_nccl_cu13-2.30.4.dist-info"  # distribution metadata for nvidia-nccl-cu13
+            info.mkdir()
+            (info / "METADATA").write_text(
+                "Metadata-Version: 2.1\nName: nvidia-nccl-cu13\nVersion: 2.30.4\n"
+            )
+            (info / "RECORD").write_text(
+                "nvidia/nccl/include/nccl.h,,\n"
+                "nvidia/nccl/lib/libnccl.so.2,,\n"
+                "nvidia_nccl_cu13-2.30.4.dist-info/METADATA,,\n"
+                "nvidia_nccl_cu13-2.30.4.dist-info/RECORD,,\n"
+            )
+            command = r'''
+                set -euo pipefail
+                eval "$(sed -n '/^cx_nvidia_package_root()/,/^}/p' "$1")"
+                root="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)"
+                test "$root" = "$2/nvidia/nccl"
+                ! cx_nvidia_package_root nvidia-nccl-cu13 nvshmem
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(runtime), str(site.resolve())],  # $2 = resolved site directory
+                check=True,  # the final negated lookup (nvshmem is not in RECORD) sets the exit status
+                env={**os.environ, "PYTHONPATH": str(site)},
+            )
+
+    def test_cuda_cccl_exports_the_resolved_jit_toolchain_root(self) -> None:
+        runtime = ROOT / "runtime" / "run_in_container.sh"  # only cx_prepare_cuda_cccl is extracted from it
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            toolkit = root / "cuda-13.0"  # fake toolkit tree
+            (toolkit / "bin").mkdir(parents=True)
+            (toolkit / "include").mkdir()
+            (toolkit / "lib64").mkdir()
+            cccl = toolkit / "targets" / "x86_64-linux" / "include" / "cccl"
+            cccl.mkdir(parents=True)
+            nvcc = toolkit / "bin" / "nvcc"
+            nvcc.write_text("#!/bin/sh\nexit 0\n")  # stub nvcc that always succeeds
+            nvcc.chmod(0o755)
+            alias = root / "cuda"
+            alias.symlink_to(toolkit, target_is_directory=True)  # PATH reaches nvcc through this symlink
+            command = r'''
+                set -euo pipefail
+                eval "$(sed -n '/^cx_prepare_cuda_cccl()/,/^}/p' "$1")"
+                cx_prepare_cuda_cccl
+                test "$CUDA_HOME" = "$2"
+                test "$CX_CUDA_CCCL" = "$2/targets/x86_64-linux/include/cccl"
+                test "$CPATH" = "$2/targets/x86_64-linux/include/cccl:"
+                test "$NVCC_PREPEND_FLAGS" = "-I$2/targets/x86_64-linux/include/cccl "
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(runtime), str(toolkit.resolve())],  # $2 = resolved toolkit path
+                check=True,
+                env={
+                    **os.environ,
+                    "PATH": f"{alias / 'bin'}:{os.environ['PATH']}",  # the symlinked bin directory comes first
+                    "CPATH": "",  # start empty so the expected values above are exact
+                    "NVCC_PREPEND_FLAGS": "",
+                },
+            )
+
+    def test_deepep_v2_toolchain_rejects_overlay_lock_failure(self) -> None:
+        runtime = ROOT / "runtime" / "run_in_container.sh"  # only cx_prepare_deepep_toolchain is extracted from it
+        with tempfile.TemporaryDirectory() as temporary:  # passed as $2 and returned by the cx_deepep_v2_root stub
+            command = r'''
+                set -euo pipefail
+                eval "$(sed -n '/^cx_prepare_deepep_toolchain()/,/^}/p' "$1")"
+                cache_root="$2"
+                cx_nvidia_package_root() { printf '%s' /unused; }
+                cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+                cx_log() { :; }
+                flock() { return 1; }
+                ! cx_prepare_deepep_toolchain
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(runtime), temporary],
+                check=True,  # exit status 0 means the function failed while the flock stub returned 1
+            )
+
+    def test_pinned_source_fetch_retries_transient_failures(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # cx_git, cx_git_in_tree and cx_fetch_revision are extracted from it
+        with tempfile.TemporaryDirectory() as temporary:  # passed as $3, the checkout directory
+            command = r'''
+                set -euo pipefail
+                eval "$(sed -n '/^cx_git()/,/^}/p' "$1")"
+                eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")"
+                eval "$(sed -n '/^cx_fetch_revision()/,/^}/p' "$1")"
+                attempts=0
+                expected_directory="$(cd -P -- "$3" && pwd -P)"
+                sleep() { :; }
+                git() {
+                    local argument has_directory=0 has_trust=0
+                    if [ "$1" = '-c' ] && [ "$3" = init ]; then
+                        mkdir -p "${@: -1}"
+                        return 0
+                    fi
+                    for argument in "$@"; do
+                        [ "$argument" != '-C' ] || has_directory=1
+                        [ "$argument" != "safe.directory=$expected_directory" ] || has_trust=1
+                        [ "$argument" != 'safe.directory=*' ] || return 1
+                    done
+                    [ "$has_directory" = 0 ] || [ "$has_trust" = 1 ] || return 1
+                    case " $* " in
+                        *' fetch '*)
+                            attempts=$((attempts + 1))
+                            [ "$attempts" = 3 ]
+                            ;;
+                        *' rev-parse HEAD '*) printf '%s\n' "$revision" ;;
+                        *) return 0 ;;
+                    esac
+                }
+                cx_fetch_revision https://example.invalid/repo "$2" "$3"
+                test "$attempts" = 3
+            '''
+            revision = "a" * 40  # 40-character revision passed to the script as $2
+            subprocess.run(
+                ["bash", "-c", command, "_", str(common), revision, temporary],
+                check=True,  # the git stub fails `fetch` twice; the fetch must succeed on the third attempt
+            )
+
+    def test_git_tree_trust_is_exact_and_command_scoped(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # cx_git and cx_git_in_tree are extracted from it
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            repository = root / "repo"  # the only directory for which the call is expected to succeed
+            repository.mkdir()
+            alias = root / "alias"
+            alias.symlink_to(repository, target_is_directory=True)  # symlink to the repository: must be refused
+            wildcard = root / "*"  # directory literally named "*": must be refused
+            wildcard.mkdir()
+            arguments = root / "arguments"  # the git stub records its argv here, one argument per line
+            command = r'''
+                set -euo pipefail
+                eval "$(sed -n '/^cx_git()/,/^}/p' "$1")"
+                eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")"
+                arguments="$4"
+                git() { printf '%s\n' "$@" > "$arguments"; }
+                cx_git_in_tree "$2" status --porcelain
+                ! cx_git_in_tree relative status || exit 1
+                ! cx_git_in_tree "$3" status || exit 1
+                ! cx_git_in_tree "$5" status
+            '''
+            subprocess.run(  # mid-script negations carry "|| exit 1" because errexit never fires on "!" pipelines
+                [
+                    "bash",
+                    "-c",
+                    command,
+                    "_",
+                    str(common),
+                    str(repository),
+                    str(alias),
+                    str(arguments),
+                    str(wildcard),
+                ],
+                check=True,
+            )
+            self.assertEqual(  # the recorded argv is still the one from the first, accepted call
+                arguments.read_text().splitlines(),
+                [
+                    "-c",
+                    "credential.helper=",
+                    "-c",
+                    f"safe.directory={repository.resolve()}",
+                    "-C",
+                    str(repository.resolve()),
+                    "status",
+                    "--porcelain",
+                ],
+            )
+            self.assertNotIn("safe.directory=*", arguments.read_text())  # trust is never granted by wildcard
+
+    def test_runtime_materializes_the_verified_host_source_without_network(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_materialize_backend_source
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            seed = root / "seed"  # returned by the cx_backend_source_path stub
+            seed.mkdir()
+            (seed / "pinned").write_text("source\n")  # marker file the validity stub looks for
+            destination = root / "build"  # where the source must be materialized ($4)
+            fetched = root / "network-fetch"  # created only if the cx_fetch_revision stub is called
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                export CX_BACKEND_SOURCE_ROOT="$2/source"
+                SEED="$3" FETCHED="$5"
+                copy_mode=
+                cx_backend_source_path() { printf '%s' "$SEED"; }
+                cx_backend_source_is_valid() { test -f "$2/pinned"; }
+                cx_fetch_revision() { : > "$FETCHED"; return 1; }
+                cp() {
+                    test "$1" = -R
+                    copy_mode=recursive
+                    command cp "$@"
+                }
+                cx_materialize_backend_source deepep-hybrid "$4"
+                test -f "$4/pinned"
+                test "$copy_mode" = recursive
+                python3 - "$4" <<'PY'
+import os
+import stat
+import sys
+assert stat.S_IMODE(os.stat(sys.argv[1]).st_mode) == 0o700
+PY
+                test ! -e "$FETCHED"
+            '''
+            subprocess.run(
+                [
+                    "bash", "-c", command, "_", str(common), str(root),
+                    str(seed), str(destination), str(fetched),
+                ],
+                check=True,  # the copy must use `cp -R`, end with mode 0700 and never call the fetch stub
+            )
+
+    def test_backend_source_validation_rejects_status_errors_and_ignored_files(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_backend_source_is_valid
+        with tempfile.TemporaryDirectory() as temporary:  # passed as $2, the source tree to validate
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                cx_backend_source_pin() { printf '%s|%s|' revision tree; }
+                git() {
+                    case " $* " in
+                        *' rev-parse HEAD '*) printf '%s\n' revision ;;
+                        *' rev-parse HEAD^{tree} '*) printf '%s\n' tree ;;
+                        *' status --porcelain '*) [ "$mode" != status-error ] ;;
+                        *' ls-files --others --ignored '*)
+                            [ "$mode" != ignored ] || printf '%s\n' ignored.bin
+                            ;;
+                        *) return 1 ;;
+                    esac
+                }
+                mode=status-error
+                ! cx_backend_source_is_valid backend "$2" || exit 1
+                mode=ignored
+                ! cx_backend_source_is_valid backend "$2" || exit 1
+                mode=clean
+                cx_backend_source_is_valid backend "$2"
+            '''
+            subprocess.run(  # negated checks carry "|| exit 1" because errexit never fires on "!" pipelines
+                ["bash", "-c", command, "_", str(common), temporary],
+                check=True,  # status error and ignored file must be rejected; the clean tree must be accepted
+            )
+
+    def test_backend_source_root_normalizes_inherited_special_mode(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_prepare_backend_source
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)  # passed as $2, the mount root
+            source_root = root / "experimental" / "CollectiveX" / ".cx_sources"
+            source = source_root / "backend-revision"  # passed as $3; its parent is the mocked source root
+            source.mkdir(parents=True)
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                export COLLECTIVEX_EXECUTION_ID="source-mode-$$"
+                trap 'cx_cleanup_private_logs 0' EXIT
+                expected_mount="$2"
+                expected_source="$3"
+                expected_root="${expected_source%/*}"
+                observed_mode=2700
+                mock_stage_owner=4200
+                mock_root_owner=4200
+                chmod_calls=0
+                chmod() {
+                    test "$1" = 700 && test "$2" = "$expected_root"
+                    chmod_calls=$((chmod_calls + 1))
+                    [ "$chmod_calls" = 2 ] || return 1
+                    observed_mode=700
+                }
+                stat() {
+                    case "$2" in
+                        %u)
+                            case "$3" in
+                                "$expected_mount") printf '%s\n' "$mock_stage_owner" ;;
+                                "$expected_root") printf '%s\n' "$mock_root_owner" ;;
+                                *) return 1 ;;
+                            esac
+                            ;;
+                        %a)
+                            case "$3" in
+                                "$expected_mount") printf '2700\n' ;;
+                                "$expected_root") printf '%s\n' "$observed_mode" ;;
+                                *) return 1 ;;
+                            esac
+                            ;;
+                        *) return 1 ;;
+                    esac
+                }
+                cx_backend_source_path() { printf '%s' "$expected_source"; }
+                cx_backend_source_is_valid() {
+                    test "$1" = backend && test "$2" = "$expected_source"
+                }
+                cx_prepare_backend_source "$2" backend
+                test "$observed_mode" = 2700
+                test "$chmod_calls" = 0
+                observed_mode=2750
+                ! _cx_prepare_backend_source "$2" backend || exit 1
+                test "$chmod_calls" = 1
+                _cx_prepare_backend_source "$2" backend
+                test "$observed_mode" = 700
+                mock_root_owner=4300
+                ! _cx_prepare_backend_source "$2" backend
+            '''
+            subprocess.run(  # the mid-script negation carries "|| exit 1" because errexit never fires on "!" pipelines
+                ["bash", "-c", command, "_", str(common), str(root), str(source)],
+                check=True,  # chmod and stat are shell stubs, so no real ownership or mode is changed
+            )
+
+    def test_canonical_backend_sources_use_verified_seed_without_network(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # script that provides cx_prepare_backend_source
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            mount = root / "mount"  # passed as $2
+            source_root = mount / "experimental" / "CollectiveX" / ".cx_sources"  # passed as $3
+            seed_root = root / "seed"  # exported inside the script as CX_BACKEND_SOURCE_SEED_ROOT ($4)
+            seeds = [
+                seed_root / f"{backend}-revision"
+                for backend in ("backend-one", "backend-two")
+            ]
+            mount.mkdir(mode=0o700)
+            source_root.parent.mkdir(parents=True, mode=0o700)
+            for seed in seeds:
+                seed.mkdir(parents=True, mode=0o700)
+                (seed / "pinned").write_text("source\n")  # marker file the validity stub looks for
+            network = root / "network"  # created only if the cx_fetch_revision stub is called
+            command = r'''
+                set -euo pipefail
+                source "$1"
+                export COLLECTIVEX_CANONICAL_GHA=1
+                export CX_BACKEND_SOURCE_SEED_ROOT="$4"
+                export COLLECTIVEX_EXECUTION_ID="source-seed-$$"
+                trap 'cx_cleanup_private_logs 0' EXIT
+                NETWORK="$5"
+                stat() {
+                    case "$2" in
+                        %u) printf '4200\n' ;;
+                        %a) printf '700\n' ;;
+                        *) return 1 ;;
+                    esac
+                }
+                cx_backend_source_path() { printf '%s/%s-revision' "$1" "$2"; }
+                cx_backend_source_is_valid() { test -f "$2/pinned"; }
+                cx_fetch_revision() { : > "$NETWORK"; return 1; }
+                cx_prepare_backend_source "$2" backend-one
+                cx_prepare_backend_source "$2" backend-two
+                test -f "$3/backend-one-revision/pinned"
+                test -f "$3/backend-two-revision/pinned"
+                test ! -e "$NETWORK"
+                rm -rf -- "$3/backend-one-revision" "$3/backend-two-revision"
+                unset CX_BACKEND_SOURCE_SEED_ROOT
+                ! _cx_prepare_backend_source "$2" backend-one || exit 1
+                test ! -e "$NETWORK"
+            '''
+            subprocess.run(  # the negated check carries "|| exit 1" because errexit never fires on "!" pipelines
+                [
+                    "bash", "-c", command, "_", str(common), str(mount),
+                    str(source_root), str(seed_root), str(network),
+                ],
+                check=True,  # both backends come from the seed; without a seed the call fails and never fetches
+            )
+
+    def test_deepep_hybrid_cache_reuse_revalidates_extensions(self) -> None:
+        common = ROOT / "runtime" / "common.sh"  # sourced in full
+        runtime = ROOT / "runtime" / "run_in_container.sh"  # two cache helper functions are extracted from it
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)  # plays the cache directory ($3)
+            (root / "deep_ep_cpp.so").write_bytes(b"deep")  # stand-ins for the two built extensions
+            (root / "hybrid_ep_cpp.so").write_bytes(b"hybrid")
+            command = r'''
+                set -euo pipefail
+                chmod 700 "$3"
+                source "$1"
+                eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$2")"
+                eval "$(sed -n '/^cx_deepep_hybrid_cache_is_valid()/,/^}/p' "$2")"
+                revision=revision tree=tree
+                cx_git() {
+                    case " $* " in
+                        *' rev-parse HEAD '*) printf '%s\n' "$revision" ;;
+                        *' rev-parse HEAD^{tree} '*) printf '%s\n' "$tree" ;;
+                        *' status --porcelain '*|*' ls-files --others '*) return 0 ;;
+                        *) return 1 ;;
+                    esac
+                }
+                cx_git_in_tree() { shift; cx_git "$@"; }
+                marker="$3/.collectivex-complete"
+                digest="$(cx_extension_pair_sha256 "$3" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')"
+                (umask 077; printf '%s\n%s\n%s\n' "$revision" "$tree" "$digest" > "$marker")
+                cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree"
+                printf changed > "$3/hybrid_ep_cpp.so"
+                ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" || exit 1
+                printf hybrid > "$3/hybrid_ep_cpp.so"
+                cp "$3/deep_ep_cpp.so" "$3/deep_ep_cpp-extra.so"
+                ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree"
+            '''
+            subprocess.run(  # the mid-script negation carries "|| exit 1" because errexit never fires on "!" pipelines
+                ["bash", "-c", command, "_", str(common), str(runtime), temporary],
+                check=True,  # a modified extension and an extra matching extension must both invalidate the cache
+            )
+
+    def test_rack_backend_environment_is_shared_per_node_and_required(self) -> None:
+        runtime = ROOT / "runtime" / "run_in_container.sh"  # source of cx_persist_backend_env (the producer)
+        launcher = (ROOT / "launchers" / "launch_gb-nv.sh").read_text()  # holds SOURCE_BACKEND_ENV (the consumer)
+        assignment = next(  # raises StopIteration if the launcher has no such line
+            line for line in launcher.splitlines()
+            if line.startswith("SOURCE_BACKEND_ENV=")
+        )
+        self.assertNotIn("/tmp/.cx_backend_env", launcher)
+        self.assertIn('[ -f "$env_file" ] && [ -r "$env_file" ]', launcher)
+        self.assertIn('[ ! -L "$env_file" ]', launcher)
+        self.assertIn('$(stat -c "%u" "$env_root"):600', launcher)
+        self.assertIn('case "$(stat -c "%a" "$env_root")" in 700|[1-7]700)', launcher)
+        self.assertIn("node-${SLURM_NODEID}.sh", launcher)
+        self.assertIn("HybridEPBuffer", launcher)
+        self.assertIn('. "$env_file" || exit 66', launcher)
+        with tempfile.TemporaryDirectory() as temporary:
+            consumer = r'''
+                eval "$1"
+                env_root="$2/env"
+                SOURCE_BACKEND_ENV="${SOURCE_BACKEND_ENV//\/ix\/experimental\/CollectiveX\/.cx_backend\/env/$env_root}"
+                mkdir -p "$env_root"
+                env_file="$env_root/node-1.sh"
+                printf 'printf sourced > "$CX_SENTINEL"\n' > "$env_file"
+                chmod 600 "$env_file"
+                export CX_SENTINEL="$2/sentinel"
+                stat() {
+                    [ "${STAT_FAIL:-0}" = 0 ] || return 1
+                    case "$2" in
+                        %a) printf '%s\n' "$ROOT_MODE" ;;
+                        %u) printf '1000\n' ;;
+                        %u:%a) printf '%s\n' "$FILE_OWNER_MODE" ;;
+                        *) return 2 ;;
+                    esac
+                }
+                run_case() {
+                    rm -f "$CX_SENTINEL"
+                    ROOT_MODE="$1" FILE_OWNER_MODE="$2" STAT_FAIL="$3" SLURM_NODEID="$4"
+                    ( eval "$SOURCE_BACKEND_ENV" )
+                    rc=$?
+                    [ "$rc" = "$5" ] || return 1
+                    if [ "$5" = 0 ]; then
+                        [ -f "$CX_SENTINEL" ]
+                    else
+                        [ ! -e "$CX_SENTINEL" ]
+                    fi
+                }
+                run_case 700 1000:600 0 1 0 || exit 1
+                run_case 2700 1000:600 0 1 0 || exit 1
+                run_case 755 1000:600 0 1 66 || exit 1
+                run_case 700 1000:600 1 1 66 || exit 1
+                run_case 700 2000:600 0 1 66 || exit 1
+                mv "$env_file" "$env_file.real"
+                ln -s "$env_file.real" "$env_file"
+                run_case 700 1000:600 0 1 66 || exit 1
+                rm "$env_file"
+                mv "$env_file.real" "$env_file"
+                run_case 700 1000:600 0 invalid 66 || exit 1
+            '''
+            subprocess.run(  # this script runs without errexit, so each run_case is enforced with "|| exit 1"
+                ["bash", "-c", consumer, "_", assignment, temporary],
+                check=True,
+            )
+            command = r'''
+                set -euo pipefail
+                cd "$2"
+                eval "$(sed -n '/^cx_persist_backend_env()/,/^}/p' "$1")"
+                export SLURM_NODEID=1 PYTHONPATH=/ix/pinned DEEPEP_COMMIT=abc
+                cx_persist_backend_env
+                env_file="$PWD/.cx_backend/env/node-1.sh"
+                test -f "$env_file"
+                test "$(stat -c %a "$env_file" 2>/dev/null || stat -f %Lp "$env_file")" = 600
+                unset PYTHONPATH DEEPEP_COMMIT
+                . "$env_file"
+                test "$PYTHONPATH" = /ix/pinned
+                test "$DEEPEP_COMMIT" = abc
+                SLURM_NODEID=invalid && ! cx_persist_backend_env
+            '''
+            subprocess.run(  # GNU `stat -c` is tried first: GNU `stat -f` prints filesystem info to stdout before failing
+                ["bash", "-c", command, "_", str(runtime), temporary],
+                check=True,  # the persisted file must be mode 600 and restore both variables when sourced
+            )
+
+    def test_stage_cleanup_failure_fails_job_but_marks_allocation_safe(self) -> None:
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)  # exported inside the script as CX_JOB_ROOT ($2)
+            (root / "repo").mkdir()
+            (root / "stage").mkdir()
+            command = r'''
+                source "$1"
+                cx_write_cleanup_guard() {
+                    rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe"
+                    : > "$CX_JOB_ROOT/cleanup-$1"
+                }
+                cx_cleanup_stage() { return 1; }
+                cx_cleanup_private_logs() { : > "$CX_JOB_ROOT/logs-deleted"; }
+                export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/stage"
+                export COLLECTIVEX_CANONICAL_GHA=1 CX_ALLOCATION_REQUESTED=0
+                unset CX_BENCH JOB_ID
+                cx_launcher_cleanup 0
+            '''
+            result = subprocess.run(
+                ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+                 str(root)],
+                text=True,
+                capture_output=True,  # stderr is shown as the assertion message below
+                env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+            )
+            self.assertEqual(result.returncode, 1, result.stderr)  # the stubbed stage cleanup failure must fail the job
+            self.assertTrue((root / "cleanup-safe").is_file())  # guard stub was last called with "safe"
+            self.assertFalse((root / "cleanup-unsafe").exists())
+            self.assertFalse((root / "logs-deleted").exists())  # the private-log cleanup stub was never called
+
+ def test_generated_stage_cleanup_never_removes_configured_base(self) -> None:
+     """Cleaning a generated stage directory must leave the configured base alone.
+
+     The shell snippet calls cx_cleanup_stage on the generated directory, then
+     requires that the same call on the base directory itself fails. Afterwards
+     the generated directory must be gone while the base and repo remain.
+     """
+     script = (
+         'source "$1"; cx_cleanup_stage "$2" "$3"; '
+         '! cx_cleanup_stage "$4" "$3"'
+     )
+     with tempfile.TemporaryDirectory() as temporary:
+         workspace = Path(temporary)
+         stage_base = workspace / "stage"
+         repo_dir = workspace / "repo"
+         stage_dir = stage_base / "job_execution"
+         stage_dir.mkdir(parents=True)
+         repo_dir.mkdir()
+         (stage_dir / "payload").write_text("temporary")
+         child_env = dict(os.environ)
+         child_env.update(
+             COLLECTIVEX_OPERATOR_CONFIG="/dev/null",
+             COLLECTIVEX_EXECUTION_ID="execution",
+             CX_STAGE_DIR=str(stage_base),
+         )
+         subprocess.run(
+             [
+                 "bash", "-c", script, "_",
+                 str(ROOT / "runtime" / "common.sh"),
+                 str(stage_dir), str(repo_dir), str(stage_base),
+             ],
+             check=True,
+             env=child_env,
+         )
+         self.assertFalse(stage_dir.exists())
+         self.assertTrue(stage_base.is_dir())
+         self.assertTrue(repo_dir.is_dir())
+
+ def test_adapters_do_not_retain_dead_expected_methods(self) -> None:
+     """No file matched by ``HERE.glob("ep_*.py")`` may define a function named ``expected``.
+
+     Every sync or async function definition at any nesting depth is considered.
+     """
+     function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
+     for adapter in HERE.glob("ep_*.py"):
+         module = ast.parse(adapter.read_text(), str(adapter))
+         defined = set()
+         for node in ast.walk(module):
+             if isinstance(node, function_nodes):
+                 defined.add(node.name)
+         # The file name is the failure message so the offending adapter is obvious.
+         self.assertNotIn("expected", defined, adapter.name)
+
+ def test_artifact_safety_rejects_sensitive_material(self) -> None:
+     """Sensitive documents must be rejected without echoing the offending text.
+
+     Each case pairs a document with the substring (a value or a field name)
+     that must not appear in the raised ArtifactSafetyError message. A benign
+     document and refs such as ``release@candidate`` must pass unchanged.
+     """
+     # Assembled at runtime so the literal values never appear in this file.
+     private_address = ".".join(str(octet) for octet in (10, 0, 0, 1))
+     secret = "github_pat_" + "A" * 24
+     sensitive = {
+         "ipv4": ({"note": private_address}, private_address),
+         "ipv6": ({"note": "[2001:db8::1]:29500"}, "2001:db8::1"),
+         "user-at-host": ({"note": "ssh admin@private-host"}, "admin@private-host"),
+         "hostname": ({"note": "host=compute-17"}, "compute-17"),
+         "private-dns": ({"note": "worker-7.cluster.local"}, "worker-7.cluster.local"),
+         "suffixed-host": ({"worker_hostname": "relative"}, "worker_hostname"),
+         "suffixed-address": ({"control_address": "relative"}, "control_address"),
+         "suffixed-path": ({"scheduler_path": "relative"}, "scheduler_path"),
+         "exact-address": ({"address": "relative"}, "address"),
+         "exact-ip": ({"ip": "relative"}, "ip"),
+         "camel-host": ({"workerHost": "relative"}, "workerHost"),
+         "camel-path": ({"schedulerPath": "relative"}, "schedulerPath"),
+         "acronym-gpu-uuid": ({"gpuUUID": "relative"}, "gpuUUID"),
+         "acronym-device-uuid": ({"deviceUUID": "relative"}, "deviceUUID"),
+         "acronym-pci-bus": ({"pciBusID": "relative"}, "pciBusID"),
+         "mac-address": ({"note": "00:11:22:33:44:55"}, "00:11:22:33:44:55"),
+         "ib-guid": ({"note": "00:11:22:33:44:55:66:77"}, "00:11:22:33:44:55:66:77"),
+         "dgx-host": ({"note": "dgx-b300-001"}, "dgx-b300-001"),
+         "cloud-host": ({"note": "ip-10-20-30-40"}, "ip-10-20-30-40"),
+         "credential-field": ({"service_token": "short"}, "service_token"),
+         "prefixed-token": ({"note": secret}, secret),
+         "hf-token": ({"note": "hf_" + "A" * 24}, "hf_" + "A" * 24),
+         "payment-token": ({"note": "sk_live_" + "A" * 24}, "sk_live_" + "A" * 24),
+         "generic-secret": ({"note": "password=not-a-real-secret"}, "not-a-real-secret"),
+     }
+     for root in ("data", "it-share", "lustre", "raid", "nvme_home", "scratch", "gpfs", "fsx"):
+         value = f"/{root}/collectivex/run"
+         sensitive[f"private-root-{root}"] = ({"note": value}, value)
+     for name, (document, offending) in sensitive.items():
+         # The leak check stays inside the subTest. When no error is raised,
+         # assertRaises never sets ``caught.exception``; reading it outside the
+         # subTest would raise AttributeError and abort every remaining case.
+         with self.subTest(name=name):
+             with self.assertRaises(artifact_safety.ArtifactSafetyError) as caught:
+                 artifact_safety.assert_publication_safe([document])
+             self.assertNotIn(offending, str(caught.exception))
+
+     # Benign values must be accepted.
+     artifact_safety.assert_publication_safe([{
+         "runner": "b300",
+         "redaction": "sanitized-v1",
+         "path": "datasets/" + "a" * 64 + "/dataset.json",
+         "timing": "8:64:32",
+         "image_digest": "sha256:" + "b" * 64,
+         "source": "github.com",
+     }])
+     for ref in ("release@candidate", "worker1-feature", "sk-refactor-long-component-name"):
+         artifact_safety.assert_publication_safe([{"ref": ref}])
+
+ def test_artifact_safety_cli_does_not_echo_sensitive_values(self) -> None:
+     """The artifact_safety CLI must reject a private IPv4 value without printing it.
+
+     The script is run on a JSON file holding the value. It must exit non-zero,
+     name the violation category on stderr, and repeat the value on neither
+     captured stream.
+     """
+     # Assembled at runtime so the literal address never appears in this file.
+     private_value = ".".join(str(octet) for octet in (10, 24, 68, 12))
+     with tempfile.TemporaryDirectory() as temporary:
+         path = Path(temporary) / "artifact.json"
+         path.write_text(json.dumps({"note": private_value}))
+         result = subprocess.run(
+             [sys.executable, str(ROOT / "artifact_safety.py"), str(path)],
+             text=True,
+             capture_output=True,
+         )
+     self.assertNotEqual(result.returncode, 0)
+     self.assertIn("forbidden ipv4-address value", result.stderr)
+     # stdout is captured as well; an echo there is as much a leak as one on stderr.
+     for stream in (result.stderr, result.stdout):
+         self.assertNotIn(private_value, stream)
+
+ def test_artifact_safety_rejects_linked_and_special_inputs(self) -> None:
+     """load_documents must refuse a symlinked input and a FIFO input.
+
+     Both are created in a temporary directory; the symlink points at a
+     regular JSON file holding an empty object.
+     """
+     with tempfile.TemporaryDirectory() as temporary:
+         workdir = Path(temporary)
+         target = workdir / "source.json"
+         target.write_text("{}")
+         symlink = workdir / "linked.json"
+         symlink.symlink_to(target)
+         pipe = workdir / "fifo.json"
+         os.mkfifo(pipe)
+         for candidate in (symlink, pipe):
+             with self.subTest(path=candidate.name):
+                 with self.assertRaises(artifact_safety.ArtifactSafetyError):
+                     artifact_safety.load_documents([str(candidate)])
+
+
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == "__main__":
+ unittest.main()
diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
new file mode 100644
index 0000000000..89a6b46052
--- /dev/null
+++ b/experimental/CollectiveX/tests/workload.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+"""Canonical, byte-stable CollectiveX routing workloads.
+
+A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
+file, and referenced by an immutable `workload_id`. Every promoted benchmark point consumes the
+SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
+checksum match, not by trusting that two machines re-ran the same seeded generator.
+
+Layout on disk (one workload = two files, basename = workload_id):
+    {out_dir}/{workload_id}.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
+    {out_dir}/{workload_id}.manifest.json  dims, routing profile, generator version, seed, SHA-256s
+
+Routing and gate weights come from a stdlib integer counter, not a framework RNG. The same
+parameters therefore produce the same int32/float32 bytes across PyTorch and accelerator images.
+"""
+from __future__ import annotations
+
+from array import array
+import bisect
+import hashlib
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import identity # noqa: E402
+
+WORKLOAD_SCHEMA_VERSION = 1
+# Bump when the counter or byte encoding changes. The workload ID binds parameters and trace bytes.
+GENERATOR_VERSION = "collectivex-routing-counter-v3"
+GATE_WEIGHT_FORMAT = "counter-u16-normalized-f32"
+ACTIVATION_GENERATOR = "collectivex-activation-counter-v3"
+_MASK64 = (1 << 64) - 1
+
+
+def _sha256(b: bytes) -> str:
+    """Return the lowercase hexadecimal SHA-256 digest of ``b``."""
+    digest = hashlib.sha256()
+    digest.update(b)
+    return digest.hexdigest()
+
+
+def _mix64(value: int) -> int:
+    """Scramble ``value`` into a 64-bit integer.
+
+    The additive constant, the two multipliers and the 30/27/31 shifts are those
+    of the SplitMix64 output function. Every intermediate is masked to 64 bits
+    so Python's unbounded integers wrap like fixed-width ones.
+    """
+    state = (value + 0x9E3779B97F4A7C15) & _MASK64
+    for shift, multiplier in ((30, 0xBF58476D1CE4E5B9), (27, 0x94D049BB133111EB)):
+        state = ((state ^ (state >> shift)) * multiplier) & _MASK64
+    return state ^ (state >> 31)
+
+
+def _counter(seed: int, token: int, slot: int, attempt: int, stream: int) -> int:
+    """Hash one (seed, token, slot, attempt, stream) coordinate to a 64-bit value.
+
+    Each coordinate is offset by one, multiplied by its own odd constant and
+    masked to 64 bits. The products are XOR-ed into the masked seed and the
+    result is passed through ``_mix64``.
+    """
+    coordinates = (
+        (token, 0xD2B74407B1CE6E93),
+        (slot, 0xCA5A826395121157),
+        (attempt, 0x9E3779B185EBCA87),
+        (stream, 0xA24BAED4963EE407),
+    )
+    combined = seed & _MASK64
+    for coordinate, multiplier in coordinates:
+        combined ^= ((coordinate + 1) * multiplier) & _MASK64
+    return _mix64(combined)
+
+
+def canonical_routing_rows(
+    global_tokens: int, experts: int, topk: int, routing: str, seed: int
+) -> tuple[list[list[int]], list[list[float]]]:
+    """Build per-token expert choices and gate weights from integer counters.
+
+    Args:
+        global_tokens: Number of rows to generate.
+        experts: Number of experts to choose from.
+        topk: Distinct experts chosen per token; at most ``experts``.
+        routing: ``"uniform"`` (counter value modulo ``experts``) or ``"zipf"``
+            (expert ``i`` weighted by ``2**32 // (i + 1)``).
+        seed: Seed mixed into every counter value.
+
+    Returns:
+        ``(indices, weights)``, each with ``global_tokens`` rows of ``topk``
+        entries. Each weight row is a set of integers in 1..65535 divided by
+        their sum.
+
+    Raises:
+        ValueError: Unknown ``routing``, or non-positive / inconsistent sizes.
+        RuntimeError: A slot found no unused expert within ``experts * 16 + 1``
+            attempts.
+    """
+    if routing not in {"uniform", "zipf"}:
+        raise ValueError(f"unknown routing {routing!r} (uniform|zipf)")
+    if global_tokens <= 0 or experts <= 0 or topk <= 0 or topk > experts:
+        raise ValueError("global_tokens/experts/topk must be positive and topk <= experts")
+
+    # Running totals of the integer Zipf weights; None selects uniform routing.
+    zipf_table: list[int] | None = None
+    if routing == "zipf":
+        zipf_table = []
+        running = 0
+        for rank in range(1, experts + 1):
+            running += (1 << 32) // rank
+            zipf_table.append(running)
+
+    # Attempts 0 .. experts * 16 inclusive are tried before giving up.
+    max_attempts = experts * 16 + 1
+    indices: list[list[int]] = []
+    weights: list[list[float]] = []
+    for token in range(global_tokens):
+        chosen: list[int] = []
+        taken: set[int] = set()
+        for slot in range(topk):
+            for attempt in range(max_attempts):
+                draw = _counter(seed, token, slot, attempt, 0)
+                if zipf_table is None:
+                    expert = draw % experts
+                else:
+                    expert = bisect.bisect_right(zipf_table, draw % zipf_table[-1])
+                if expert not in taken:
+                    break
+            else:
+                raise RuntimeError("counter routing could not select distinct experts")
+            taken.add(expert)
+            chosen.append(expert)
+        # Stream 1 keeps the gate draws independent of the expert draws (stream 0).
+        gate_raw = [1 + _counter(seed, token, slot, 0, 1) % 65535 for slot in range(topk)]
+        gate_sum = float(sum(gate_raw))
+        indices.append(chosen)
+        weights.append([raw / gate_sum for raw in gate_raw])
+    return indices, weights
+
+
+def _canonical_bytes(
+    indices: list[list[int]], weights: list[list[float]]
+) -> tuple[bytes, bytes]:
+    """Flatten the rows in row-major order and encode them as little-endian bytes.
+
+    Indices go into a C ``int`` array and weights into a C ``float`` array.
+    Both must be 4 bytes wide, and they are byte-swapped on big-endian hosts.
+
+    Raises:
+        RuntimeError: If either array type is not 4 bytes wide on this platform.
+    """
+    flat_idx = array("i")
+    for row in indices:
+        flat_idx.extend(row)
+    flat_gate = array("f")
+    for row in weights:
+        flat_gate.extend(row)
+    buffers = (flat_idx, flat_gate)
+    if any(buffer.itemsize != 4 for buffer in buffers):
+        raise RuntimeError("canonical workload requires 32-bit int and float arrays")
+    if sys.byteorder != "little":
+        for buffer in buffers:
+            buffer.byteswap()
+    return flat_idx.tobytes(), flat_gate.tobytes()
+
+
+def trace_checksums(
+    indices: list[list[int]], weights: list[list[float]]
+) -> dict[str, str]:
+    """Hash the canonical byte encoding of routing rows.
+
+    Returns:
+        SHA-256 hex digests under ``topk_idx`` (index bytes), ``topk_weights``
+        (weight bytes) and ``trace`` (index bytes followed by weight bytes).
+    """
+    idx_bytes, weight_bytes = _canonical_bytes(indices, weights)
+    payloads = {
+        "topk_idx": idx_bytes,
+        "topk_weights": weight_bytes,
+        "trace": idx_bytes + weight_bytes,
+    }
+    return {label: _sha256(payload) for label, payload in payloads.items()}
+
+
+def canonical_member(
+    routing: str,
+    hidden: int,
+    topk: int,
+    experts: int,
+    ep_size: int,
+    tokens_per_rank: int,
+    seed: int,
+) -> tuple[str, dict[str, str], list[list[int]], list[list[float]]]:
+    """Generate one workload's rows and derive its ID from them.
+
+    The global token count is ``ep_size * tokens_per_rank``.
+
+    Returns:
+        ``(workload_id, checksums, indices, weights)``. The rows are returned
+        so callers can reuse them without regenerating.
+    """
+    total_tokens = ep_size * tokens_per_rank
+    rows, gates = canonical_routing_rows(total_tokens, experts, topk, routing, seed)
+    hashes = trace_checksums(rows, gates)
+    # Passing the trace hash stops compute_workload_id from regenerating the rows.
+    workload = compute_workload_id(
+        routing=routing,
+        hidden=hidden,
+        topk=topk,
+        experts=experts,
+        ep_size=ep_size,
+        global_tokens=total_tokens,
+        seed=seed,
+        trace_checksum=hashes["trace"],
+    )
+    return workload, hashes, rows, gates
+
+
+def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
+                        ep_size: int, global_tokens: int, seed: int,
+                        generator: str = GENERATOR_VERSION,
+                        trace_checksum: str | None = None) -> str:
+    """Derive the workload ID from the parameters and the trace checksum.
+
+    Args:
+        generator: Must equal ``GENERATOR_VERSION``.
+        trace_checksum: SHA-256 of the canonical trace bytes. When omitted, the
+            rows are regenerated from the parameters and hashed here.
+
+    Returns:
+        Whatever ``identity.workload_id`` produces for the assembled key.
+
+    Raises:
+        ValueError: If ``generator`` is not the supported generator version.
+    """
+    if generator != GENERATOR_VERSION:
+        raise ValueError(f"unsupported workload generator {generator!r}")
+    if trace_checksum is None:
+        rows, gates = canonical_routing_rows(global_tokens, experts, topk, routing, seed)
+        trace_checksum = trace_checksums(rows, gates)["trace"]
+    activation = compute_activation_identity(seed, hidden)
+    # Key insertion order is kept as-is in case identity.workload_id depends on it.
+    key = dict(
+        generator=generator, routing=routing, hidden=hidden, topk=topk,
+        experts=experts, ep_size=ep_size, global_tokens=global_tokens,
+        seed=seed, trace_sha256=trace_checksum,
+        activation_generator=ACTIVATION_GENERATOR,
+        activation_identity=activation,
+    )
+    return identity.workload_id(key)
+
+
+def compute_activation_identity(seed, hidden, generator=ACTIVATION_GENERATOR) -> str:
+    """Return the SHA-256 hex digest that identifies an activation generator setup.
+
+    The digest covers a text key built from ``seed``, ``hidden`` and
+    ``generator``; no activation data is generated here.
+    """
+    identity_text = f"counter|seed={seed}|hidden={hidden}|gen={generator}"
+    return hashlib.sha256(identity_text.encode()).hexdigest()
+
+
+def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+ idx_np, weights_np):
+ """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib."""
+ if experts % experts_per_rank:
+ raise ValueError("experts must be divisible by experts_per_rank")
+ idx_bytes = idx_np.astype(" str:
+ import numpy as np
+ os.makedirs(out_dir, exist_ok=True)
+ wid = manifest["workload_id"]
+ np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
+ topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
+ with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh:
+ json.dump(manifest, fh, indent=2, sort_keys=True)
+ return wid
+
+
+def load_workload(npz_path, verify=True):
+    """Load a canonical trace (numpy + stdlib only).
+
+    Args:
+        npz_path: Path of the ``.npz`` file, or the shared basename without the
+            extension, as ``str`` or ``os.PathLike``. The manifest is read from
+            the same basename with ``.manifest.json`` appended.
+        verify: When true, run ``verify_workload`` on the loaded arrays.
+
+    Returns:
+        ``(idx_np, weights_np, manifest)``.
+
+    Raises:
+        ValueError: The manifest is not a JSON object, its ``workload_id``
+            differs from the file basename, or the archive does not hold
+            exactly ``topk_idx`` and ``topk_weights`` (all checked regardless of
+            ``verify``); or, with ``verify=True``, ``verify_workload`` rejects
+            the data.
+    """
+    import numpy as np
+    # Accept pathlib.Path and other os.PathLike values, not only str.
+    npz_path = os.fspath(npz_path)
+    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path
+    with open(base + ".manifest.json", encoding="utf-8") as fh:
+        manifest = json.load(fh)
+    # A manifest that parses to a list or scalar has no .get(); report it as the
+    # same ValueError as every other malformed input instead of AttributeError.
+    if not isinstance(manifest, dict):
+        raise ValueError(f"workload manifest is not a JSON object for {base}")
+    if manifest.get("workload_id") != os.path.basename(base):
+        raise ValueError(f"workload manifest ID does not match filename for {base}")
+    # allow_pickle=False: the archive is plain arrays, never pickled objects.
+    with np.load(base + ".npz", allow_pickle=False) as archive:
+        if set(archive.files) != {"topk_idx", "topk_weights"}:
+            raise ValueError(f"workload archive fields differ for {base}")
+        idx_np = np.ascontiguousarray(archive["topk_idx"])
+        w_np = np.ascontiguousarray(archive["topk_weights"])
+    if verify:
+        ok, reason = verify_workload(manifest, idx_np, w_np)
+        if not ok:
+            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
+    return idx_np, w_np, manifest
+
+
+def verify_workload(manifest, idx_np, weights_np):
+ """Recompute checksums and compare to the manifest. Returns (ok, reason)."""
+ import numpy as np
+ expected_fields = {
+ "schema_version", "workload_id", "generator_version", "gate_weight_format", "dims",
+ "routing_profile", "seed", "checksums", "activation_profile", "activation_generator",
+ "activation_identity",
+ }
+ if not isinstance(manifest, dict) or set(manifest) != expected_fields:
+ return False, "manifest fields differ from the v1 contract"
+ if (manifest["schema_version"] != WORKLOAD_SCHEMA_VERSION
+ or manifest["generator_version"] != GENERATOR_VERSION
+ or manifest["gate_weight_format"] != GATE_WEIGHT_FORMAT
+ or manifest["routing_profile"] not in {"uniform", "zipf"}):
+ return False, "manifest version or generator is unsupported"
+ if (isinstance(manifest["seed"], bool) or not isinstance(manifest["seed"], int)
+ or not identity.is_typed_id(manifest["workload_id"], "workload")):
+ return False, "manifest seed or workload ID is invalid"
+ dims = manifest["dims"]
+ dim_fields = {"hidden", "topk", "experts", "ep_size", "tokens_per_rank",
+ "global_tokens", "experts_per_rank"}
+ if not isinstance(dims, dict) or set(dims) != dim_fields:
+ return False, "manifest dimensions are invalid"
+ if any(isinstance(dims[key], bool) or not isinstance(dims[key], int) or dims[key] <= 0
+ for key in dim_fields):
+ return False, "manifest dimensions must be positive integers"
+ if (dims["experts"] != dims["ep_size"] * dims["experts_per_rank"]
+ or dims["global_tokens"] != dims["ep_size"] * dims["tokens_per_rank"]):
+ return False, "manifest EP dimensions are inconsistent"
+ shape = (dims["global_tokens"], dims["topk"])
+ if (idx_np.dtype != np.int32 or weights_np.dtype != np.float32
+ or idx_np.shape != shape or weights_np.shape != shape
+ or not idx_np.flags.c_contiguous or not weights_np.flags.c_contiguous):
+ return False, "workload array dtype, shape, or layout is invalid"
+ if (np.any(idx_np < 0) or np.any(idx_np >= dims["experts"])
+ or np.any(np.diff(np.sort(idx_np, axis=1), axis=1) == 0)):
+ return False, "expert indices are out of range or repeated"
+ if (not np.isfinite(weights_np).all() or np.any(weights_np < 0)
+ or not np.allclose(weights_np.sum(axis=1), 1.0, rtol=1e-5, atol=1e-6)):
+ return False, "gate weights are invalid"
+ if (manifest["activation_profile"] != "canonical-counter-source-v3"
+ or manifest["activation_generator"] != ACTIVATION_GENERATOR
+ or manifest["activation_identity"]
+ != compute_activation_identity(
+ manifest["seed"], dims["hidden"], manifest["activation_generator"]
+ )):
+ return False, "activation identity is invalid"
+ ib = idx_np.astype(" must fail
+ idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
+ bad, _ = verify_workload(man2, idx2, w2)
+ assert not bad, "verify must catch tampering"
+ print(f"save/load/verify roundtrip OK (workload_id={wid})")
+ except ImportError:
+ print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
+ print("workload self-test: PASS")
+ sys.exit(0)