From ec6f6cf463bbbc9654e29eef495ce4d0cf4e5a17 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 4 Jul 2026 01:03:33 +0800
Subject: [PATCH] feat(collectivex): add sanitized v1 benchmark suite

---
 .../workflows/collectivex-experimental.yml    |  192 +++
 .github/workflows/collectivex-sweep.yml       |  119 +-
 experimental/CollectiveX/.gitignore           |   11 +
 experimental/CollectiveX/README.md            |  119 ++
 experimental/CollectiveX/aggregate_results.py |  177 +++
 experimental/CollectiveX/artifact_safety.py   |  109 ++
 experimental/CollectiveX/capability.py        |  203 +++
 experimental/CollectiveX/configs/suites.yaml  |   53 +
 .../CollectiveX/configs/workloads.yaml        |   26 +
 experimental/CollectiveX/docs/methodology.md  |  266 ++++
 experimental/CollectiveX/env_capture.py       |  249 ++++
 experimental/CollectiveX/generate_matrix.py   |  160 +++
 .../CollectiveX/launchers/launch_b200-dgxc.sh |   59 +
 .../CollectiveX/launchers/launch_b300.sh      |   55 +
 .../CollectiveX/launchers/launch_gb200-nv.sh  |  264 ++++
 .../CollectiveX/launchers/launch_gb300-nv.sh  |  245 ++++
 .../launchers/launch_h100-dgxc-slurm.sh       |   57 +
 .../CollectiveX/launchers/launch_h200.sh      |   55 +
 .../launchers/launch_mi325x-amds.sh           |   23 +
 .../launchers/launch_mi355x-amds.sh           |   99 ++
 experimental/CollectiveX/make_bundle.py       |  384 ++++++
 experimental/CollectiveX/requirements.txt     |    7 +
 experimental/CollectiveX/results/.gitkeep     |    2 +
 experimental/CollectiveX/runtime/common.sh    |  322 +++++
 .../CollectiveX/runtime/run_in_container.sh   |  578 +++++++++
 .../schemas/ep-result-v4.schema.json          |  219 ++++
 .../schemas/ep-result-v5.schema.json          |  230 ++++
 .../schemas/workload-v1.schema.json           |   51 +
 experimental/CollectiveX/summarize.py         |  190 +++
 experimental/CollectiveX/sweep_matrix.py      |  248 ++++
 experimental/CollectiveX/tests/ep_deepep.py   |  378 ++++++
 .../CollectiveX/tests/ep_deepep_hybrid.py     |  169 +++
 .../CollectiveX/tests/ep_flashinfer.py        |  815 ++++++++++++
 experimental/CollectiveX/tests/ep_harness.py  | 1130 +++++++++++++++++
 experimental/CollectiveX/tests/ep_mori.py     |  450 +++++++
 experimental/CollectiveX/tests/ep_nccl.py     |  140 ++
 experimental/CollectiveX/tests/ep_uccl.py     |  345 +++++
 experimental/CollectiveX/tests/eplb.py        |  177 +++
 .../CollectiveX/tests/make_workloads.py       |  115 ++
 experimental/CollectiveX/tests/routing.py     |  277 ++++
 experimental/CollectiveX/tests/run_ep.py      |  177 +++
 .../tests/test_sampling_contract.py           |  845 ++++++++++++
 experimental/CollectiveX/tests/workload.py    |  192 +++
 experimental/CollectiveX/validate_results.py  |  442 +++++++
 44 files changed, 10389 insertions(+), 35 deletions(-)
 create mode 100644 .github/workflows/collectivex-experimental.yml
 create mode 100644 experimental/CollectiveX/.gitignore
 create mode 100644 experimental/CollectiveX/README.md
 create mode 100644 experimental/CollectiveX/aggregate_results.py
 create mode 100644 experimental/CollectiveX/artifact_safety.py
 create mode 100644 experimental/CollectiveX/capability.py
 create mode 100644 experimental/CollectiveX/configs/suites.yaml
 create mode 100644 experimental/CollectiveX/configs/workloads.yaml
 create mode 100644 experimental/CollectiveX/docs/methodology.md
 create mode 100644 experimental/CollectiveX/env_capture.py
 create mode 100644 experimental/CollectiveX/generate_matrix.py
 create mode 100644 experimental/CollectiveX/launchers/launch_b200-dgxc.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_b300.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_gb200-nv.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_gb300-nv.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_h200.sh
 create mode 100755 experimental/CollectiveX/launchers/launch_mi325x-amds.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_mi355x-amds.sh
 create mode 100644 experimental/CollectiveX/make_bundle.py
 create mode 100644 experimental/CollectiveX/requirements.txt
 create mode 100644 experimental/CollectiveX/results/.gitkeep
 create mode 100644 experimental/CollectiveX/runtime/common.sh
 create mode 100644 experimental/CollectiveX/runtime/run_in_container.sh
 create mode 100644 experimental/CollectiveX/schemas/ep-result-v4.schema.json
 create mode 100644 experimental/CollectiveX/schemas/ep-result-v5.schema.json
 create mode 100644 experimental/CollectiveX/schemas/workload-v1.schema.json
 create mode 100644 experimental/CollectiveX/summarize.py
 create mode 100644 experimental/CollectiveX/sweep_matrix.py
 create mode 100644 experimental/CollectiveX/tests/ep_deepep.py
 create mode 100644 experimental/CollectiveX/tests/ep_deepep_hybrid.py
 create mode 100644 experimental/CollectiveX/tests/ep_flashinfer.py
 create mode 100644 experimental/CollectiveX/tests/ep_harness.py
 create mode 100644 experimental/CollectiveX/tests/ep_mori.py
 create mode 100644 experimental/CollectiveX/tests/ep_nccl.py
 create mode 100644 experimental/CollectiveX/tests/ep_uccl.py
 create mode 100644 experimental/CollectiveX/tests/eplb.py
 create mode 100644 experimental/CollectiveX/tests/make_workloads.py
 create mode 100644 experimental/CollectiveX/tests/routing.py
 create mode 100644 experimental/CollectiveX/tests/run_ep.py
 create mode 100644 experimental/CollectiveX/tests/test_sampling_contract.py
 create mode 100644 experimental/CollectiveX/tests/workload.py
 create mode 100644 experimental/CollectiveX/validate_results.py

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
new file mode 100644
index 0000000000..d485e29f34
--- /dev/null
+++ b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,192 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Manual one-off diagnostics. Promoted v1 coverage uses collectivex-sweep.yml.
+
+on:
+  workflow_dispatch:
+    inputs:
+      sku:
+        description: Public self-hosted runner pool
+        type: choice
+        default: gb200
+        options: [gb200, b200-dgxc, mi355x, mi325x, h100-dgxc, h200-dgxc, b300, gb300]
+      benchmark:
+        description: EP backend to bring up
+        type: choice
+        default: deepep
+        options: [deepep, deepep-hybrid, mori, uccl, nccl-ep, flashinfer]
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ''
+      phase:
+        # EP only. 'both' fans out to one job per phase (decode + prefill).
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        default: both
+        options: [both, decode, prefill]
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ''
+      dispatch_dtype:
+        description: EP dispatch payload precision (fp8 scale-layout recipes + FlashInfer OCP-microscaling mxfp8/nvfp4)
+        type: choice
+        default: bf16
+        options: [bf16, fp8, fp8-pertoken, fp8-directcast, mxfp8, mxfp4, nvfp4]
+      mode:
+        # LL is retained for manual diagnostics only; it is not a promoted v1 dimension.
+        description: EP kernel path (LL is diagnostic only)
+        type: choice
+        default: normal
+        options: [normal, ll]
+      resource_mode:
+        # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples);
+        # tuned = each backend's own recommended/default launch config.
+        description: Comm resource regime
+        type: choice
+        default: tuned
+        options: [normalized, tuned, default]
+      contract:
+        # [cl]/[rv] are retained for explicit diagnostics, never promoted v1 comparisons.
+        description: Measurement contract (non-default contracts are diagnostic only)
+        type: choice
+        default: layout-and-dispatch-v1
+        options: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+      routing:
+        # v1 schedules uniform and zipf only. The remaining choices are one-off diagnostics.
+        description: EP routing distribution
+        type: choice
+        default: uniform
+        options: [uniform, zipf, balanced, balanced-rank-local, hotspot-single]
+      eplb:
+        # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure
+        # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*.
+        description: Apply EPLB expert replication/placement
+        type: boolean
+        default: false
+      canonical:
+        # Consume a CANONICAL serialized workload (generated deterministically in-container) instead
+        # of a seeded-runtime one. A canonical-serialized run with full GHA provenance gets
+        # publication_status 'official' — this switch promotes a cohort past comparable-experimental.
+        description: Use canonical serialized workload (official-grade workload identity)
+        type: boolean
+        default: false
+      activation_profile:
+        # Activation VALUE distribution of expert inputs. normal = headline; the others stress a
+        # future quantized combine (latency-neutral under bf16 — the expected null result).
+        description: Activation value profile
+        type: choice
+        default: normal
+        options: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+      sm_fraction:
+        # normalized comm-resource fraction (DeepEP sm_fraction*SMs / MoRI ~*CUs). Sweep this with
+        # resource_mode=normalized to build the resource-Pareto (latency vs comm fraction). Blank =
+        # harness default 0.18.
+        description: Normalized comm-resource fraction (resource_mode=normalized)
+        type: string
+        default: ''
+      hidden:
+        # Manual shape override. Blank = deepseek-v3-v1 default 7168.
+        description: MoE hidden dim (model-derived workloads); blank = 7168
+        type: string
+        default: ''
+      topk:
+        description: MoE top-k (model-derived workloads); blank = 8
+        type: string
+        default: ''
+      experts:
+        description: MoE total experts (model-derived workloads); blank = 256
+        type: string
+        default: ''
+      uneven_tokens:
+        # Manual diagnostic only; not a promoted v1 dimension.
+        description: Uneven source-token allocation
+        type: choice
+        default: none
+        options: [none, linear, empty-rank]
+
+concurrency:
+  # Group per (SKU + FULL config): GitHub keeps only one running + one pending per group and
+  # cancels the rest, so a coarse per-SKU group made a fan-out of many configs on one SKU
+  # self-cancel down to ~2. Including dtype/mode/contract/routing/eplb/phase gives each config
+  # its OWN group -> all configs survive; they queue only on the runner's own capacity, not on
+  # GitHub concurrency. cancel-in-progress FALSE so a re-dispatch of the SAME config queues.
+  # Resource/value axes remain in the group so distinct diagnostics do not self-cancel.
+  group: collectivex-${{ github.ref }}-${{ inputs.sku }}-${{ inputs.benchmark }}-${{ inputs.dispatch_dtype }}-${{ inputs.mode }}-${{ inputs.contract }}-${{ inputs.routing }}-${{ inputs.eplb }}-${{ inputs.phase }}-${{ inputs.resource_mode }}-${{ inputs.sm_fraction }}-${{ inputs.activation_profile }}-${{ inputs.hidden }}-${{ inputs.topk }}-${{ inputs.experts }}-${{ inputs.uneven_tokens }}-${{ inputs.nodes }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
+  dispatch:
+    runs-on: ${{ inputs.sku }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        phase: ${{ fromJSON(inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase)) }}
+    env:
+      CX_BENCH: ${{ inputs.benchmark }}
+      CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
+      CX_MODE: ${{ inputs.mode }}
+      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
+      CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }}
+      CX_ROUTING: ${{ inputs.routing }}
+      CX_EPLB: ${{ inputs.eplb && '1' || '' }}
+      # Canonical serialized workload (official-grade identity) + value diagnostics.
+      CX_CANONICAL: ${{ inputs.canonical && '1' || '' }}
+      CX_ACTIVATION_PROFILE: ${{ inputs.activation_profile }}
+      CX_SM_FRACTION: ${{ inputs.sm_fraction }}
+      # Manual shape and uneven-allocation diagnostics.
+      CX_HIDDEN: ${{ inputs.hidden }}
+      CX_TOPK: ${{ inputs.topk }}
+      CX_EXPERTS: ${{ inputs.experts }}
+      CX_UNEVEN_TOKENS: ${{ inputs.uneven_tokens }}
+      CX_TIMING: '8:64:32'
+      # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job/artifact) -> a GHA result
+      # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical).
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+      COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ inputs.sku }}_${{ matrix.phase }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Install workflow dependencies
+        run: python3 -m pip install --quiet PyYAML==6.0.2 jsonschema==4.23.0
+      # Reject an unsupported backend/SKU/mode/dtype/contract combination up front: fail before
+      # requesting an allocation when the public capability table rejects it.
+      - name: Validate capability
+        run: |
+          python3 experimental/CollectiveX/capability.py \
+            --sku "${{ inputs.sku }}" \
+            --backend "${{ inputs.benchmark }}" \
+            --mode "${{ inputs.mode }}" --dtype "${{ inputs.dispatch_dtype }}" \
+            --contract "${{ inputs.contract }}"
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
+        run: |
+          launcher="$(python3 experimental/CollectiveX/capability.py --launcher-for "${{ inputs.sku }}")"
+          RUNNER_NAME="${{ inputs.sku }}" \
+            bash "experimental/CollectiveX/launchers/launch_${launcher}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Validate result artifact safety
+        id: artifact_safety
+        if: always()
+        run: python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json
+      - name: Upload results
+        if: always() && steps.artifact_safety.outcome == 'success'
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+          path: |
+            experimental/CollectiveX/results/*.json
+            !experimental/CollectiveX/results/env_*.json
+          if-no-files-found: warn
diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml
index 7ddaca285c..9db2eb43d6 100644
--- a/.github/workflows/collectivex-sweep.yml
+++ b/.github/workflows/collectivex-sweep.yml
@@ -5,36 +5,43 @@
 # sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked
 # so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job
 # collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't
-# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer /
-# deepep-hybrid / nccl-ep, + deepep_v2) for full parity.
+# thousands of individual result files. The default `all` expands every EP backend into one matrix.
 name: CollectiveX Sweep
 on:
   workflow_dispatch:
     inputs:
       backend:
-        description: EP library to sweep (deepep matrix is remapped onto the others, capability-filtered)
+        description: "EP library to sweep — 'all' runs every EP backend in one matrix"
         type: choice
-        default: deepep
-        options: [deepep, uccl, flashinfer, deepep-hybrid, nccl-ep]
-      deepep_v2:
-        description: DeepEP V2 from-source kernels (kernel_gen=v2; deepep backend only)
-        type: boolean
-        default: false
+        default: all
+        options: [all, deepep, uccl, flashinfer, deepep-hybrid, mori, nccl-ep]
       suites:
         description: "'all' or comma-list of suite names"
         type: string
         default: all
       only_sku:
-        description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all
+        description: Restrict to one GHA runner pool (h100-dgxc|h200-dgxc|b300|b200-dgxc|gb200|gb300|mi325x|mi355x); blank = all
+        type: string
+        default: ''
+      min_nodes:
+        description: Keep only shards with >= this tray count (2 = rack-scale EP8 only; blank = all)
+        type: string
+        default: ''
+      max_nodes:
+        description: Keep only shards with <= this tray count (1 = single-tray EP4 only; blank = all)
         type: string
         default: ''
       max_cases:
-        description: Max cases per shard cell (chunk larger shards)
+        description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites)
         type: string
-        default: '14'
+        default: '128'
+      flashinfer_upgrade:
+        description: Upgrade FlashInfer to the newer (MNNVL-fixed) wheel for plain flashinfer runs too (fixes h100 completion-flag deadlock)
+        type: boolean
+        default: false
 
 concurrency:
-  group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }}
+  group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.only_sku }}
   cancel-in-progress: false
 
 jobs:
@@ -47,16 +54,22 @@ jobs:
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
-      - run: pip install --quiet pyyaml
+      - name: Install matrix dependencies
+        run: python3 -m pip install --quiet PyYAML==6.0.2 jsonschema==4.23.0
       - id: gen
         working-directory: experimental/CollectiveX
         run: |
           set -euo pipefail
-          ov=""; [ "${{ inputs.backend }}" != "deepep" ] && ov="--backend ${{ inputs.backend }}"
-          v2=""; [ "${{ inputs.deepep_v2 }}" = "true" ] && v2="--deepep-v2"
-          os=""; [ -n "${{ inputs.only_sku }}" ] && os="--only-sku ${{ inputs.only_sku }}"
-          # full matrix (with cases) -> artifact for the cells; slim (no cases) -> the strategy output.
-          python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $ov $v2 $os --out matrix_full.json >/dev/null
+          args=(--suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}")
+          case "${{ inputs.backend }}" in
+            all) args+=(--backends all) ;;
+            deepep) ;;
+            *) args+=(--backend "${{ inputs.backend }}") ;;
+          esac
+          [ -n "${{ inputs.only_sku }}" ] && args+=(--only-sku "${{ inputs.only_sku }}")
+          [ -n "${{ inputs.min_nodes }}" ] && args+=(--min-nodes "${{ inputs.min_nodes }}")
+          [ -n "${{ inputs.max_nodes }}" ] && args+=(--max-nodes "${{ inputs.max_nodes }}")
+          python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null
           SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))")
           echo "matrix=$SLIM" >> "$GITHUB_OUTPUT"
           echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT"
@@ -73,19 +86,24 @@ jobs:
     if: ${{ fromJSON(needs.setup.outputs.n) > 0 }}
     strategy:
       fail-fast: false
-      max-parallel: 10            # don't saturate the ~20-runner fleet; cells queue as slots free
+      max-parallel: 10
       matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
-    # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental).
-    runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }}
+    runs-on: ${{ matrix.sku }}
     timeout-minutes: 350
     env:
       CX_BENCH: ${{ matrix.backend }}
-      CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }}
       CX_NODES: ${{ matrix.nodes }}
+      CX_GPUS_PER_NODE: ${{ matrix.gpus_per_node }}
+      CX_SCALE_UP_DOMAIN: ${{ matrix.scale_up_domain }}
       CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json
       COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
-      CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
-      CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      COLLECTIVEX_ARTIFACT_NAME: cxshard-${{ matrix.id }}-${{ github.run_id }}
+      # Consolidated shards run one build-group (currently at most six cases) in one
+      # slurm allocation, so the launcher's default 45-min --time is too short. 120 min gives headroom;
+      # the allocation releases early when the shard finishes, so short shards don't waste it.
+      CX_TIME: '120'
+      CX_FLASHINFER_UPGRADE: ${{ inputs.flashinfer_upgrade && '1' || '' }}
+      COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.id }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
@@ -103,22 +121,28 @@ jobs:
           s=[x for x in m['include'] if x['id']=='${{ matrix.id }}']
           assert s, 'shard ${{ matrix.id }} not in matrix'
           s=s[0]
-          json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w'))
+          json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w'))
           print('shard ${{ matrix.id }}:', len(s['cases']), 'cases')
           "
       - name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation)
-        env:
-          RUNNER_NAME: ${{ runner.name }}
-        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+        run: |
+          RUNNER_NAME="${{ matrix.sku }}" \
+            bash "experimental/CollectiveX/launchers/launch_${{ matrix.launcher }}.sh"
       - name: Shard summary
         if: always()
         run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
-      - name: Upload shard results
+      - name: Validate shard artifact safety
+        id: artifact_safety
         if: always()
+        run: python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json
+      - name: Upload shard results
+        if: always() && steps.artifact_safety.outcome == 'success'
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: cxshard-${{ matrix.id }}-${{ github.run_id }}
-          path: experimental/CollectiveX/results/*.json   # glob skips the hidden .shard_*.json
+          path: |
+            experimental/CollectiveX/results/*.json
+            !experimental/CollectiveX/results/env_*.json
           if-no-files-found: warn
 
   # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ----
@@ -129,26 +153,51 @@ jobs:
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
+      - name: Install bundle dependencies
+        run: python3 -m pip install --quiet PyYAML==6.0.2 jsonschema==4.23.0
+      - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: cxsweep-matrix-${{ github.run_id }}
+          path: experimental/CollectiveX
       - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
         with:
           pattern: cxshard-*-${{ github.run_id }}
           path: _shards
-          merge-multiple: true
-      - name: Aggregate shards -> one ndjson
+      # Aggregate + publication bundle. The bundle IS the artifact-validation stage:
+      # make_bundle.py validates every doc (version-selected EP schema + semantic gates) before
+      # writing manifest/report/checksums; any validation error fails this job.
+      - name: Aggregate shards -> ndjson + publication bundle
         working-directory: experimental/CollectiveX
         run: |
           set -euo pipefail
-          tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}"
+          tag="${{ inputs.backend }}"
           python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson"
+          python3 make_bundle.py \
+            --aggregate "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson" \
+            --matrix matrix_full.json \
+            --out-dir results/bundle \
+            --source-run-id "${{ github.run_id }}" \
+            --source-sha "${{ github.sha }}" \
+            --source-workflow "${{ github.workflow }}" \
+            --source-run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
           {
             echo "## CollectiveX sweep aggregate (${tag})"
             echo '```'
             wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson"
+            python3 -c "import json; m=json.load(open('results/bundle/manifest.json')); print('bundle:', m['docs'], 'docs,', m['validation']['by_publication_status'])"
             echo '```'
           } >> "$GITHUB_STEP_SUMMARY"
       - name: Upload aggregate
+        if: success()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }}
+          name: cxsweep-aggregate-${{ inputs.backend }}-${{ github.run_id }}
           path: experimental/CollectiveX/results/aggregate/*.ndjson
           if-no-files-found: warn
+      - name: Upload publication bundle
+        if: success()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: cxsweep-bundle-${{ inputs.backend }}-${{ github.run_id }}
+          path: experimental/CollectiveX/results/bundle
+          if-no-files-found: ignore
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 0000000000..f94841345c
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.pyc
+results/*
+!results/.gitkeep
+.cx_workloads/
+
+# Local plans and infrastructure inventory.
+goal.md
+notes.md
+configs/platforms.yaml
+private-infra.md
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 0000000000..57149dbaf5
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,119 @@
+# CollectiveX
+
+CollectiveX is an experimental expert-parallel (EP) communication benchmark. It compares EP libraries
+on one platform, and matched EP latency and effective logical payload bandwidth across platforms.
+
+> Publication hold: existing schema 3-5 artifacts are historical diagnostics. They cannot drive a
+> ranking, recommendation, regression baseline, or CollectiveX v1 dataset.
+
+## v1 Target
+
+The namespaced `collectivex.ep.v1` product covers H100, H200, B200, B300, GB200, GB300, MI325X, and
+MI355X with explicit topology. Headline points use the same BF16 workload, 512 observations, and
+three independent allocations. The final dataset provides:
+
+- measured roundtrip p50/p99 and independently available component latency;
+- effective logical payload GB/s, kept separate from bus or wire metrics;
+- within-chip library, portable-reference, identical-stack, and best-conforming comparisons;
+- complete accepted/failed/unsupported coverage, provenance, and repeat stability;
+- immutable locally hosted artifacts with an atomic development channel.
+
+`goal.md` is the local `/goal` execution checklist. [docs/methodology.md](docs/methodology.md) is the
+tracked technical contract and artifact architecture. `notes.md` is a local evidence ledger.
+
+## EP Backends
+
+| Backend | v1 status |
+|---|---|
+| Legacy DeepEP | Adapter uses `deep_ep.Buffer` |
+| DeepEP PR #605 V2 | Future: needs a dedicated `ElasticBuffer`/NCCL-Gin adapter |
+| DeepEP Hybrid | Adapter exists; exact API/build/timing identity required |
+| FlashInfer EP | Paired roundtrip; isolated components may be unavailable |
+| UCCL EP | Adapter exists; native build and provenance required |
+| NCCL/RCCL A2A | Portable `all_to_all_single` reference |
+| MoRI | AMD adapter exists; timing/correctness and launcher fixes remain |
+
+Historical `--deepep-v2` runs instantiated legacy `Buffer` and are not PR #605 V2 evidence. V2 is
+excluded from every workflow and promoted suite until the real adapter exists. Native NCCL EP and
+AITER EP are follow-on adapters, not aliases for the portable reference.
+
+## Workflows
+
+`.github/workflows/collectivex-sweep.yml` resolves named suites into self-hosted shard jobs and
+aggregates uploaded results. It has exactly two promoted suites:
+
+- `ep-core-v1`: 78 uniform cases and 390 token points;
+- `ep-routing-v1`: 154 Zipf/EPLB cases and 228 token points.
+
+The combined run is 39 shard cells, 232 cases, and 618 token points. Every case is normal-mode BF16
+under `layout-and-dispatch-v1`. Cached-layout (`[cl]`), runtime-visible (`[rv]`), LL, FP8, extra
+routing distributions, model envelopes, placement labels, and temporal/uneven scenarios are not v1
+sweep dimensions. Their adapter paths remain available only for explicit manual diagnostics and
+historical display.
+
+Once the real PR #605 adapter exists, its eight cells add 48 cases and 128 token points, making the
+final v1 target 47 cells, 280 cases, and 746 points.
+
+`.github/workflows/collectivex-experimental.yml` is manual bring-up. Both workflows stop at GitHub
+artifacts; neither updates the frontend or any external store. Results remain diagnostic until v1
+validation, exact coverage, repeat stability, and local promotion gates land.
+
+Workflows map public SKU labels to launchers explicitly and never persist the physical runner name.
+Container images and digests live in `runtime/common.sh`; the public GHA SKU and build capability
+table lives in `capability.py`. Private host inventory is never part of generation.
+
+## Runner Configuration
+
+Each self-hosted runner sources one operator-owned shell file outside the checkout. The default is
+`$XDG_CONFIG_HOME/inferencex/collectivex.env` (or `~/.config/inferencex/collectivex.env`); set
+`COLLECTIVEX_OPERATOR_CONFIG` to use another location. Required exported variables are:
+
+| Public SKU | Required variables |
+|---|---|
+| `h100-dgxc`, `b200-dgxc` | `CX_PARTITION`, `CX_ACCOUNT`, `CX_SQUASH_DIR` |
+| `h200-dgxc` | `CX_PARTITION`, `CX_SQUASH_DIR` |
+| `b300`, `gb200` | `CX_PARTITION`, `CX_ACCOUNT`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` |
+| `gb300` | `CX_PARTITION`, `CX_ACCOUNT`, `CX_SQUASH_DIR`, `CX_STAGE_DIR`, `CX_ENROOT_CACHE_PATH` |
+| `mi325x`, `mi355x` | `CX_PARTITION`, `CX_SQUASH_DIR` |
+
+`CX_EXCLUDE_NODES`, `CX_NODELIST`, `CX_ACCOUNT` (where optional), `CX_STAGE_DIR` (where optional),
+`CX_LOCK_DIR`, and `CX_IMAGE` are deployment overrides. The config file and `env_*.json` captures are
+never uploaded as workflow artifacts.
+
+## Local Checks
+
+```bash
+python3 -m unittest discover experimental/CollectiveX/tests -p 'test_*.py'
+python3 experimental/CollectiveX/sweep_matrix.py \
+  --suites ep-core-v1 --backends deepep,nccl-ep --only-sku h100-dgxc \
+  --out /tmp/collectivex-matrix.json >/dev/null
+bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh
+```
+
+These exercise the current implementation; they do not promote data.
+
+## Main Files
+
+| Path | Role |
+|---|---|
+| `capability.py`, `configs/` | Public backend/platform capabilities and workload/suite registries |
+| `sweep_matrix.py`, `generate_matrix.py` | Suite and shard resolution |
+| `tests/ep_harness.py`, `tests/run_ep.py` | Shared EP execution |
+| `tests/ep_*.py` | Backend adapters; the independent v1 oracle is not yet wired |
+| `validate_results.py` | Strict result validation |
+| `aggregate_results.py` | Per-run outcome projection; the private attempt ledger is still pending |
+| `make_bundle.py` | Bundle construction; authoritative publisher still pending |
+| `docs/methodology.md` | v1 contract, comparability, evidence, and isolated storage |
+
+## Isolated Storage
+
+Development storage is one self-hosted persistent filesystem. GitHub artifacts are transient input;
+there is no Vercel, GCP, Neon, managed database, or managed object store. Private run bundles and
+sanitized public datasets are immutable and content-addressed; only a validated `dev-latest` pointer
+is updated atomically.
+
+## Current Status
+
+Fixed-512 scheduling is present. The v1 schema/identity, backend correctness fixes, exact coverage,
+three-allocation stability, local publisher, and frontend channel ingestion remain active work. No
+current row is approved for a public library or chip ranking.
diff --git a/experimental/CollectiveX/aggregate_results.py b/experimental/CollectiveX/aggregate_results.py
new file mode 100644
index 0000000000..5996f152f2
--- /dev/null
+++ b/experimental/CollectiveX/aggregate_results.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""CollectiveX — result aggregator (the end-of-sweep collector).
+
+The sweep workflow (collectivex-sweep.yml) fans out one matrix CELL per SHARD
+(platform × backend × mode × resource), each cell sweeping its cases in a single
+allocation and emitting a handful of per-case result JSONs. Instead of leaving
+thousands of individual files scattered across the repo, this aggregator COLLECTS
+every shard's results into ONE compact line-delimited file:
+
+    results/aggregate/collectivex_ep.ndjson     # one result doc per line
+
+That aggregate is a transient input to bundle validation and the future local
+publisher; the per-case JSONs stay inside the run as transient shard intermediates.
+Across everything under --in-dir, a config that was re-run keeps only its NEWEST usable doc
+(newest generated_at with publication_status/status in official|comparable-experimental|valid);
+a config with no usable doc keeps its newest record, so genuinely-failed configs are preserved.
+The hygiene rule is folded into the merge so the aggregate is already canonical.
+
+  python3 aggregate_results.py --in-dir <shards_root> --out results/aggregate/collectivex_ep.ndjson
+
+Stdlib only.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+
+USABLE = {"official", "comparable-experimental", "valid"}  # statuses that _usable() accepts
+
+
+def _first(*values):
+    """Return the first argument that is not None (False and 0 are kept), or None if none is."""
+    return next((value for value in values if value is not None), None)
+
+
+def _failed_key(d: dict) -> str:
+    """Scheduled identity for legacy failed records that predate top-level ``case_id``."""
+    failure = d.get("failure") if isinstance(d.get("failure"), dict) else {}
+    raw_case = failure.get("case") if isinstance(failure.get("case"), dict) else {}
+    case = dict(raw_case)  # copy: the fallback fill below must not mutate the input doc
+    shape = d.get("shape") if isinstance(d.get("shape"), dict) else {}
+    quant = shape.get("quant") if isinstance(shape.get("quant"), dict) else {}
+    eplb_doc = d.get("eplb")
+    eplb = eplb_doc.get("enabled") if isinstance(eplb_doc, dict) else eplb_doc
+    workload = d.get("workload_name")
+    if workload is None:
+        workload_doc = d.get("workload")
+        workload = (workload_doc.get("workload_id") if isinstance(workload_doc, dict)
+                    else workload_doc)
+    routing_doc = d.get("routing_profile")
+    routing = routing_doc.get("routing") if isinstance(routing_doc, dict) else routing_doc
+
+    # Current failed records already carry these fields in failure.case. Top-level aliases keep
+    # older records distinct whenever that scheduled identity was available there instead.
+    fallbacks = {
+        "suite": d.get("suite"),
+        "workload": workload,
+        "backend": d.get("backend"),
+        "phase": d.get("phase"),
+        "ep": d.get("ep_size"),
+        "mode": d.get("mode"),
+        "dispatch_dtype": _first(shape.get("dispatch_dtype"), d.get("dispatch_dtype")),
+        "contract": d.get("measurement_contract"),
+        "routing": _first(shape.get("routing"), routing),
+        "eplb": eplb,
+        "combine_quant_mode": _first(quant.get("combine_quant_mode"),
+                                      d.get("combine_quant_mode")),
+        "resource_mode": d.get("resource_mode"),
+        "tokens_ladder": _first(
+            (d.get("reproduction") or {}).get("tokens_ladder")
+            if isinstance(d.get("reproduction"), dict) else None,
+            d.get("tokens_ladder"),
+        ),
+    }
+    for field, value in fallbacks.items():
+        case[field] = _first(case.get(field), value)  # failure.case wins; aliases fill gaps
+
+    identity = {
+        "family": d.get("family"),
+        "runner": d.get("runner"),
+        "topology_class": d.get("topology_class"),
+        "case": case,
+    }
+    return "failed:" + json.dumps(identity, sort_keys=True, separators=(",", ":"))
+
+
+def _key(d: dict) -> str:
+    """Config identity for newest-per-config merging; assumes any "groups" is a list of dicts."""
+    if d.get("case_id"):
+        return "case:" + str(d["case_id"])
+    if d.get("comparison_key"):
+        return str(d["comparison_key"])
+    keys = [g.get("comparison_key") for g in d.get("groups", []) if g.get("comparison_key")]
+    if keys:
+        return "|".join(sorted(str(k) for k in keys))
+    if d.get("record_type") == "failed-case":
+        return _failed_key(d)
+    return "|".join(str(d.get(k, "")) for k in ("family", "runner", "backend", "phase",
+                                                "measurement_contract"))
+
+
+def _usable(d: dict) -> bool:
+    return (d.get("publication_status") or d.get("status")) in USABLE  # falls back to "status"
+
+
+def _document(value, source: str) -> dict:
+    if not isinstance(value, dict):  # arrays/scalars parse as JSON but are not result documents
+        raise SystemExit(f"aggregate: {source} is not a JSON object")
+    return value
+
+
+def _iter_docs(in_dir: str):
+    """Yield (source, doc) for every result doc under in_dir — both per-file *.json and
+    line-delimited *.ndjson (so aggregates can be re-merged idempotently)."""
+    for root, _dirs, files in sorted(os.walk(in_dir)):  # sorted: reproducible merge order
+        for f in sorted(files):
+            if f.startswith("env_") or f == "analysis.json":
+                continue
+            p = os.path.join(root, f)
+            if f.endswith(".ndjson"):
+                with open(p, encoding="utf-8") as fh:  # JSON text is UTF-8, not locale text
+                    for line_number, line in enumerate(fh, 1):
+                        line = line.strip()
+                        if line:
+                            try:
+                                value = json.loads(line)
+                            except json.JSONDecodeError as exc:
+                                raise SystemExit(
+                                    f"aggregate: malformed NDJSON at {p}:{line_number}: {exc}"
+                                ) from exc
+                            yield p, _document(value, f"{p}:{line_number}")
+            elif f.endswith(".json"):
+                try:
+                    with open(p, encoding="utf-8") as fh:
+                        value = json.load(fh)
+                except (OSError, ValueError) as exc:  # also covers UTF-8 decode errors
+                    raise SystemExit(f"aggregate: malformed JSON at {p}: {exc}") from exc
+                yield p, _document(value, p)
+
+
+def aggregate(in_dir: str) -> list:
+    """Collect every result doc under ``in_dir`` into one newest outcome per config.
+
+    The newest usable doc wins; a config with no usable doc keeps its newest record.
+    """
+    def stamp(d: dict) -> str:
+        # A null or non-string generated_at must not abort the merge with a TypeError.
+        return str(d.get("generated_at") or "")
+
+    groups: dict = {}
+    for _src, d in _iter_docs(in_dir):
+        groups.setdefault(_key(d), []).append(d)
+    # max() returns the first of equally-new docs, i.e. the earliest one that was read.
+    return [max([d for d in docs if _usable(d)] or docs, key=stamp)
+            for docs in groups.values()]
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX result aggregator")
+    ap.add_argument("--in-dir", default="results", help="root to walk for shard result files")
+    ap.add_argument("--out", default="results/aggregate/collectivex_ep.ndjson")
+    a = ap.parse_args()
+
+    docs = aggregate(a.in_dir)  # fully collected before --out (possibly under --in-dir) opens
+    os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
+    with open(a.out, "w") as fh:
+        for d in docs:
+            fh.write(json.dumps(d, separators=(",", ":")) + "\n")
+    skus = sorted({str(d.get("runner", "?")).split("_")[0].split("-")[0] for d in docs})
+    backs = sorted({str(d.get("backend") or d.get("op") or "?") for d in docs})
+    print(f"aggregate: {len(docs)} docs -> {a.out}  (SKUs={skus} backends={backs})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/artifact_safety.py b/experimental/CollectiveX/artifact_safety.py
new file mode 100644
index 0000000000..222c43fd02
--- /dev/null
+++ b/experimental/CollectiveX/artifact_safety.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Fail-closed privacy check for CollectiveX public result documents."""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+
+
+SENSITIVE_FIELDS = frozenset({  # compared against keys after _normalized_field() folding
+    "environment", "env", "host", "hostname", "uuid", "gpu_uuid", "device_uuid",
+    "pci_bus_id", "ip_address", "ip_addresses", "master_addr", "ssh", "ssh_target",
+    "nodelist", "node_list", "nic_guid", "ib_guid", "topology_matrix", "rdma_devices",
+})
+SENSITIVE_VALUE_PATTERNS = (  # (rule name, regex) pairs; the first rule that matches is reported
+    ("private-path", re.compile(r"(?:^|[\s=:])/(?:home|mnt|workspace|root|Users|tmp)/")),
+    ("ipv4-address", re.compile(r"(?<!\d)(?:\d{1,3}\.){3}\d{1,3}(?!\d)")),
+    ("ipv6-or-pci-address", re.compile(
+        r"(?:\b[0-9a-f]{1,4}(?::[0-9a-f]{1,4}){2,}\b|(?<![\w:])(?:[0-9a-f]{0,4}:){2,}[0-9a-f]{0,4}(?![\w:]))",
+        re.I,
+    )),
+    ("uuid", re.compile(
+        r"\b(?:GPU-|MIG-)?[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
+        re.I,
+    )),
+    ("ssh-target", re.compile(r"(?:ssh://|\b[^\s/@]+@[^\s/]+)")),
+    ("host-identifier", re.compile(r"\b(?:host|hostname|master_addr|nodelist)=[^\s]+", re.I)),
+    ("secret-token", re.compile(
+        r"(?:gh[pousr]_[A-Za-z0-9]{20,}|Bearer\s+[A-Za-z0-9._-]{16,}|AKIA[0-9A-Z]{16})"
+    )),
+)
+
+
+def _normalized_field(value: object) -> str:
+    return str(value).strip().lower().replace("-", "_")  # e.g. " GPU-UUID " -> "gpu_uuid"
+
+
+def _sensitive_value_rule(value: str) -> str | None:  # name of the first matching rule, if any
+    return next((name for name, pattern in SENSITIVE_VALUE_PATTERNS if pattern.search(value)), None)
+
+
+def assert_publication_safe(docs: list[dict]) -> None:
+    """Reject private infrastructure fields and value shapes, in keys as well as values."""
+    def walk(value, doc_index: int) -> None:
+        if isinstance(value, dict):
+            for key in value:
+                field = _normalized_field(key)
+                if field in SENSITIVE_FIELDS:
+                    raise SystemExit(
+                        f"artifact safety: doc[{doc_index}] contains forbidden field {field!r}"
+                    )
+            value = [*map(str, value), *value.values()]  # keys can leak addresses/paths too
+        if isinstance(value, list):
+            for child in value:
+                walk(child, doc_index)
+        elif isinstance(value, str):
+            rule = _sensitive_value_rule(value)
+            if rule:
+                raise SystemExit(
+                    f"artifact safety: doc[{doc_index}] contains forbidden {rule} value"
+                )
+
+    for index, doc in enumerate(docs):
+        if not isinstance(doc, dict):
+            raise SystemExit(f"artifact safety: doc[{index}] is not a JSON object")
+        walk(doc, index)
+
+
+def load_documents(paths: list[str]) -> list[dict]:
+    docs: list[dict] = []  # parsed JSON values, in input order
+    for path in paths:
+        if os.path.basename(path).startswith("env_"):
+            continue  # env_* files are never loaded as public result documents
+        if not os.path.isfile(path):
+            raise SystemExit(f"artifact safety: result file not found: {path}")
+        with open(path, encoding="utf-8") as fh:  # JSON text is UTF-8; avoid the locale default
+            if path.endswith(".ndjson"):
+                for line_number, line in enumerate(fh, 1):
+                    if not line.strip():
+                        continue
+                    try:
+                        docs.append(json.loads(line))
+                    except json.JSONDecodeError as exc:
+                        raise SystemExit(
+                            f"artifact safety: malformed NDJSON at {path}:{line_number}: {exc}"
+                        ) from exc
+            else:
+                try:
+                    docs.append(json.load(fh))
+                except json.JSONDecodeError as exc:
+                    raise SystemExit(f"artifact safety: malformed JSON at {path}: {exc}") from exc
+    if not docs:
+        raise SystemExit("artifact safety: no public result documents found")
+    return docs
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Check CollectiveX result artifacts for private data")
+    parser.add_argument("paths", nargs="+")
+    args = parser.parse_args()
+    docs = load_documents(args.paths)  # raises SystemExit when no document could be loaded
+    assert_publication_safe(docs)  # raises SystemExit on the first forbidden field or value
+    print(f"artifact safety: {len(docs)} public document(s) passed")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/capability.py b/experimental/CollectiveX/capability.py
new file mode 100644
index 0000000000..21be0025a0
--- /dev/null
+++ b/experimental/CollectiveX/capability.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""Public CollectiveX runner and EP backend capability registry."""
+from __future__ import annotations
+
+import argparse
+import json
+
+
+# Keys are exact GitHub Actions ``runs-on`` labels. Hostnames, addresses, scheduler
+# accounts, and filesystem paths belong in runner-local configuration, never here.
+PLATFORMS = {
+    "h100-dgxc": {
+        "vendor": "nvidia", "gpus_per_node": 8, "scale_up_domain": 8, "ep_degrees": (8,),
+        "launcher": "h100-dgxc-slurm",
+    },
+    "h200-dgxc": {
+        "vendor": "nvidia", "gpus_per_node": 8, "scale_up_domain": 8, "ep_degrees": (8,),
+        "launcher": "h200",
+    },
+    "b200-dgxc": {
+        "vendor": "nvidia", "gpus_per_node": 8, "scale_up_domain": 8, "ep_degrees": (8,),
+        "modes": ("normal",), "launcher": "b200-dgxc",  # optional "modes" narrows resolve()
+    },
+    "b300": {
+        "vendor": "nvidia", "gpus_per_node": 8, "scale_up_domain": 8, "ep_degrees": (8,),
+        "modes": ("normal",), "launcher": "b300",
+    },
+    "gb200": {
+        "vendor": "nvidia", "gpus_per_node": 4, "scale_up_domain": 72,
+        "ep_degrees": (4, 8), "launcher": "gb200-nv",
+    },
+    "gb300": {
+        "vendor": "nvidia", "gpus_per_node": 4, "scale_up_domain": 72,
+        "ep_degrees": (4, 8), "launcher": "gb300-nv",
+    },
+    "mi325x": {
+        "vendor": "amd", "gpus_per_node": 8, "scale_up_domain": 8,
+        "ep_degrees": (8,), "launcher": "mi325x-amds",
+    },
+    "mi355x": {
+        "vendor": "amd", "gpus_per_node": 8, "scale_up_domain": 8,
+        "ep_degrees": (8,), "launcher": "mi355x-amds",
+    },
+}
+
+ALL_ROUTINGS = ("uniform", "balanced", "balanced-rank-local", "zipf", "hotspot-single")
+ALL_ACTIVATIONS = ("normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation")
+
+
+def _backend(vendors, modes, dtypes, contracts, transports, *, combine_dtypes=("bf16",),
+             quant_modes=("none",), quant_combine_arch=None):
+    result = {
+        "vendors": tuple(vendors),
+        "modes": tuple(modes),
+        "dtypes": tuple(dtypes),
+        "contracts": tuple(contracts),
+        "transports": tuple(transports),
+        "combine_dtypes": tuple(combine_dtypes),
+        "quant_modes": tuple(quant_modes),
+        "routings": ALL_ROUTINGS,  # not a parameter: identical for every backend entry
+        "eplb": True,  # likewise fixed for every backend entry
+        "activation_profiles": ALL_ACTIVATIONS,
+    }
+    if quant_combine_arch:  # the key is omitted entirely when no arch restriction is given
+        result["quant_combine_arch"] = quant_combine_arch
+    return result
+
+
+LAYOUT = "layout-and-dispatch-v1"  # default contract for resolve() and the CLI
+DIAGNOSTIC_CONTRACTS = (LAYOUT, "cached-layout-comm-only-v1", "runtime-visible-v1")
+CAP = {  # backend name -> capability dict built by _backend()
+    "deepep": _backend(
+        ("nvidia",), ("normal", "ll"),
+        ("bf16", "fp8", "fp8-pertoken", "fp8-directcast"),
+        DIAGNOSTIC_CONTRACTS, ("nvlink", "rdma"),
+    ),
+    "uccl": _backend(
+        ("nvidia",), ("normal", "ll"), ("bf16", "fp8"),
+        DIAGNOSTIC_CONTRACTS, ("nvlink", "rdma"),
+    ),
+    "flashinfer": _backend(
+        ("nvidia",), ("normal",),
+        ("bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"),
+        (LAYOUT,), ("nvlink", "mnnvl"),
+        combine_dtypes=("bf16", "fp8", "nvfp4"),
+        quant_modes=("none", "fp8", "nvfp4"),
+        quant_combine_arch="blackwell",
+    ),
+    "deepep-hybrid": _backend(
+        ("nvidia",), ("normal",), ("bf16",), (LAYOUT,), ("nvlink",),
+    ),
+    "mori": _backend(
+        ("amd",), ("normal",), ("bf16", "fp8"), (LAYOUT,), ("xgmi", "rdma"),
+    ),
+    "nccl-ep": _backend(
+        ("nvidia", "amd"), ("normal",), ("bf16",), (LAYOUT,),
+        ("nvlink", "mnnvl", "rdma", "xgmi"),
+    ),
+}
+
+NVIDIA_SWEEP_BACKENDS = ("deepep", "uccl", "flashinfer", "deepep-hybrid", "nccl-ep")
+SWEEP_BACKENDS = NVIDIA_SWEEP_BACKENDS + ("mori",)
+AARCH64_SKUS = {"gb200", "gb300"}
+RUNNER_WALLS = {  # (sku, backend) -> reason resolve() rejects that pair
+    ("h200-dgxc", "flashinfer"): "runner container lacks the process capability required by MoeAlltoAll",
+}
+ARCH_ONLY_DTYPES = {"nvfp4": "blackwell", "mxfp4": "blackwell"}  # dtype -> required _sku_arch()
+
+
+def _sku_arch(sku: str) -> str:  # label prefix -> architecture family, "unknown" if no match
+    if sku.startswith(("gb", "b2", "b3")):
+        return "blackwell"
+    if sku.startswith(("h100", "h200")):
+        return "hopper"
+    if sku.startswith("mi3"):
+        return "cdna"
+    return "unknown"
+
+
+def resolve(sku, backend, mode="normal", dtype="bf16", contract=LAYOUT,
+            combine_dtype="bf16", combine_quant_mode="none", routing="uniform",
+            eplb=False, activation_profile="normal"):
+    """Return ``(ok, reason)``: can this EP combination be dispatched on a public runner label?"""
+    platform = PLATFORMS.get(sku or "")  # a missing sku (None) is looked up as ""
+    if platform is None:
+        return False, f"unknown GHA runner label '{sku}'"
+    backend_cap = CAP.get(backend)
+    if backend_cap is None:
+        return False, f"unknown EP backend '{backend}'"
+    if platform["vendor"] not in backend_cap["vendors"]:
+        return False, f"{backend} does not run on {platform['vendor']}"
+    wall = RUNNER_WALLS.get((sku, backend))
+    if wall:
+        return False, f"runner environment wall: {wall}"
+    if backend == "uccl" and sku in AARCH64_SKUS:
+        return False, "uccl EP has no aarch64 build"
+    platform_modes = platform.get("modes")  # optional per-platform mode restriction
+    if platform_modes and mode not in platform_modes:
+        return False, f"{sku} modes={platform_modes} (got '{mode}')"
+    if mode not in backend_cap["modes"]:
+        return False, f"{backend} modes={backend_cap['modes']} (got '{mode}')"
+    if dtype not in backend_cap["dtypes"]:
+        return False, f"{backend} dispatch dtypes={backend_cap['dtypes']} (got '{dtype}')"
+    required_arch = ARCH_ONLY_DTYPES.get(dtype)
+    if required_arch and _sku_arch(sku) != required_arch:
+        return False, f"{dtype} dispatch requires {required_arch}"
+    if contract not in backend_cap["contracts"]:
+        return False, f"{backend} contracts={backend_cap['contracts']} (got '{contract}')"
+    if mode == "ll" and contract == "cached-layout-comm-only-v1":
+        return False, "cached-layout is not defined for LL"
+    if combine_dtype not in backend_cap["combine_dtypes"]:
+        return False, f"{backend} combine dtypes={backend_cap['combine_dtypes']}"
+    required_arch = ARCH_ONLY_DTYPES.get(combine_dtype)  # same arch rule as the dispatch dtype
+    if required_arch and _sku_arch(sku) != required_arch:
+        return False, f"{combine_dtype} combine requires {required_arch}"
+    if combine_quant_mode not in backend_cap["quant_modes"]:
+        return False, f"{backend} combine quant modes={backend_cap['quant_modes']}"
+    quant_arch = backend_cap.get("quant_combine_arch")
+    if combine_quant_mode != "none" and quant_arch and _sku_arch(sku) != quant_arch:
+        return False, f"{backend} quantized combine requires {quant_arch}"
+    if routing not in backend_cap["routings"]:
+        return False, f"{backend} routings={backend_cap['routings']}"
+    if eplb and not backend_cap["eplb"]:
+        return False, f"{backend} does not support EPLB"
+    if activation_profile not in backend_cap["activation_profiles"]:
+        return False, f"{backend} activation profiles={backend_cap['activation_profiles']}"
+    return True, "ok"  # every check above passed
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="CollectiveX EP capability resolver")
+    parser.add_argument("--sku")
+    parser.add_argument("--backend")
+    parser.add_argument("--mode", default="normal")
+    parser.add_argument("--dtype", default="bf16")
+    parser.add_argument("--contract", default=LAYOUT)
+    parser.add_argument("--combine-dtype", default="bf16")
+    parser.add_argument("--combine-quant-mode", default="none")
+    parser.add_argument("--routing", default="uniform")
+    parser.add_argument("--eplb", action="store_true")
+    parser.add_argument("--activation-profile", default="normal")
+    parser.add_argument("--list", action="store_true")
+    parser.add_argument("--launcher-for", metavar="SKU")
+    args = parser.parse_args()
+    if args.list:  # --list takes precedence over --launcher-for and the resolve path
+        print(json.dumps({"platforms": PLATFORMS, "backends": CAP}, indent=2))
+        return 0
+    if args.launcher_for:
+        platform = PLATFORMS.get(args.launcher_for)
+        if platform is None:
+            parser.error(f"unknown GHA runner label: {args.launcher_for}")
+        print(platform["launcher"])
+        return 0
+    ok, reason = resolve(
+        args.sku, args.backend, args.mode, args.dtype, args.contract, args.combine_dtype,
+        args.combine_quant_mode, args.routing, args.eplb, args.activation_profile,
+    )
+    print(f"{'VALID' if ok else 'INVALID'}: {reason}")
+    return 0 if ok else 3  # exit status 3 marks an INVALID combination
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml
new file mode 100644
index 0000000000..32b37c37c7
--- /dev/null
+++ b/experimental/CollectiveX/configs/suites.yaml
@@ -0,0 +1,53 @@
+# CollectiveX v1 promoted suites. Diagnostic adapter capabilities stay available through the
+# manual workflow, but are not multiplied across the comparison matrix.
+schema_version: 1
+
+timing_profile:
+  iters: 8
+  trials: 64
+  warmup: 32
+  warmup_semantics: full-roundtrip-per-trial-point-v1
+
+headline_distribution:
+  routing: uniform
+  basis: synthetic
+  rationale: >-
+    Uniform is the deterministic cross-chip headline. One Zipf trace measures skew sensitivity;
+    the same trace with EPLB measures remediation. Other synthetic routes are diagnostics, not
+    promoted dimensions.
+  sensitivity_distributions: [zipf, zipf+eplb]
+
+suites:
+  ep-core-v1:
+    description: "portable BF16 EP comparison across every supported stack and topology"
+    workloads: [deepseek-v3-v1]
+    platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [uniform]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    token_points_prefill: [256, 512]
+    canonical: true
+    samples_per_point: 512
+    required_publication: official
+
+  ep-routing-v1:
+    description: "Zipf skew and EPLB recovery at decision-relevant anchors"
+    workloads: [deepseek-v3-v1]
+    platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+    backends: [deepep, mori]
+    modes: [normal]
+    dtypes: [bf16]
+    contracts: [layout-and-dispatch-v1]
+    routings: [zipf]
+    eplb: [false, true]
+    resource_modes: [tuned]
+    phases: [decode, prefill]
+    token_points_decode: [128]
+    token_points_prefill: [512, 2048]
+    canonical: true
+    samples_per_point: 512
+    required_publication: comparable-experimental
diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml
new file mode 100644
index 0000000000..a67aeaa2bd
--- /dev/null
+++ b/experimental/CollectiveX/configs/workloads.yaml
@@ -0,0 +1,26 @@
+# CollectiveX v1 canonical workload and phase metadata.
+schema_version: 1
+
+model_derived:
+  deepseek-v3-v1:
+    kind: model-derived
+    hidden: 7168
+    topk: 8
+    routed_experts: 256
+    shared_experts: 1
+    expert_alignment: 128
+    dispatch_dtype: bf16
+    combine_dtype: bf16
+    verified_against: "deepseek-ai/DeepSeek-V3@e815299b0bcbac849fa540c768ef21845365c9eb/config.json"
+
+phase_profiles:
+  decode:
+    token_ladder: [1, 2, 4, 8, 16, 32, 64, 128]
+    description: "one MoE layer and one decode-step dispatch/combine pair"
+    active_sequences: "one batch of active sequences"
+    tokens_per_iter: "one or a few per active sequence"
+  prefill:
+    token_ladder: [128, 256, 512, 1024, 2048, 4096]
+    description: "one MoE layer and one chunked-prefill dispatch/combine pair"
+    chunk_size: "the tokens/rank point entering the MoE layer"
+    tokens_entering_moe: "chunk_size * ep_size"
diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md
new file mode 100644
index 0000000000..12ccdc3328
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology.md
@@ -0,0 +1,266 @@
+# CollectiveX EP v1 Technical Design
+
+This is the tracked technical design for new CollectiveX expert-parallel results. Active work and
+exit criteria live in `../goal.md`; historical run narratives are evidence, not contract.
+
+The result namespace is `collectivex.ep.v1`. New producers must use it end to end: matrix,
+benchmark, bundle, projection, and frontend. Numeric schemas 3 through 5 are import-only legacy.
+
+## Product boundary
+
+CollectiveX measures MoE dispatch, combine, and their paired roundtrip so users can:
+- compare EP libraries on one chip and topology;
+- compare EP latency and logical payload bandwidth across chips at the same logical workload; and
+- inspect failures, unsupported cells, topology effects, and tail stability without contaminating rankings.
+
+This is a communication microbenchmark. It does not claim to predict serving throughput unless a
+separate end-to-end correlation study demonstrates that relationship.
+
+## Record model
+
+Each JSON result document has `format: "collectivex.ep.v1"` and exactly one terminal outcome per
+expected case. Unknown fields, invalid enums, missing nested identity, or zero parsed documents fail.
+
+Required top-level groups are:
+- `case`: stable case ID, suite membership, required evidence tier, and swept coordinate;
+- `workload`: logical MoE shape and canonical routing identity;
+- `measurement`: timing boundary, sampling schedule, component availability, and byte accounting;
+- `implementation`: library, instantiated API, build, runtime, and resource identity;
+- `topology`: requested and realized placement and transport;
+- `provenance`: source, image, loaded libraries, allocation, attempt, and timestamps;
+- `rows`: per-point latency, bandwidth, correctness, and tail evidence; and
+- `outcome`: `success`, `failed`, `invalid`, `diagnostic`, or `unsupported`, with reasons.
+
+Raw samples and private environment data live in the immutable run bundle, not the public row; every
+result and failure retains its case ID and attempt ID.
+
+## Workload contract
+
+A workload is generated once over the global token batch. Every rank materializes only its assigned
+slice; adapters may not generate their own routing. The serialized canonical workload includes:
+
+- phase, tokens per rank, hidden size, top-k, expert count, EP size, and source-token allocation;
+- dispatch and combine dtypes, quantization/scaling layout, alignment, and capacity policy;
+- routing distribution, seed, routing step, expert placement, EPLB mapping, and trace checksum; and
+- exact input values, gate weights, expected receive counts, and oracle version.
+
+The headline shape is DeepSeek-V3-like (`hidden=7168`, `top_k=8`, `experts=256`), but every shape is
+named and checksummed. Decode and prefill are distinct cases; dropped points are terminal outcomes.
+
+## Promoted v1 matrix
+
+The promoted matrix is intentionally finite:
+
+- `ep-core-v1`: uniform routing, the full decode ladder, and prefill 256/512 (T=128 is measured once
+  in the decode ladder because phase does not change the kernel);
+- `ep-routing-v1`: one Zipf trace with EPLB off/on at decode 128 and prefill 512/2048; and
+- 39 runnable stack/topology cells, producing 232 cases and 618 token points before repeat allocations.
+
+Every promoted case is normal mode, BF16 dispatch/combine, backend-tuned resources, canonical
+`deepseek-v3-v1`, and `layout-and-dispatch-v1`. Balanced, rank-local, hotspot, heavier Zipf, temporal,
+uneven-token, model-envelope, placement, scaling, and quantized-combine sweeps are manual diagnostics
+or follow-on studies, not missing v1 coverage.
+
+DeepEP PR #605 V2 is not a runnable v1 cell yet. Historical V2-labelled runs used legacy `Buffer`;
+the real `ElasticBuffer` adapter must land before V2 re-enters the matrix. It will add eight cells,
+48 cases, and 128 points, yielding the final 47-cell/280-case/746-point v1 target.
+
+## Measurement contracts
+
+The timing boundary is named and immutable. Implementations advertise supported contracts; an
+unsupported pairing must fail before allocation or emit `unsupported` without timing.
+
+### `layout-and-dispatch-v1`
+
+Dispatch includes routing-layout generation and communication. Input quantization and receive-side
+dequantization are outside the timed region. This is the common library-comparison boundary only
+when every selected adapter can implement the same start and stop states.
+
+### `cached-layout-comm-only-v1`
+
+The exact routing layout or handle is prepared and validated before timing, then reused. The timer
+covers dispatch from that cached state, which may still include packing, local movement, handle work,
+and communication. Handle reuse is bound to the routing checksum. This contract is never overlaid
+with a layout-inclusive result.
+
+### `runtime-visible-v1`
+
+Timing starts at the runtime-visible input state and ends when the expert input or combined token
+output is consumable. Any cast, scale generation, layout, dequantization, event wait, or staging
+inside that boundary is recorded in `stage_scope` and timed consistently for isolated components
+and paired roundtrip.
+
+Only `layout-and-dispatch-v1` enters the promoted v1 matrix. Cached-layout (`[cl]`) is a decomposition
+diagnostic, not a communication-only portable contract. Runtime-visible (`[rv]`) duplicates the BF16
+path and is retained only for a future targeted quantization-cost study. Native low-latency (LL)
+remains manual until it has matched normal-mode semantics, correct byte accounting, one honest timing
+contract, and evidence-gated platform support. Legacy `[cl]`, `[rv]`, and LL rows remain importable and
+displayable but cannot rank or recommend.
+
+### Component semantics
+
+`dispatch`, `combine`, and `roundtrip` each have `availability`, `origin`, `start_state`, and
+`end_state`. Unmeasured components are null. A paired-only implementation, such as a stateful
+roundtrip protocol, must not copy roundtrip samples into dispatch or combine. `isolated_sum` is a
+derived diagnostic and is never a measured latency, throughput denominator, or recommendation.
+
+## Sampling and timing
+
+Every scored point uses `fixed-512-v1`:
+
+- 64 trials;
+- 8 timed iterations per trial, for 512 observations per measured component; and
+- 32 synchronized, untimed, full dispatch-stage-combine warmups immediately before each
+  trial and point.
+
+The realized point order, warmup schedule, retry policy, attempt count, and all failed attempts are
+recorded. Backend-specific warmup or sampling changes create a different contract and cannot enter
+the same contrast.
+
+Device work is timed with events on the stream that performs the work, with explicit dependencies
+for multi-stream operations. Host monotonic time is retained as a diagnostic. Each iteration is
+reduced by maximum latency across ranks before percentiles are computed. Report p50, p90, p95, and
+p99; measured roundtrip p99 is the headline configuration latency.
+
+Retries never replace earlier attempts. Selection rules operate on the full attempt history so a
+successful retry cannot hide instability or bias a curve. Tail gates use suite-versioned thresholds
+for p99/p50, exceedance rate, adjacent-point discontinuity, and cross-allocation variation; a failed
+tail gate makes the point diagnostic.
+
+## Correctness
+
+Correctness uses an implementation-independent oracle. For each routed token copy it verifies the
+destination rank, expert, source token, multiplicity, gate weight, and source-order reconstruction.
+A deterministic expert-specific transform ensures that routing to the wrong expert cannot pass as
+an identity roundtrip.
+
+For every rank and point, the benchmark must:
+
+1. verify expected and realized receive counts;
+2. validate dispatch metadata and payload against the oracle;
+3. validate combine output against the oracle before timing;
+4. run all timed samples without mutating the semantic input; and
+5. validate payload and metadata again after timing.
+
+Quantized paths declare the exact format, scale layout, accumulation behavior, absolute and relative
+tolerances, and the reason for each tolerance. A whole document cannot be marked correct from one
+implementation or one pre-timing smoke check. Any failed rank or point prevents that case from being
+comparison eligible.
+
+## Latency and bandwidth
+
+All latency fields use microseconds. The document records the formula and byte-accounting version
+for each bandwidth field.
+
+- `logical_payload_bytes` counts actual routed activation and required scale bytes at the named
+  operation boundary. Metadata and padding are reported separately.
+- `logical_bandwidth_Bps = logical_payload_bytes / measured_latency_seconds` for that operation.
+- paired roundtrip accounting records dispatch and combine payload separately before summing them;
+- `roundtrip_tokens_per_second` uses measured paired roundtrip, never `isolated_sum`;
+- primitive `algbw` and operation-adjusted `busbw` remain primitive-specific metrics; and
+- physical wire utilization is null unless measured transport counters support it.
+
+Logical payload bandwidth is useful for comparing the same EP semantics. It is not physical link
+bandwidth and must not be labeled as such. Charts expose byte definitions, units, and denominators.
+
+## Identity and controlled comparisons
+
+Identity is canonical JSON hashed with SHA-256. Three related IDs avoid hiding differences:
+- `series_id`: all locked factors except the swept token coordinate and repeat allocation;
+- `point_id`: `series_id` plus the swept coordinate; and
+- `evidence_id`: `point_id` plus allocation, run, attempt, and sample-set checksum.
+
+Locked factors include workload bytes and routing; measurement contract and component states;
+sampling, order, warmups, and retries; requested and achieved resources; physical placement and
+transport; instantiated backend API/class/build; loaded libraries; image; runtime; and source SHA.
+
+A comparison declares exactly one contrast axis:
+- `library`: backend implementation may differ; workload, chip, topology, resource policy, and
+  measurement remain matched;
+- `chip`: hardware and realized topology may differ; workload, EP size, placement class, resource
+  policy, implementation contract, and measurement remain matched;
+- `system`: chip, topology, and backend may differ; workload, EP size, measurement, and declared
+  resource policy remain matched, and every varied field remains visible; or
+- `resource`: requested resource profile may differ; all other locked factors remain matched.
+
+The validator excludes only the declared axis; any additional difference rejects the overlay. Chip
+and system contrasts are measured systems, not silicon-only claims. `standardized`, `normalized`,
+and backend-tuned resource policies are distinct classes and are never silently mixed.
+
+## Topology and provenance
+
+Requested and realized topology are both mandatory: chip SKU and architecture, nodes, GPUs per
+node, world size, rank-to-node/device/tray map, scale-up domain, locality, transport, fabric, and a
+topology fingerprint. Validate `world_size == placement ranks`, allocation capacity, packed-case
+occupancy, and platform-registry compatibility before timing.
+
+Placement labels are valid only if execution applies and records that placement. Contradictory SKU,
+node, tray, or transport metadata makes the case invalid.
+
+Implementation identity names the instantiated class and probed API, not an inferred package major
+version. Legacy DeepEP `Buffer`, PR #605 `ElasticBuffer`, native NVIDIA `contrib/nccl_ep`, and a
+PyTorch `all_to_all_single` reference are separate implementations. Record source commit, patches,
+native GPU targets, build inputs, image digest, and actually loaded libraries after dynamic builds.
+
+Private hostnames, addresses, device IDs, NIC IDs, and paths are retained only in the private bundle
+and removed from the public projection.
+
+## Capability and evidence policy
+
+Capability declarations describe combinations the resolver may attempt; they do not prove that a
+cell works or that its measurements are comparable. Evidence status is derived from artifacts:
+
+- `unsupported`: the library or platform cannot represent the requested contract;
+- `failed`: setup or execution did not produce a complete result;
+- `invalid`: correctness, timing, identity, topology, or schema failed;
+- `diagnostic`: valid evidence that does not satisfy comparison or repeat requirements; and
+- `eligible`: complete, conforming evidence that may enter a controlled contrast.
+
+Every requested matrix case has one terminal outcome. Missing, extra, duplicate, malformed,
+heterogeneous, or wrong-status cases block channel promotion but remain visible as evidence.
+Machine-readable quarantine is applied before plotting or decision generation.
+
+A p99 point becomes decision-grade only after three complete independent allocation IDs agree under
+the same point identity and pass correctness, coverage, provenance, and tail-stability gates. The
+public UI may show diagnostic evidence, but only decision-grade measured roundtrip p99 can drive a
+ranking or recommendation.
+
+## Isolated artifact store
+
+Development storage uses one self-hosted machine and one persistent filesystem. It must not depend
+on Vercel storage, GCP, Neon, another managed database, or a third-party object store.
+
+`$COLLECTIVEX_STORE_ROOT/private` contains incoming attempts, content-addressed immutable run
+bundles, quarantined attempts, raw samples, environments, matrix definitions, outcomes, schemas,
+and checksums. `$COLLECTIVEX_STORE_ROOT/public` contains only sanitized content-addressed datasets
+and mutable channel pointers such as `dev-latest.json`. The two trees have separate permissions.
+
+`bundle_id` hashes the canonical manifest and file checksums. `dataset_id` hashes projection format,
+selection policy, source bundle IDs, and projected checksums; publication time is excluded. JSON
+manifests are authoritative. A rebuildable catalog is an index, not a database.
+
+Publication is fail-closed and atomic:
+
+1. take an exclusive filesystem lock;
+2. stage on the same filesystem as the destination;
+3. verify checksums and strict schemas;
+4. compare the full expected matrix with terminal outcomes;
+5. verify homogeneous identities and realized timing schedules;
+6. write checksums and `COMPLETE`, then fsync files and directories;
+7. atomically rename the private run bundle;
+8. build, sanitize, validate, fsync, and atomically rename the public dataset; and
+9. atomically replace the channel pointer only after all prior steps succeed.
+
+Invalid or incomplete attempts may update a sanitized `latest-attempt` diagnostic pointer but never
+`dev-latest`. Channel responses use `no-cache`; immutable dataset responses may use long-lived
+caching. GitHub Actions artifacts are transient delivery inputs, not durable authority.
+
+## Legacy imports
+
+Numeric schema versions 3, 4, and 5 are immutable historical inputs. Importers preserve original
+bytes, source availability, schema, sampling, timing, and quarantine reasons. They must not rewrite
+legacy records as `collectivex.ep.v1`, synthesize missing components, seed `dev-latest`, or drive
+rankings, budgets, crossovers, or recommendations.
+
+Legacy data may appear in an explicitly historical evidence view. New comparable results begin only
+with native `collectivex.ep.v1` producers and a publisher-created dataset.
diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py
new file mode 100644
index 0000000000..2a143ca18e
--- /dev/null
+++ b/experimental/CollectiveX/env_capture.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — Layer-0 environment + topology capture.
+
+Emits a JSON document describing the node a collective benchmark ran on, so
+every result is provenance-tagged and a B200-vs-GB200 comparison is defensible.
+Standard library only (so it runs in any minimal container, and off-GPU it
+degrades gracefully instead of crashing). torch is used only if importable.
+
+Usage:
+    python env_capture.py --redact --out results/env_b200-dgxc.json
+    python env_capture.py --redact --out env.json   # private local provenance
+
+Importable:
+    from env_capture import capture_environment
+    env = capture_environment(redact=True)
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import platform
+import re
+import shutil
+import socket
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1  # emitted as "schema_version" in every capture record
+
+# Env vars worth recording — transport/tuning knobs that change what a
+# collective actually does (esp. the GB200 MNNVL flags vs B200).
+ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_")  # name prefixes
+ENV_EXACT = (  # exact names; NOTE: matching values are stored verbatim, even under --redact
+    "CUDA_VISIBLE_DEVICES",
+    "CUDA_DEVICE_ORDER",
+    "SLURM_JOB_ID",
+    "SLURM_NNODES",
+    "SLURM_NTASKS",
+    "SLURM_JOB_PARTITION",
+    # Image identity — set by the launcher so the bundle records what ran.
+    "COLLECTIVEX_IMAGE",
+    "COLLECTIVEX_IMAGE_DIGEST",
+)
+
+
+def _run(cmd: list[str], timeout: int = 20) -> str | None:
+    """Run a command, return stdout (stripped) or None if unavailable."""
+    if shutil.which(cmd[0]) is None:
+        return None
+    try:
+        out = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=timeout, check=False
+        )
+    except (subprocess.TimeoutExpired, OSError):
+        return None
+    if out.returncode != 0:
+        return None
+    return out.stdout.strip()
+
+
+def _redact(value: str | None) -> str | None:
+    """Stable short hash for private provenance identifiers that must remain joinable."""
+    if not value:
+        return value
+    return "redacted-" + hashlib.sha256(value.encode()).hexdigest()[:12]
+
+
+def _gpus(redact: bool) -> dict:
+    """GPU inventory via nvidia-smi (None fields off-GPU)."""
+    info: dict = {"source": None, "count": None, "devices": []}
+    q = _run(
+        [
+            "nvidia-smi",
+            "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id",
+            "--format=csv,noheader,nounits",
+        ]
+    )
+    if q is None:
+        return info
+    info["source"] = "nvidia-smi"
+    devices = []
+    for line in q.splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) < 5:
+            continue
+        name, uuid, mem_mib, cc, bus = parts[:5]
+        devices.append(
+            {
+                "name": name,
+                "uuid": _redact(uuid) if redact else uuid,
+                "memory_total_mib": int(mem_mib) if mem_mib.isdigit() else mem_mib,
+                "compute_capability": cc,
+                "pci_bus_id": _redact(bus) if redact else bus,
+            }
+        )
+    info["count"] = len(devices)
+    info["devices"] = devices
+    return info
+
+
+def _driver_cuda() -> dict:
+    out = _run(
+        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
+    )
+    driver = out.splitlines()[0].strip() if out else None
+    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
+    cuda = None
+    header = _run(["nvidia-smi"])
+    if header:
+        m = re.search(r"CUDA Version:\s*([0-9.]+)", header)
+        if m:
+            cuda = m.group(1)
+    return {"driver_version": driver, "cuda_version": cuda}
+
+
+def _torch_info() -> dict:
+    """Torch/CUDA/NCCL build info; only {"available": False} is returned if torch cannot be imported."""
+    info: dict = {"available": False}
+    try:
+        import torch  # type: ignore
+    except Exception:
+        return info
+    info["available"] = True
+    info["torch_version"] = torch.__version__
+    try:
+        info["cuda_runtime"] = torch.version.cuda
+    except Exception:
+        info["cuda_runtime"] = None
+    try:
+        if torch.cuda.is_available():
+            nccl = torch.cuda.nccl.version()
+            # version() returns an int (e.g. 22304) or a tuple; only a tuple is joined into "a.b.c".
+            info["nccl_version"] = (
+                ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl
+            )
+            info["device_count"] = torch.cuda.device_count()
+            info["device_name"] = torch.cuda.get_device_name(0)
+            cc = torch.cuda.get_device_capability(0)
+            info["compute_capability"] = f"{cc[0]}.{cc[1]}"
+    except Exception as exc:  # pragma: no cover - hardware dependent
+        info["error"] = repr(exc)
+    return info
+
+
+def _topology(redact: bool) -> dict:
+    """GPU/NIC topology matrix + a fingerprint to gate comparability.
+
+    The fingerprint is a truncated SHA-256 over the link-type tokens found
+    anywhere in the `nvidia-smi topo -m` output, in order of appearance, so
+    device labels and column spacing do not affect it."""
+    topo = _run(["nvidia-smi", "topo", "-m"])
+    if topo is None:
+        return {"source": None, "matrix": None, "fingerprint": None}
+    # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only —
+    # ignore GPU/NIC labels and whitespace so it's placement-stable.
+    tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", topo)
+    fingerprint = hashlib.sha256(" ".join(tokens).encode()).hexdigest()[:16]
+    return {
+        "source": "nvidia-smi topo -m",
+        # The matrix can contain hostnames in some setups; redact wholesale.
+        "matrix": ("<redacted>" if redact else topo),
+        "fingerprint": fingerprint,
+    }
+
+
+def _rdma(redact: bool) -> dict:
+    """RDMA/IB device presence — names only, GUIDs redactable."""
+    devices: list[str] = []
+    listing = _run(["ibv_devinfo", "-l"])
+    if listing:
+        for line in listing.splitlines()[1:]:  # first line is a count
+            name = line.strip()
+            if name:
+                devices.append(name)
+    elif _run(["ibstat", "-l"]):
+        devices = [d.strip() for d in _run(["ibstat", "-l"]).splitlines() if d.strip()]
+    return {
+        "available": bool(devices),
+        "devices": [_redact(d) if redact else d for d in devices],
+    }
+
+
+def _env_vars() -> dict:
+    out = {}
+    for k, v in os.environ.items():
+        if k in ENV_EXACT or any(k.startswith(p) for p in ENV_PREFIXES):
+            out[k] = v
+    return dict(sorted(out.items()))
+
+
+def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict:
+    """Return a JSON-serializable provenance record; `redact` masks host, GPU, topology and RDMA ids."""
+    host = socket.gethostname()
+    return {
+        "schema_version": SCHEMA_VERSION,
+        # A caller-supplied timestamp is stored verbatim; the default is local time with its UTC offset.
+        "captured_at": timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "redacted": redact,
+        "host": _redact(host) if redact else host,
+        "platform": {
+            "system": platform.system(),
+            "release": platform.release(),
+            "machine": platform.machine(),  # x86_64 vs aarch64 (B200 vs GB200)
+            "python": sys.version.split()[0],
+        },
+        "gpus": _gpus(redact),
+        "driver": _driver_cuda(),
+        "torch": _torch_info(),
+        "topology": _topology(redact),
+        "rdma": _rdma(redact),
+        "env": _env_vars(),  # NOTE: not passed `redact`; values are recorded as-is
+    }
+
+
+def main() -> int:  # CLI entry point; returns 0, which the __main__ guard uses as the exit status
+    ap = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture")
+    ap.add_argument("--out", help="write JSON here (default: stdout)")
+    ap.add_argument(
+        "--redact",
+        action="store_true",
+        help="hash hostnames / IPs / GPU UUIDs / IB GUIDs in the private capture",
+    )
+    ap.add_argument(
+        "--timestamp",  # not validated here: the string is stamped into the record verbatim
+        help="ISO timestamp to stamp (default: now); pass one for reproducible bundles",
+    )
+    args = ap.parse_args()
+
+    env = capture_environment(redact=args.redact, timestamp=args.timestamp)
+    blob = json.dumps(env, indent=2)
+    if args.out:
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)  # create parent dirs
+        with open(args.out, "w") as fh:
+            fh.write(blob + "\n")
+        # A one-line human summary to stdout (the JSON is the artifact).
+        g = env["gpus"]
+        print(
+            f"env -> {args.out} | machine={env['platform']['machine']} "
+            f"gpus={g['count']} topo_fp={env['topology']['fingerprint']}"
+        )
+    else:
+        print(blob)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/generate_matrix.py b/experimental/CollectiveX/generate_matrix.py
new file mode 100644
index 0000000000..6ab66d2ff3
--- /dev/null
+++ b/experimental/CollectiveX/generate_matrix.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""CollectiveX matrix generator.
+
+Reads the public suite/workload registries and capability table, then resolves a named suite into
+the validated cases before any GPU is allocated. ``platform`` is always an exact GHA runner label.
+
+  python3 generate_matrix.py --suite ep-core-v1 --out matrix.json
+
+Pure stdlib + PyYAML.
+"""
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import os
+import sys
+
+import yaml
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+import capability as cap  # noqa: E402
+
+EXPECTED_TIMING_PROFILE = {  # generate() rejects any suites.yaml timing_profile that differs
+    "iters": 8,  # timed iterations per trial (8 x 64 = the 512 samples_per_point generate() requires)
+    "trials": 64,
+    "warmup": 32,  # untimed warmups, per the methodology's fixed-512-v1 contract
+    "warmup_semantics": "full-roundtrip-per-trial-point-v1",
+}
+
+
+def _load(name):  # parse configs/<name>, located next to this file, and return the YAML document
+    with open(os.path.join(HERE, "configs", name)) as fh:
+        return yaml.safe_load(fh)  # safe_load: plain data only, no arbitrary object construction
+
+
+def resolve_case(plat, beng, mode, dtype, contract, routing, ep, phase,
+                 combine_quant_mode="none", activation_profile="normal", eplb=False):
+    """Return whether the case is supported by the public runner/backend registry."""
+    platform = cap.PLATFORMS.get(plat)
+    if platform is None:
+        return False, f"unknown platform {plat}"
+    if ep not in platform["ep_degrees"]:
+        return False, f"{plat} EP{ep} not validated (have {platform['ep_degrees']})"
+    if mode == "ll" and phase != "decode":
+        return False, f"{beng} mode=ll is decode-only (got {phase})"
+    return cap.resolve(
+        plat, beng, mode=mode, dtype=dtype, contract=contract,
+        combine_quant_mode=combine_quant_mode, routing=routing, eplb=eplb,
+        activation_profile=activation_profile,
+    )
+
+
+def validate_workloads(suite_name, suite, workloads):
+    """Validate workload names and pin official shapes to a reviewed source config."""
+    registry = {
+        name: cfg
+        for section in ("synthetic", "model_derived")
+        for name, cfg in (workloads.get(section) or {}).items()
+    }
+    unknown = sorted(set(suite["workloads"]) - set(registry))
+    if unknown:
+        raise SystemExit(f"suite {suite_name}: unknown workloads {unknown}")
+    if suite.get("required_publication") == "official":
+        unverified = sorted(
+            name for name in suite["workloads"] if not registry[name].get("verified_against")
+        )
+        if unverified:
+            raise SystemExit(
+                f"suite {suite_name}: official workloads need verified_against: {unverified}"
+            )
+
+
+def generate(suite_name):  # expand a suite into resolved "cases" and rejected "omitted" records
+    suites_doc = _load("suites.yaml")
+    suites = suites_doc["suites"]
+    workloads = _load("workloads.yaml")
+    if suite_name not in suites:
+        raise SystemExit(f"unknown suite {suite_name}; have {sorted(suites)}")
+    timing_profile = suites_doc.get("timing_profile")  # registry-wide, not per suite
+    if timing_profile != EXPECTED_TIMING_PROFILE:
+        raise SystemExit(f"suite registry timing_profile must be {EXPECTED_TIMING_PROFILE}, "
+                         f"got {timing_profile}")
+    timing = f"{timing_profile['iters']}:{timing_profile['trials']}:{timing_profile['warmup']}"
+    s = suites[suite_name]
+    validate_workloads(suite_name, s, workloads)
+    if "samples_per_point" not in s:
+        raise SystemExit(f"suite {suite_name}: missing required samples_per_point: 512")
+    samples_per_point = int(s["samples_per_point"])
+    if samples_per_point != 512:
+        raise SystemExit(f"suite {suite_name}: samples_per_point must be 512, got {samples_per_point}")
+    phases = s.get("phases", ["decode"])
+    routings = s.get("routings", ["uniform"])
+    resource_modes = s.get("resource_modes", ["tuned"])
+    # Optional diagnostic axes default to the promoted path when omitted.
+    cqms = s.get("combine_quant_modes", ["none"])
+    placements = s.get("placements", ["packed"])
+    activations = s.get("activation_profiles", ["normal"])
+    eplbs = s.get("eplb", [False])
+    unevens = s.get("uneven_tokens", ["none"])
+    cases, omitted = [], []
+    for plat in s["platforms"]:
+        platform = cap.PLATFORMS.get(plat)
+        if platform is None:
+            raise SystemExit(f"suite {suite_name}: unknown GHA platform {plat}")
+        for beng in sorted(set(s["backends"])):  # de-duplicated, in sorted order
+            eps = s.get("ep_degrees") or platform["ep_degrees"]  # suite override, else platform list
+            for (wl, mode, dtype, contract, routing, ep, phase, rmode, cqm, placement, act,
+                 eplb, uneven) in itertools.product(
+                    s["workloads"], s["modes"], s.get("dtypes", ["bf16"]), s["contracts"],
+                    routings, eps, phases, resource_modes, cqms, placements, activations,
+                    eplbs, unevens):
+                # Only the arguments below reach the resolver; the other axes are recorded as-is.
+                ok, reason = resolve_case(
+                    plat, beng, mode, dtype, contract, routing, ep, phase,
+                    combine_quant_mode=cqm, activation_profile=act, eplb=eplb,
+                )
+                rec = {"suite": suite_name, "workload": wl, "platform": plat,
+                       "backend": beng, "mode": mode,
+                       "dtype": dtype, "contract": contract, "routing": routing, "ep": ep,
+                       "phase": phase, "resource_mode": rmode, "combine_quant_mode": cqm,
+                       "placement": placement, "activation_profile": act,
+                       "eplb": eplb, "routing_step": 0, "uneven_tokens": uneven,
+                       "canonical": bool(s.get("canonical", False)),
+                       "required_publication": s.get("required_publication"),
+                       "samples_per_point": samples_per_point, "timing": timing,
+                       "warmup_semantics": timing_profile["warmup_semantics"]}
+                (cases if ok else omitted).append({**rec, **({} if ok else {"reason": reason})})
+    return {"suite": suite_name, "required_publication": s.get("required_publication"),
+            "samples_per_point": samples_per_point,
+            "timing_profile": timing_profile,
+            "n_cases": len(cases), "n_omitted": len(omitted),
+            "cases": cases, "omitted": omitted}
+
+
+def main() -> int:  # CLI entry point: print a summary, optionally write the matrix JSON; returns 0
+    ap = argparse.ArgumentParser(description="CollectiveX matrix generator")
+    ap.add_argument("--suite", required=True)
+    ap.add_argument("--out")  # optional path for the full matrix as JSON
+    a = ap.parse_args()
+    m = generate(a.suite)
+    print(f"suite={m['suite']} required={m['required_publication']} "
+          f"timing={m['timing_profile']['iters']}:{m['timing_profile']['trials']}:"
+          f"{m['timing_profile']['warmup']} samples/point={m['samples_per_point']}: "
+          f"{m['n_cases']} valid cases, {m['n_omitted']} omitted")
+    seen = set()  # print each distinct (platform, backend, mode, dtype, contract, reason) only once
+    for o in m["omitted"]:
+        k = (o["platform"], o["backend"], o["mode"], o["dtype"], o["contract"], o["reason"])
+        if k not in seen:
+            seen.add(k)
+            print(f"  OMIT {o['platform']}/{o['backend']}/{o['mode']}/{o['dtype']}/{o['contract']}: {o['reason']}")
+    if a.out:
+        with open(a.out, "w") as fh:
+            json.dump(m, fh, indent=2)
+        print(f"wrote {a.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
new file mode 100644
index 0000000000..1a4ea7a800
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
+#
+# Thin adapter: handles B200-specific allocation/container, then hands off to
+# runtime/run_in_container.sh which runs the selected EP backend. Mirrors the Slurm/enroot
+# flow (squash + srun --container) with all model-serving stripped.
+#
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR
+PARTITION="$CX_PARTITION"
+ACCOUNT="$CX_ACCOUNT"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"   # handed to salloc --time below
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-deepep}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME ngpus=$NGPUS bench=$CX_BENCH"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"   # CX_STAGE_DIR is optional here
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER_NAME"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+          --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation on every exit path
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — result artifacts collected"
diff --git a/experimental/CollectiveX/launchers/launch_b300.sh b/experimental/CollectiveX/launchers/launch_b300.sh
new file mode 100644
index 0000000000..791bb59ca1
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b300.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# CollectiveX — B300 single-node SKU adapter (8x B300 SXM6, NVLink island, x86_64, SM100).
+#
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b300}"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR
+PARTITION="$CX_PARTITION"
+ACCOUNT="$CX_ACCOUNT"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}"   # optional; when empty no --exclude flag is passed to salloc
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"   # handed to salloc --time below
+IMAGE="${CX_IMAGE:-$(cx_default_image b300)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b300-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-deepep}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME ngpus=$NGPUS bench=$CX_BENCH"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER_NAME"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" ${EXCLUDE_NODES:+--exclude="$EXCLUDE_NODES"} \
+          --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation on every exit path
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — result artifacts collected"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
new file mode 100644
index 0000000000..3f94c004de
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
+#
+# Two paths, selected by CX_NODES:
+#   * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to
+#     run_in_container.sh, -g 4.
+#   * CX_NODES=2: two trays (8 GPU). Runs tests/run_ep.py as one srun task per GPU across the MNNVL domain.
+#
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-gb200-nv}"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR
+PARTITION="$CX_PARTITION"
+ACCOUNT="$CX_ACCOUNT"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}"          # NVL72 compute tray = 4 GPU/node
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}"
+NODES="${CX_NODES:-1}"
+TIME_MIN="${CX_TIME:-30}"                       # passed to salloc --time
+IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+MOUNT_DIR=/ix                                   # where the staged repo is mounted inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"             # UTC timestamp, exported as CX_TS and used in artifact names
+WORLD=$((NODES * GPUS_PER_NODE))                # total ranks = trays x GPUs per tray
+[ "$NODES" = 1 ] || [ "$NODES" = 2 ] || cx_die "GB200 supports one or two four-GPU trays"
+[ "$GPUS_PER_NODE" = 4 ] || cx_die "GB200 requires four GPUs per tray"
+[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "GB200 requires the NVL72 scale-up domain"
+cx_apply_timing_profile
+
+export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS" CX_NGPUS="$WORLD" CX_GPUS_PER_NODE="$GPUS_PER_NODE"
+export CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN"
+export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_BENCH="${CX_BENCH:-deepep}"
+case "$CX_BENCH" in
+  deepep|deepep-hybrid|uccl|nccl-ep|flashinfer) ;;
+  *) cx_die "unsupported GB200 EP backend: $CX_BENCH" ;;
+esac
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+# Required MNNVL transport settings, also recorded in provenance.
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi   # squash + repo staging above still ran
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+
+# ----------------------------------------------------------------------------
+if [ "$NODES" -le 1 ]; then
+  # Single tray (4 GPU): generic dispatcher, -g N single process.
+  export CX_NGPUS="$GPUS_PER_NODE"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \
+            --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+  cx_log "JOB_ID=$JOB_ID"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  # Capture the run status instead of letting `set -e` abort here, so artifacts are still collected on failure.
+  srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+    --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+    --no-container-entrypoint --export=ALL \
+    bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" && run_rc=0 || run_rc=$?
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — result artifacts collected (run rc=$run_rc)"
+  exit "$run_rc"
+fi
+
+# ----------------------------------------------------------------------------
+# Multi-node MNNVL EP path: run_ep.py across WORLD srun tasks (1 GPU/rank,
+# per-rank RANK/LOCAL_RANK from SLURM_*), intranode NVLink across <=8 MNNVL ranks. One srun step per case attempt.
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+          --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+# Run run_ep.py across WORLD srun tasks over MNNVL. Rendezvous = first hostname of the allocation; fail loudly if unresolved.
+  MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)" 2>/dev/null | head -1)" || MA=""; MP=29553
+  [ -n "$MA" ] || cx_die "could not resolve MASTER_ADDR (first node of job $JOB_ID)"
+  mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+  # Restore process-local loader/import paths and exact backend build identity from build-only.
+  WRAP='[ -f /tmp/.cx_backend_env ] && . /tmp/.cx_backend_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+  # Build legacy direct-env DeepEP or FlashInfer quant diagnostics once per node into a persistent
+  # named container, then every case-srun reuses it (build visible to all WORLD ranks). Mirrors the
+  # proven launch_gb300-nv.sh EP8 path: without this, the multi-srun ran ephemeral per-rank containers
+  # that bypassed the build hooks (legacy direct-env DeepEP and quant-combine diagnostics).
+  CNAME="cxep_${JOB_ID}"
+  CMOUNT=(--container-mounts="$MOUNT_SRC:$MOUNT_DIR" --no-container-mount-home
+          --container-workdir="$MOUNT_DIR/experimental/CollectiveX" --no-container-entrypoint)
+  cx_log "EP backend preparation: bench=$CX_BENCH"
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --container-name="$CNAME" \
+    --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+    bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" </dev/null 2>&1 | tail -15 \
+    || cx_die "EP backend preparation failed"
+
+  # Per-rank env. DeepEP main spans NVL72 trays only with allow_mnnvl=True (else DeepEP sets
+  # NVSHMEM_DISABLE_MNNVL=1 -> intranode-IPC path -> illegal address cross-tray); CX_ALLOW_MNNVL=1 makes
+  # tests/ep_deepep.py pass it (gated on the param existing, so bundled V1 is unchanged). flashinfer rides
+  # NCCL's MNNVL transport.
+  EP_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1"
+  [ "$CX_BENCH" = "deepep" ] && EP_EXPORTS="$EP_EXPORTS,CX_ALLOW_MNNVL=1"
+
+  # Emit the case list for the loop below, one '|'-delimited record per line. SWEEP (CX_SHARD_FILE set): one
+  # record per shard case (parity with single-node). MANUAL: one record per phase from the :-defaulted CX_* env.
+  cx_ep_cases() {
+    # Fields, in the order the consuming `read` names them: phase dtype mode contract routing eplb rmode act
+    # placement rstep uneven hidden topk experts ladder suite workload required_pub canonical case_id ep timing combine_q.
+    # CX_SHARD_FILE is workflow-relative (results/.shard_<id>.json under experimental/CollectiveX) but this runs on
+    # the SUBMIT HOST (cwd=repo root): retry under $CX_DIR, else the SHARD branch is skipped and only ONE default case runs.
+    local sf="${CX_SHARD_FILE:-}"
+    [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf"
+    if [ -n "$sf" ] && [ -f "$sf" ]; then
+      # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` collapses consecutive tabs and
+      # swallows empty fields like a false eplb, shifting columns. No case field contains '|'.)
+      python3 - "$sf" <<'PY'
+import json, sys
+d = json.load(open(sys.argv[1]))
+for c in d.get("cases", []):
+    g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv)
+    print("|".join([g("phase","decode"), g("dtype","bf16"), g("mode","normal"),
+        g("contract","layout-and-dispatch-v1"), g("routing","uniform"),
+        ("1" if c.get("eplb") else ""), g("resource_mode","tuned"),
+        g("activation_profile","normal"), g("placement","packed"), g("routing_step","0"),
+        g("uneven_tokens","none"), g("hidden","7168"), g("topk","8"), g("experts","256"),
+        g("ladder",""), g("suite",""), g("workload",""), g("required_publication",""),
+        ("1" if c.get("canonical") else ""), g("case_id",""), g("ep",""),
+        g("timing","8:64:32"), g("combine_quant_mode","none")]))
+PY
+    else
+      local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"   # "both" = decode, then prefill
+      local ph; local -a fields   # NOTE(review): ${CX_EPLB:+1}/${CX_CANONICAL:+1} below enable on ANY non-empty value (even 0/false) — confirm
+      for ph in $phases; do   # unquoted on purpose: split the phase list into words
+        fields=("$ph" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}"
+          "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}"
+          "${CX_EPLB:+1}" "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}"
+          "${CX_PLACEMENT:-packed}" "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}"
+          "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+          "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" "${CX_REQUIRED_PUBLICATION:-}"
+          "${CX_CANONICAL:+1}" "${CX_CASE_ID:-}" "$WORLD"
+          "${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}" "${CX_COMBINE_QUANT_MODE:-none}")
+        (IFS='|'; printf '%s\n' "${fields[*]}")   # join with '|' inside a subshell so the caller's IFS is untouched
+      done
+    fi
+  }
+
+  ci=0              # 1-based case counter, used in log lines and artifact names
+  failed_cases=0    # cases that never produced a passing result document
+  while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven \
+      hidden topk experts lad suite workload required_pub canonical case_id ep timing combine_q; do
+    [ -n "$ph" ] || continue   # skip records without a phase (e.g. blank lines)
+    ci=$((ci+1))
+    case_stem="${RUNNER_NAME}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")"
+    IFS=':' read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}"   # timing = iters:trials:warmup
+    case_iters="${case_iters:-8}"; case_trials="${case_trials:-64}"; case_warmup="${case_warmup:-32}"
+    ep="${ep:-$WORLD}"   # records without an explicit ep use the allocated world size
+    export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+    export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep"
+    export CX_DISPATCH_DTYPE="$dtype" CX_MODE="$mode" CX_MEASUREMENT_CONTRACT="$contract"
+    export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_RESOURCE_MODE="$rmode"
+    export CX_ACTIVATION_PROFILE="$act" CX_PLACEMENT="$placement" CX_ROUTING_STEP="$rstep"
+    export CX_UNEVEN_TOKENS="$uneven" CX_TOKENS_LADDER="$lad" CX_COMBINE_QUANT_MODE="$combine_q"
+    export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup"
+    export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))"
+    export CX_WARMUP_SEMANTICS="full-roundtrip-per-trial-point-v1"
+    cx_log "EP${WORLD}[$ci] id=${case_id:-manual} $ph $CX_BENCH $dtype/$mode/$contract routing=$routing eplb=${eplb:-} rmode=$rmode act=$act plc=$placement"
+    if [ "$ep" != "$WORLD" ]; then
+      cx_log "ERROR: case EP$ep does not match allocated world size $WORLD"
+      export CX_ATTEMPT_ID=1
+      failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+      cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5   # fixed code 5 in the slot where other call sites pass the run rc
+      failed_cases=$((failed_cases + 1))
+      continue
+    fi
+
+    workload_dir=""   # stays empty for non-canonical cases, so --workload-dir is not passed
+    if [ -n "$canonical" ]; then
+      workload_dir=".cx_workloads/ep${ep}_${routing}"
+      workload_ladder="$lad"
+      [ -n "$workload_ladder" ] || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+      workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir" --routing "$routing"
+        --ep "$ep" --hidden "$hidden" --topk "$topk" --experts "$experts"
+        --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder")
+      [ -n "$workload" ] && workload_args+=(--workload "$workload")
+      stage_rc=0
+      set +e
+      srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --container-name="$CNAME" "${CMOUNT[@]}" \
+        --export="$EP_EXPORTS" "${workload_args[@]}" </dev/null 2>&1 | tail -8
+      stage_status=("${PIPESTATUS[@]}")   # must be read immediately after the pipeline
+      set -e
+      stage_rc="${stage_status[0]}"   # status of srun, not of tail
+      if [ "$stage_rc" != 0 ]; then
+        cx_log "ERROR: canonical workload staging failed rc=$stage_rc"
+        export CX_ATTEMPT_ID=1
+        failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+        cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc"
+        failed_cases=$((failed_cases + 1))
+        continue
+      fi
+    fi
+
+    ep_args=(--backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype"
+      --mode "$mode" --measurement-contract "$contract" --routing "$routing"
+      --resource-mode "$rmode" --sm-fraction "${CX_SM_FRACTION:-0.18}"
+      --num-sms "${CX_NUM_SMS:-24}" --activation-profile "$act" --placement "$placement"
+      --gpus-per-node "$GPUS_PER_NODE" --scale-up-domain "$SCALE_UP_DOMAIN"
+      --routing-step "$rstep" --uneven-tokens "$uneven" --tokens-ladder "$lad"
+      --hidden "$hidden" --topk "$topk" --experts "$experts"
+      --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials"
+      --seed "${CX_SEED:-67}" --runner "$RUNNER_NAME" --topology-class "$CX_TOPO"
+      --transport "$CX_TRANSPORT" --case-id "$case_id" --suite "$suite"
+      --workload-name "$workload" --required-publication "$required_pub"
+      --combine-quant-mode "$combine_q")
+    [ -n "$eplb" ] && ep_args+=(--eplb)
+    [ -n "$workload_dir" ] && ep_args+=(--workload-dir "$workload_dir")
+    [ -n "${CX_COMBINE_DTYPE:-}" ] && ep_args+=(--combine-dtype "$CX_COMBINE_DTYPE")
+    attempts=1
+    [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))   # only flashinfer is retried
+    attempt=1
+    case_ok=0
+    while [ "$attempt" -le "$attempts" ]; do
+      export CX_ATTEMPT_ID="$attempt"
+      attempt_tag="a$(printf '%02d' "$attempt")"
+      out="results/${case_stem}_${attempt_tag}_${dtype}_${mode}.json"   # relative to the container workdir
+      failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json"
+      set +e
+      timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$WORLD" \
+        --ntasks-per-node="$GPUS_PER_NODE" --container-name="$CNAME" "${CMOUNT[@]}" \
+        --export="$EP_EXPORTS" \
+        bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" </dev/null 2>&1 | tail -8
+      run_status=("${PIPESTATUS[@]}")   # must be read immediately after the pipeline
+      set -e
+      run_rc="${run_status[0]}"   # status of timeout/srun, not of tail
+      expected_out="$MOUNT_SRC/experimental/CollectiveX/$out"   # same file as seen from the submit host
+      if [ "$run_rc" = 0 ] && cx_has_result_doc "$expected_out"; then
+        case_ok=1
+        break
+      fi
+      [ "$run_rc" = 0 ] && run_rc=1   # rc 0 without a result document still counts as a failure
+      if cx_has_result_doc "$expected_out"; then
+        cx_demote_result_doc "$expected_out" "$run_rc" \
+          || { rm -f "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; }
+      else
+        cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"
+      fi
+      [ "$attempt" -lt "$attempts" ] && cx_log "EP${WORLD}[$ci] attempt $attempt/$attempts failed; retrying"
+      attempt=$((attempt + 1))
+    done
+    if [ "$case_ok" = 0 ]; then
+      failed_cases=$((failed_cases + 1))
+      cx_log "ERROR: EP${WORLD}[$ci] failed after $attempts attempt(s)"
+    fi
+  done < <(cx_ep_cases)   # process substitution keeps the loop, and its counters, in this shell
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — EP result artifacts collected (cases=$ci failed=$failed_cases)"
+  [ "$ci" -gt 0 ] || cx_die "no EP cases were read from cx_ep_cases (empty or unreadable shard file?)"   # zero cases must not pass
+  [ "$failed_cases" -eq 0 ] || exit 1   # otherwise the script ends here with status 0
diff --git a/experimental/CollectiveX/launchers/launch_gb300-nv.sh b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
new file mode 100644
index 0000000000..a5cbd6381f
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb300-nv.sh
@@ -0,0 +1,245 @@
+#!/usr/bin/env bash
+# CollectiveX — GB300 (NVL72 Grace-Blackwell, aarch64) GHA launcher.
+#
+# Two paths by CX_NODES:
+#   CX_NODES<=1 (EP4): single NVL72 tray, 4 GPU. Hands off to run_in_container.sh (torchrun -g 4).
+#   CX_NODES==2 (EP8): 2 trays, 8 GPU over the MNNVL NVLink domain. run_in_container's single-node
+#     torchrun can't span nodes, so this path runs run_ep.py DIRECTLY across 8 srun tasks (1 rank
+#     each), per-rank RANK/LOCAL_RANK from SLURM_*, MASTER_ADDR=first node — the intranode NVLink
+#     path works across <=8 ranks on MNNVL (no internode/NVSHMEM). Runs every shard case, or one case per phase.
+#
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR CX_ENROOT_CACHE_PATH
+PARTITION="$CX_PARTITION"; ACCOUNT="$CX_ACCOUNT"
+NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-4}"   # GPN = GPUs per node (tray)
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}"
+EXPECTED_WORLD=$((NODES * GPN))                        # total ranks = trays x GPUs per tray
+NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}"; TIME_MIN="${CX_TIME:-90}"
+[ "$NODES" = 1 ] || [ "$NODES" = 2 ] || cx_die "GB300 supports one or two four-GPU trays"
+[ "$GPN" = 4 ] || cx_die "GB300 requires four GPUs per tray"
+[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "GB300 requires the NVL72 scale-up domain"
+[ "$NGPUS" = "$EXPECTED_WORLD" ] || cx_die "GB300 world size must equal nodes x GPUs per tray"
+cx_apply_timing_profile
+# CX_IMAGE is a Docker tag; cx_ensure_squash derives the local squash filename.
+IMAGE="${CX_IMAGE:-$(cx_default_image gb300)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+export ENROOT_CACHE_PATH="$CX_ENROOT_CACHE_PATH"
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"   # UTC timestamp, exported as CX_TS and used in artifact names
+RUNNER="gb300-${NGPUS}x"              # runner label encodes the world size
+export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="gb300-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN"
+export CX_BENCH="${CX_BENCH:-deepep}" CX_NGPUS="$NGPUS"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "GB300 runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH phase=${CX_PHASE:-decode}"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "$CX_STAGE_DIR")"
+[ "${CX_DRYRUN:-0}" = "1" ] && { cx_log "DRYRUN"; exit 0; }   # squash + repo staging above still ran
+command -v salloc >/dev/null || cx_die "salloc not found"
+
+if [ "$NODES" -le 1 ]; then   # ---- EP4: single tray, run_in_container (torchrun -g 4) ----
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPN" --exclusive \
+            --time="$TIME_MIN" --job-name="$RUNNER")"
+  [ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:/ix" \
+    --no-container-mount-home --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \
+    --export=ALL bash /ix/experimental/CollectiveX/runtime/run_in_container.sh && run_rc=0 || run_rc=$?
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"; exit "$run_rc"   # run status is captured (no `set -e` abort) so collection also runs after a failure
+fi
+
+# ---- EP8: 2 trays, run_ep.py directly across 8 ranks (no torchrun; MNNVL intranode path) ----
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" --gres=gpu:"$GPN" \
+          --ntasks-per-node="$GPN" --exclusive --time="$TIME_MIN" --job-name="$RUNNER")"
+[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+MA="$(scontrol show hostnames "$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)" 2>/dev/null | head -1)" || MA=""; MP=29551
+[ -n "$MA" ] || cx_die "could not resolve MASTER_ADDR (first node of job $JOB_ID)"   # MA is exported to every rank below
+mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+# Restore process-local loader/import paths and exact backend build identity from build-only.
+WRAP='[ -f /tmp/.cx_backend_env ] && . /tmp/.cx_backend_env; export RANK=$SLURM_PROCID WORLD_SIZE=$SLURM_NTASKS LOCAL_RANK=$SLURM_LOCALID; exec python3 tests/run_ep.py "$@"'
+
+# From-source diagnostic kernels cannot be built in the per-rank multi-srun (8 separate ephemeral containers).
+# Build them ONCE PER NODE into a PERSISTENT named container, then every case-srun REUSES it (--container-name,
+# no re-import) so the build is visible to all 8 ranks; parity with EP4 (run_in_container builds once + reuses).
+# Mounts re-apply per srun-step (not persisted in the container fs), so each srun still passes "${CMOUNT[@]}".
+CNAME="cxep8_${JOB_ID}"
+CMOUNT=(--container-mounts="$MOUNT_SRC:/ix" --no-container-mount-home
+        --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint)
+cx_log "EP backend preparation: bench=$CX_BENCH"
+srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 \
+  --container-name="$CNAME" --container-image="$SQUASH_FILE" "${CMOUNT[@]}" --export=ALL,CX_BUILD_ONLY=1 \
+  bash /ix/experimental/CollectiveX/runtime/run_in_container.sh </dev/null 2>&1 | tail -15 \
+  || cx_die "EP backend preparation failed"
+
+# The EP8 case list as pipe-delimited records, one per line, consumed by the case loop below. SWEEP
+# (CX_SHARD_FILE set): one line per shard case, so the rack-scale EP8 path sweeps EVERY case of its shard
+# (parity with run_in_container's single-node SHARD loop). MANUAL (no shard file): one line per phase
+# from the CX_* env — every field is :-defaulted so set -u never trips on an unset knob (the old bug:
+# bare $CX_DISPATCH_DTYPE here was unbound under sweep, crashing the whole job on its first line).
+cx_ep8_cases() {
+  # Fields, in the order the consuming `read` names them: phase dtype mode contract routing eplb rmode act
+  # placement rstep uneven hidden topk experts ladder suite workload required_pub canonical case_id ep timing combine_q.
+  # CX_SHARD_FILE is workflow-relative (results/.shard_<id>.json under experimental/CollectiveX) but this runs on
+  # the SUBMIT HOST (cwd=repo root): retry under $CX_DIR, else the SHARD branch is skipped and only ONE default case runs.
+  local sf="${CX_SHARD_FILE:-}"
+  [ -n "$sf" ] && [ ! -f "$sf" ] && [ -f "$CX_DIR/$sf" ] && sf="$CX_DIR/$sf"
+  if [ -n "$sf" ] && [ -f "$sf" ]; then
+    # '|'-separated (NOT tab: tab is IFS-whitespace, so `read` would collapse consecutive tabs and
+    # swallow empty fields like a false eplb, shifting every column. No case field contains '|'.)
+    python3 - "$sf" <<'PY'
+import json, sys
+d = json.load(open(sys.argv[1]))
+for c in d.get("cases", []):
+    g = lambda k, dv: (str(c[k]) if c.get(k) not in (None, "") else dv)
+    print("|".join([g("phase","decode"), g("dtype","bf16"), g("mode","normal"),
+        g("contract","layout-and-dispatch-v1"), g("routing","uniform"),
+        ("1" if c.get("eplb") else ""), g("resource_mode","tuned"),
+        g("activation_profile","normal"), g("placement","packed"), g("routing_step","0"),
+        g("uneven_tokens","none"), g("hidden","7168"), g("topk","8"), g("experts","256"),
+        g("ladder",""), g("suite",""), g("workload",""), g("required_publication",""),
+        ("1" if c.get("canonical") else ""), g("case_id",""), g("ep",""),
+        g("timing","8:64:32"), g("combine_quant_mode","none")]))
+PY
+  else
+    local phases="${CX_PHASE:-decode}"; [ "$phases" = both ] && phases="decode prefill"   # "both" = decode, then prefill
+    local ph; local -a fields   # NOTE(review): ${CX_EPLB:+1}/${CX_CANONICAL:+1} below enable on ANY non-empty value (even 0/false) — confirm
+    for ph in $phases; do   # unquoted on purpose: split the phase list into words
+      fields=("$ph" "${CX_DISPATCH_DTYPE:-bf16}" "${CX_MODE:-normal}"
+        "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}" "${CX_ROUTING:-uniform}"
+        "${CX_EPLB:+1}" "${CX_RESOURCE_MODE:-tuned}" "${CX_ACTIVATION_PROFILE:-normal}"
+        "${CX_PLACEMENT:-packed}" "${CX_ROUTING_STEP:-0}" "${CX_UNEVEN_TOKENS:-none}"
+        "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" "${CX_TOKENS_LADDER:-}"
+        "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" "${CX_REQUIRED_PUBLICATION:-}"
+        "${CX_CANONICAL:+1}" "${CX_CASE_ID:-}" "$NGPUS"
+        "${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}" "${CX_COMBINE_QUANT_MODE:-none}")
+      (IFS='|'; printf '%s\n' "${fields[*]}")   # join with '|' inside a subshell so the caller's IFS is untouched
+    done
+  fi
+}
+
+# Per-rank env for the EP8 case sruns. flashinfer-combine rides NCCL's MNNVL transport (validated:
+# cq=fp8/nvfp4 @ ws8). DeepEP main's Buffer gates multi-tray NVLink behind allow_mnnvl, which defaults
+# False -> DeepEP then sets NVSHMEM_DISABLE_MNNVL=1 and the legacy buffer takes the intranode-only CUDA-IPC
+# peer path, faulting across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). CX_ALLOW_MNNVL=1
+# makes tests/ep_deepep.py pass allow_mnnvl=True so the NVL buffer spans both trays over the fabric API.
+# Bundled V1's Buffer predates the param (its NVL buffer already spans MNNVL) -> the harness drops the kwarg.
+EP8_EXPORTS="ALL,MASTER_ADDR=$MA,MASTER_PORT=$MP,NCCL_MNNVL_ENABLE=1,NCCL_CUMEM_ENABLE=1,MC_FORCE_MNNVL=1"
+[ "$CX_BENCH" = "deepep" ] && EP8_EXPORTS="$EP8_EXPORTS,CX_ALLOW_MNNVL=1"
+
+ci=0              # 1-based case counter, used in log lines and artifact names
+failed_cases=0    # cases that never produced a passing result document
+while IFS='|' read -r ph dtype mode contract routing eplb rmode act placement rstep uneven \
+    hidden topk experts lad suite workload required_pub canonical case_id ep timing combine_q; do
+  [ -n "$ph" ] || continue   # skip records without a phase (e.g. blank lines)
+  ci=$((ci+1))
+  case_stem="${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")"
+  IFS=':' read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}"   # timing = iters:trials:warmup
+  case_iters="${case_iters:-8}"; case_trials="${case_trials:-64}"; case_warmup="${case_warmup:-32}"
+  ep="${ep:-$NGPUS}"   # records without an explicit ep use the allocated world size
+  export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+  export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep"
+  export CX_DISPATCH_DTYPE="$dtype" CX_MODE="$mode" CX_MEASUREMENT_CONTRACT="$contract"
+  export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_RESOURCE_MODE="$rmode"
+  export CX_ACTIVATION_PROFILE="$act" CX_PLACEMENT="$placement" CX_ROUTING_STEP="$rstep"
+  export CX_UNEVEN_TOKENS="$uneven" CX_TOKENS_LADDER="$lad" CX_COMBINE_QUANT_MODE="$combine_q"
+  export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup"
+  export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))"
+  export CX_WARMUP_SEMANTICS="full-roundtrip-per-trial-point-v1"
+  cx_log "EP${NGPUS}[$ci] id=${case_id:-manual} $ph $CX_BENCH $dtype/$mode/$contract rt=$routing eplb=${eplb:-} combine=${CX_COMBINE_DTYPE:-bf16}/$combine_q"
+  if [ "$ep" != "$NGPUS" ]; then
+    cx_log "ERROR: case EP$ep does not match allocated world size $NGPUS"
+    export CX_ATTEMPT_ID=1
+    failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+    cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5   # fixed code 5 in the slot where other call sites pass the run rc
+    failed_cases=$((failed_cases + 1))
+    continue
+  fi
+
+  workload_dir=""   # stays empty for non-canonical cases, so --workload-dir is not passed
+  if [ -n "$canonical" ]; then
+    workload_dir=".cx_workloads/ep${ep}_${routing}"
+    workload_ladder="$lad"
+    [ -n "$workload_ladder" ] || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+    workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir" --routing "$routing"
+      --ep "$ep" --hidden "$hidden" --topk "$topk" --experts "$experts"
+      --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder")
+    [ -n "$workload" ] && workload_args+=(--workload "$workload")
+    stage_rc=0
+    set +e
+    srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --container-name="$CNAME" "${CMOUNT[@]}" \
+      --export="$EP8_EXPORTS" "${workload_args[@]}" </dev/null 2>&1 | tail -8
+    stage_status=("${PIPESTATUS[@]}")   # must be read immediately after the pipeline
+    set -e
+    stage_rc="${stage_status[0]}"   # status of srun, not of tail
+    if [ "$stage_rc" != 0 ]; then
+      cx_log "ERROR: canonical workload staging failed rc=$stage_rc"
+      export CX_ATTEMPT_ID=1
+      failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+      cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc"
+      failed_cases=$((failed_cases + 1))
+      continue
+    fi
+  fi
+
+  ep_args=(--backend "$CX_BENCH" --phase "$ph" --dispatch-dtype "$dtype"
+    --mode "$mode" --measurement-contract "$contract" --routing "$routing"
+    --resource-mode "$rmode" --sm-fraction "${CX_SM_FRACTION:-0.18}"
+    --num-sms "${CX_NUM_SMS:-24}" --activation-profile "$act" --placement "$placement"
+    --gpus-per-node "$GPN" --scale-up-domain "$SCALE_UP_DOMAIN"
+    --routing-step "$rstep" --uneven-tokens "$uneven" --tokens-ladder "$lad"
+    --hidden "$hidden" --topk "$topk" --experts "$experts"
+    --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials"
+    --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$CX_TOPO"
+    --transport "$CX_TRANSPORT" --case-id "$case_id" --suite "$suite"
+    --workload-name "$workload" --required-publication "$required_pub"
+    --combine-quant-mode "$combine_q")
+  [ -n "$eplb" ] && ep_args+=(--eplb)
+  [ -n "$workload_dir" ] && ep_args+=(--workload-dir "$workload_dir")
+  [ -n "${CX_COMBINE_DTYPE:-}" ] && ep_args+=(--combine-dtype "$CX_COMBINE_DTYPE")
+  attempts=1
+  [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))   # only flashinfer is retried
+  attempt=1
+  case_ok=0
+  while [ "$attempt" -le "$attempts" ]; do
+    export CX_ATTEMPT_ID="$attempt"
+    attempt_tag="a$(printf '%02d' "$attempt")"
+    out="results/${case_stem}_${attempt_tag}_${dtype}_${mode}.json"   # relative to the container workdir
+    failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json"
+    set +e
+    timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks="$NGPUS" \
+      --ntasks-per-node="$GPN" --container-name="$CNAME" "${CMOUNT[@]}" \
+      --export="$EP8_EXPORTS" \
+      bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" </dev/null 2>&1 | tail -8
+    run_status=("${PIPESTATUS[@]}")   # must be read immediately after the pipeline
+    set -e
+    run_rc="${run_status[0]}"   # status of timeout/srun, not of tail
+    expected_out="$MOUNT_SRC/experimental/CollectiveX/$out"   # same file as seen from the submit host
+    if [ "$run_rc" = 0 ] && cx_has_result_doc "$expected_out"; then
+      case_ok=1
+      break
+    fi
+    [ "$run_rc" = 0 ] && run_rc=1   # rc 0 without a result document still counts as a failure
+    if cx_has_result_doc "$expected_out"; then
+      cx_demote_result_doc "$expected_out" "$run_rc" \
+        || { rm -f "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; }
+    else
+      cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"
+    fi
+    [ "$attempt" -lt "$attempts" ] && cx_log "EP${NGPUS}[$ci] attempt $attempt/$attempts failed; retrying"
+    attempt=$((attempt + 1))
+  done
+  if [ "$case_ok" = 0 ]; then
+    failed_cases=$((failed_cases + 1))
+    cx_log "ERROR: EP${NGPUS}[$ci] failed after $attempts attempt(s)"
+  fi
+done < <(cx_ep8_cases)   # process substitution keeps the loop, and its counters, in this shell
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+[ "$ci" -gt 0 ] || cx_die "no EP cases were read from cx_ep8_cases (empty or unreadable shard file?)"; [ "$failed_cases" -eq 0 ] || exit 1   # zero cases must not pass
diff --git a/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
new file mode 100644
index 0000000000..a68716a5ab
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h100-dgxc-slurm.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# CollectiveX — H100 single-node SKU adapter (8x H100, NVLink island, x86_64, SM90).
+#
+# Allocates, then hands off to run_in_container.sh.
+# The promoted DeepEP path is normal-mode BF16; FP8/LL remain manual diagnostics.
+# Scheduling and compute-visible storage are supplied by the runner-local config.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-h100-dgxc-slurm}"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR
+PARTITION="$CX_PARTITION"
+ACCOUNT="$CX_ACCOUNT"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}"   # optional; --exclude is passed to salloc only when non-empty
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"               # passed to salloc --time
+IMAGE="${CX_IMAGE:-$(cx_default_image h100)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+MOUNT_DIR=/ix                           # where the staged repo is mounted inside the container
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"     # UTC timestamp, exported as CX_TS
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h100-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-deepep}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME ngpus=$NGPUS bench=$CX_BENCH"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi   # squash + repo staging above still ran
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER_NAME"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" ${EXCLUDE_NODES:+--exclude="$EXCLUDE_NODES"} \
+          --gres=gpu:"$NGPUS" --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # cancel the allocation however this script exits
+
+# Run the benchmark in the container. Its status is captured (not left to `set -e`) so that
+# artifacts are still collected when the run fails; the script then exits with that status.
+srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" && run_rc=0 || run_rc=$?
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — result artifacts collected (run rc=$run_rc)"
+exit "$run_rc"
diff --git a/experimental/CollectiveX/launchers/launch_h200.sh b/experimental/CollectiveX/launchers/launch_h200.sh
new file mode 100644
index 0000000000..db42e2f389
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_h200.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# CollectiveX — H200 single-node SKU adapter (8x H200, NVLink island, x86_64, SM90).
+# Thin adapter: calls cx_ensure_squash and cx_stage_repo, takes one exclusive Slurm allocation,
+# runs runtime/run_in_container.sh under srun, then calls cx_collect_results.
+# Requires CX_PARTITION and CX_SQUASH_DIR; CX_ACCOUNT is optional; CX_DRYRUN=1 stops before salloc.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-h200}"
+cx_require_vars CX_PARTITION CX_SQUASH_DIR
+PARTITION="$CX_PARTITION"
+ACCOUNT="${CX_ACCOUNT:-}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-45}"
+IMAGE="${CX_IMAGE:-$(cx_default_image h200)}"
+SQUASH_DIR="$CX_SQUASH_DIR"
+MOUNT_DIR=/ix   # path inside the container where MOUNT_SRC is mounted
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="h200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-deepep}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+export NCCL_CUMEM_ENABLE=1
+
+cx_log "runner=$RUNNER_NAME ngpus=$NGPUS bench=$CX_BENCH"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER_NAME"
+
+JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${ACCOUNT:+--account="$ACCOUNT"} --gres=gpu:"$NGPUS" \
+          --exclusive --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT  # release the allocation on every exit path
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — result artifacts collected"
diff --git a/experimental/CollectiveX/launchers/launch_mi325x-amds.sh b/experimental/CollectiveX/launchers/launch_mi325x-amds.sh
new file mode 100755
index 0000000000..86da5397c3
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi325x-amds.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# CollectiveX — MI325X (AMD CDNA3 gfx942, 8 GPU/node) wrapper.
+# Scheduling, exclusions, and storage are supplied by the runner-local config.
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# The shared adapter labels topology from RUNNER_NAME; default it so a manual run is not tagged mi355x.
+export RUNNER_NAME="${RUNNER_NAME:-mi325x-amds}"
+# Same-host MoRI traffic uses the SDMA/XGMI path by default, whichever backend is selected.
+export MORI_DISABLE_AUTO_XGMI="${MORI_DISABLE_AUTO_XGMI:-0}"
+export MORI_ENABLE_SDMA="${MORI_ENABLE_SDMA:-1}"
+# The CDNA3 EP adapter uses the newer MoRI image and AsyncLL kernel. The shared adapter
+# defaults an unset CX_BENCH to mori, so an unset value is treated as mori here as well.
+case "${CX_BENCH:-mori}" in
+  mori)
+    export CX_IMAGE="${CX_IMAGE:-rocm/sgl-dev:sglang-0.5.14-rocm720-mi35x-mori-0701}"
+    export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-asyncll}"
+    ;;
+esac
+# MoRI initialization diagnostics record the selected transport path.
+export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-info}"
+export MORI_SHMEM_LOG_LEVEL="${MORI_SHMEM_LOG_LEVEL:-info}"
+export MORI_IO_LOG_LEVEL="${MORI_IO_LOG_LEVEL:-info}"
+exec bash "$HERE/launch_mi355x-amds.sh"
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
new file mode 100644
index 0000000000..26555f2fa2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine.
+#
+# The ROCm path imports its squash in the allocation and uses writable/remapped
+# pyxis containers. Scheduling, exclusions, node pins, and storage come from the
+# runner-local config.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}"
+cx_require_vars CX_PARTITION CX_SQUASH_DIR
+PARTITION="$CX_PARTITION"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-60}"   # generous: leaves room for a cold enroot import of the large ROCm image
+# IMAGE is resolved further down, once CX_BENCH is final, so image selection sees the backend.
+SQUASH_DIR="$CX_SQUASH_DIR"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}"
+# Optional node pin overrides the exclusion list.
+NODELIST="${CX_NODELIST:-}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# AMD EP backends: MoRI and the portable NCCL/RCCL all-to-all reference.
+export CX_BENCH="${CX_BENCH:-mori}"
+case "$CX_BENCH" in
+  mori|nccl-ep) ;;
+  *) cx_die "unsupported AMD EP backend: $CX_BENCH" ;;
+esac
+# Resolve the image now that CX_BENCH and RUNNER_NAME are both final.
+IMAGE="${CX_IMAGE:-$(cx_default_image "$RUNNER_NAME")}"
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+# topology_class is part of comparison_key; label the actual SKU when the MI325X wrapper calls this.
+case "${RUNNER_NAME}" in
+  mi325x*) export CX_TOPO="mi325x-xgmi" ;;
+  *)       export CX_TOPO="mi355x-xgmi" ;;
+esac
+export CX_TRANSPORT="xgmi"
+# Allow a longer per-phase guard for large MoRI prefill points.
+export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$IMAGE")}"
+
+cx_log "runner=$RUNNER_NAME ngpus=$NGPUS bench=$CX_BENCH"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')"
+SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh"
+# Keep the import lock in a separately writable directory. CX_LOCK_DIR overrides.
+LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+cx_require_single_node "$RUNNER_NAME"
+
+# Pin to specific nodes when configured, otherwise apply the optional exclusion list.
+if [ -n "$NODELIST" ]; then
+  cx_log "using configured node pin"
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \
+            --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+else
+  JOB_ID="$(cx_salloc_jobid --partition="$PARTITION" ${EXCLUDE_NODES:+--exclude="$EXCLUDE_NODES"} --gres=gpu:"$NGPUS" \
+            --exclusive --cpus-per-task=128 --time="$TIME_MIN" --job-name="$RUNNER_NAME")"
+fi
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+# Clear stray containers, then enroot-import to the node-local squash (flock,
+# </dev/null so a missing token can't hang). Both run on the allocated node. Only the lock-open
+# error is muted: a bare 'exec 9>FILE 2>/dev/null' would discard the shell's stderr for good.
+# shellcheck disable=SC2016  # $(...) must expand on the remote node, not here
+srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
+srun --jobid="$JOB_ID" bash -c "
+  mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true
+  { exec 9>\"$LOCK_FILE\"; } 2>/dev/null || { echo 'cannot open configured squash lock' >&2; exit 1; }
+  flock -w 600 9 || { echo 'configured squash lock timed out' >&2; exit 1; }
+  if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
+    echo 'container squash ready'
+  else
+    rm -f \"$SQUASH_FILE\" 2>/dev/null
+    enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" </dev/null >/dev/null 2>&1
+  fi
+"
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them for the next checkout.
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+cx_log "done — result artifacts collected"
diff --git a/experimental/CollectiveX/make_bundle.py b/experimental/CollectiveX/make_bundle.py
new file mode 100644
index 0000000000..02429291ff
--- /dev/null
+++ b/experimental/CollectiveX/make_bundle.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+"""CollectiveX publication bundle generator (goal P1: continuous benchmark infrastructure).
+
+Turns a validated aggregate into ONE self-contained, citable directory:
+
+    bundle/
+      manifest.json      bundle format, source run provenance, coverage + validation counts
+      <aggregate>.ndjson the schema-validated dataset (copied verbatim)
+      SHA256SUMS         checksums of every file above
+
+Fail-loud doctrine: every doc in the aggregate is validated (versioned EP result schema +
+validate_results semantic gates) BEFORE anything is written; any schema error or
+publication_status tamper aborts the bundle with a non-zero exit. A bundle therefore
+certifies its own dataset — nothing lands in it that the validator has not passed.
+
+  python3 make_bundle.py --aggregate results/aggregate/collectivex_all_123.ndjson \
+      --out-dir results/bundle --source-run-id 123 --source-sha abc --source-run-url https://...
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import shutil
+import sys
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+import validate_results as vr  # noqa: E402
+import capability as cap  # noqa: E402
+from artifact_safety import assert_publication_safe  # noqa: E402
+
+BUNDLE_FORMAT = 1  # written to manifest.json as "bundle_format"
+PUBLICATION_RANK = {  # publication_status ordering: observed must rank >= the case's required tier
+    "failed": 0,
+    "invalid": 0,
+    "diagnostic": 1,
+    "valid": 1,
+    "comparable-experimental": 2,
+    "official": 3,
+}
+PHASE_TOKEN_DEFAULTS = {  # token ladder assumed for a phase when a case leaves its ladder blank
+    "decode": (1, 2, 4, 8, 16, 32, 64, 128),
+    "prefill": (128, 256, 512, 1024, 2048, 4096),
+}
+SKU_PREFIXES = (  # a label maps to the first entry it equals or extends with "-" / "_"
+    "h100-dgxc", "h200-dgxc", "b200-dgxc", "mi355x", "mi325x", "gb300", "gb200", "b300",
+)
+
+
+def _sku_of(doc: dict) -> str:
+    """Return the leading SKU token of the runner name ('gb300-8x' -> 'gb300'), else 'unknown'."""
+    head, _, _ = str(doc.get("runner") or "unknown").partition("_")
+    return head.partition("-")[0] or "unknown"
+
+
+def _normalized_sku(value) -> str:
+    """Return the first SKU_PREFIXES entry that *value* equals or extends, else 'unknown'."""
+    label = str(value or "unknown").lower()
+    hits = [sku for sku in SKU_PREFIXES if label == sku or label.startswith((f"{sku}-", f"{sku}_"))]
+    return hits[0] if hits else "unknown"
+
+
+def _sha256(path: str) -> str:  # hex SHA-256 of the file at path, streamed in 1 MiB blocks
+    with open(path, "rb") as stream:
+        digest = hashlib.sha256()
+        for block in iter(lambda: stream.read(1 << 20), b""):
+            digest.update(block)
+        return digest.hexdigest()
+
+
+def _load_ndjson(path: str) -> list[dict]:
+    """Load every JSON object from NDJSON *path* (blank lines skipped); exit on a bad line."""
+    docs = []
+    with open(path, encoding="utf-8") as fh:  # JSON text is UTF-8; do not follow the locale
+        for lineno, raw in enumerate(fh, start=1):
+            text = raw.strip()
+            if not text:
+                continue
+            try:
+                value = json.loads(text)
+            except json.JSONDecodeError as exc:
+                raise SystemExit(f"bundle: {path}:{lineno} is not JSON ({exc}) — refusing to bundle")
+            if not isinstance(value, dict):
+                raise SystemExit(
+                    f"bundle: {path}:{lineno} is not a JSON object — refusing to bundle")
+            docs.append(value)
+    return docs
+
+
+def validate(docs: list[dict], schema: dict | None) -> dict:
+    """Run the safety, per-doc and cross-doc gates; return status/family tallies or exit."""
+    assert_publication_safe(docs)
+    status_counts: dict[str, int] = {}
+    family_counts: dict[str, int] = {}
+    failures = 0
+    for index, doc in enumerate(docs):
+        label = f"doc[{index}]"
+        family = doc.get("family") or "unknown"
+        family_counts[family] = family_counts.get(family, 0) + 1
+        if family != "moe":
+            raise SystemExit(
+                f"bundle: {label} has unsupported family {family!r}; CollectiveX v1 is EP-only")
+        problems, _, status = vr.validate_doc(doc, schema, label)
+        status_counts[status] = status_counts.get(status, 0) + 1
+        origin = f"{doc.get('backend')}/{doc.get('runner')}"
+        for problem in problems:
+            failures += 1
+            print(f"bundle: INVALID {label} ({origin}): {problem}", file=sys.stderr)
+    if failures:
+        raise SystemExit(
+            f"bundle: {failures} validation error(s) — refusing to publish a tainted bundle")
+    issues = vr.cross_document_workload_issues(docs)
+    if issues:
+        raise SystemExit(
+            "bundle: cross-document workload identity failed: " + "; ".join(issues[:8]))
+    return {"by_publication_status": status_counts, "by_family": family_counts, "errors": 0}
+
+
+def coverage(docs: list[dict]) -> dict:
+    """Summarize what *docs* span: SKUs, backends, world sizes, contracts, schema versions.
+
+    Falsy values (None only, for schema_version) are skipped; newest_result_at may be None.
+    """
+    stamps = [str(doc.get("generated_at") or "") for doc in docs]
+    latest = max(stamps, default="")
+    return {
+        "skus": sorted({_sku_of(doc) for doc in docs}),
+        "backends": sorted({doc["backend"] for doc in docs if doc.get("backend")}),
+        "world_sizes": sorted({int(doc["world_size"]) for doc in docs if doc.get("world_size")}),
+        "measurement_contracts": sorted(
+            {doc["measurement_contract"] for doc in docs if doc.get("measurement_contract")}),
+        "schema_versions": sorted(
+            {int(doc["schema_version"]) for doc in docs if doc.get("schema_version") is not None}),
+        "newest_result_at": latest or None,
+    }
+
+
+def _tokens(value, phase: str) -> tuple[int, ...]:
+    """Return the sorted token ladder for *value* (comma/space string or iterable).
+
+    None or "" selects PHASE_TOKEN_DEFAULTS for *phase*, or () for an unknown phase.
+    """
+    if value is None or value == "":
+        return PHASE_TOKEN_DEFAULTS.get(str(phase), ())
+    items = value.replace(",", " ").split() if isinstance(value, str) else value
+    return tuple(sorted(map(int, items)))
+
+
+def _expected_case_identity(case: dict) -> dict:
+    """Coerce the scheduled fields present in *case* to their comparison types.
+
+    Absent fields are left out; "ladder" is emitted as "tokens" and "_sku" as "sku".
+    """
+    text_fields = ("suite", "workload", "required_publication", "backend", "mode", "dtype",
+                   "contract", "routing", "phase", "combine_quant_mode", "resource_mode",
+                   "activation_profile", "placement", "uneven_tokens", "warmup_semantics")
+    # Plain coercions, applied in this order so the identity keeps a stable key order.
+    coercions = (
+        (text_fields, str),
+        (("eplb", "canonical"), bool),
+        (("ep", "samples_per_point", "gpus_per_node", "scale_up_domain"), int),
+    )
+    identity = {name: convert(case[name])
+                for names, convert in coercions for name in names if name in case}
+    # Integer fields where a falsy scheduled value means the paired default.
+    fallbacks = (("hidden", 7168), ("topk", 8), ("experts", 256),
+                 ("routing_step", 0), ("nodes", 1))
+    identity.update({name: int(case[name] or default)
+                     for name, default in fallbacks if name in case})
+    if "timing" in case:
+        identity["timing"] = tuple(map(int, str(case["timing"]).split(":")))
+    if "ladder" in case:
+        identity["tokens"] = _tokens(case["ladder"], str(case.get("phase") or ""))
+    if "_sku" in case:
+        identity["sku"] = _normalized_sku(case["_sku"])
+    return identity
+
+
+def _actual_case_identity(doc: dict) -> dict:
+    """Build the identity of a result document for comparison with its scheduled case.
+
+    A "failed-case" record replays its failure.case through _expected_case_identity.
+    """
+    def _section(parent: dict, key: str) -> dict:
+        # A missing or non-dict section reads as an empty dict.
+        found = parent.get(key)
+        return found if isinstance(found, dict) else {}
+
+    if doc.get("record_type") == "failed-case":
+        case = dict(_section(_section(doc, "failure"), "case"))
+        for alias, field in (("dispatch_dtype", "dtype"), ("tokens_ladder", "ladder")):
+            if alias in case:
+                case[field] = case[alias]
+        case["_sku"] = doc.get("runner")
+        if "iters" in case and "trials" in case and "warmup" in case:
+            case["timing"] = f"{case['iters']}:{case['trials']}:{case['warmup']}"
+        return _expected_case_identity(case)
+
+    shape = _section(doc, "shape")
+    quant = _section(shape, "quant")
+    repro = _section(doc, "reproduction")
+    placement = _section(doc, "placement")
+    workload = _section(doc, "workload")
+    return {
+        "suite": doc.get("suite"),
+        "workload": doc.get("workload_name"),
+        "required_publication": doc.get("required_publication"),
+        "backend": doc.get("backend"),
+        "mode": doc.get("mode"),
+        "dtype": shape.get("dispatch_dtype", repro.get("dispatch_dtype")),
+        "contract": doc.get("measurement_contract"),
+        "routing": shape.get("routing"),
+        "phase": doc.get("phase"),
+        "ep": doc.get("ep_size"),
+        "eplb": bool(shape.get("eplb", False)),
+        "combine_quant_mode": quant.get(
+            "combine_quant_mode", repro.get("combine_quant_mode", "none")),
+        "resource_mode": doc.get("resource_mode"),
+        "activation_profile": shape.get("activation_profile", repro.get("activation_profile", "normal")),
+        "placement": placement.get("kind", "packed"),
+        "routing_step": int(shape.get("routing_step", repro.get("routing_step", 0))),
+        "uneven_tokens": shape.get("uneven_tokens", repro.get("uneven_tokens", "none")),
+        "hidden": shape.get("hidden"),
+        "topk": shape.get("topk"),
+        "experts": shape.get("num_logical_experts") or shape.get("experts"),
+        "samples_per_point": repro.get("samples_per_point"),
+        "warmup_semantics": repro.get("warmup_semantics"),
+        "timing": (repro.get("iters"), repro.get("trials"), repro.get("warmup")),
+        "canonical": workload.get("source") == "canonical-serialized",
+        "nodes": int(doc.get("nodes") or placement.get("nodes") or 1),
+        "gpus_per_node": placement.get("gpus_per_node"),
+        "scale_up_domain": placement.get("scale_up_domain"),
+        "tokens": tuple(sorted(
+            int(row["tokens_per_rank"]) for row in doc.get("rows", [])
+            if row.get("tokens_per_rank") is not None)),
+        "sku": _normalized_sku(doc.get("runner")),
+    }
+
+
+def _identity_differences(expected: dict, doc: dict) -> list[str]:
+    """List "field=actual!=expected" for each scheduled field that *doc* does not match."""
+    wanted, observed = _expected_case_identity(expected), _actual_case_identity(doc)
+    mismatches = []
+    for field, value in wanted.items():
+        if observed.get(field) != value:
+            mismatches.append(f"{field}={observed.get(field)!r}!={value!r}")
+    return mismatches
+
+
+def _require_platform_fields(kind: str, label, entry: dict, platform: dict, sku: str) -> None:
+    """Exit unless *entry* declares the platform's gpus_per_node and scale_up_domain."""
+    for field in ("gpus_per_node", "scale_up_domain"):
+        if int(entry.get(field) or 0) != int(platform[field]):
+            raise SystemExit(f"bundle: {kind} {label!r} has {field}={entry.get(field)!r}, "
+                             f"expected {platform[field]} for {sku}")
+
+
+def validate_expected_coverage(docs: list[dict], matrix: dict) -> dict:
+    """Require one semantically matching, sufficiently published result per scheduled case."""
+    expected: dict[str, dict] = {}
+    for shard in matrix.get("include", []):
+        sku = _normalized_sku(shard.get("sku"))
+        platform = cap.PLATFORMS.get(sku)
+        if shard.get("sku") and platform is None:
+            raise SystemExit(f"bundle: unknown matrix SKU {shard.get('sku')!r}")
+        if platform:
+            _require_platform_fields("shard", shard.get("id"), shard, platform, sku)
+        for case in shard.get("cases", []):
+            if platform:
+                _require_platform_fields("case", case.get("case_id"), case, platform, sku)
+            case_id = case.get("case_id")
+            if not case_id:
+                raise SystemExit("bundle: expected matrix case is missing case_id")
+            # Result ids are str()-normalized below; key the expected side the same way.
+            case_id = str(case_id)
+            if case_id in expected:
+                raise SystemExit(f"bundle: duplicate expected case_id {case_id}")
+            expected[case_id] = {**case, **({"_sku": shard["sku"]} if shard.get("sku") else {})}
+
+    actual: dict[str, list[dict]] = {}
+    missing_identity = 0
+    identity_mismatch = []
+    for doc in docs:
+        if doc.get("family") != "moe":
+            continue
+        case_id = doc.get("case_id")
+        if not case_id:
+            missing_identity += 1
+            continue
+        case_id = str(case_id)
+        if case_id in expected:
+            differences = _identity_differences(expected[case_id], doc)
+            if differences:
+                identity_mismatch.append(f"{case_id}:" + ",".join(differences))
+                continue
+        actual.setdefault(case_id, []).append(doc)
+
+    missing = sorted(set(expected) - set(actual))
+    extra = sorted(set(actual) - set(expected))
+    duplicates = sorted(case_id for case_id, values in actual.items() if len(values) != 1)
+    under_tier = []
+    for case_id in sorted(set(expected) & set(actual)):
+        if len(actual[case_id]) != 1:
+            continue
+        required = expected[case_id].get("required_publication") or "diagnostic"
+        observed = actual[case_id][0].get("publication_status") or "invalid"
+        if PUBLICATION_RANK.get(str(observed), -1) < PUBLICATION_RANK.get(str(required), 99):
+            under_tier.append(f"{case_id}:{observed}<{required}")
+
+    if missing_identity or missing or extra or duplicates or under_tier or identity_mismatch:
+        details = (
+            f"missing_identity={missing_identity} missing={missing[:8]} extra={extra[:8]} "
+            f"duplicates={duplicates[:8]} under_tier={under_tier[:8]} "
+            f"identity_mismatch={identity_mismatch[:8]}"
+        )
+        raise SystemExit(f"bundle: expected-matrix coverage failed ({details})")
+    return {"expected": len(expected), "observed": len(actual), "complete": True}
+
+
+def main() -> int:
+    """Validate the aggregate(s), then write their copies, manifest.json and SHA256SUMS."""
+    ap = argparse.ArgumentParser(description="CollectiveX publication bundle generator")
+    ap.add_argument("--aggregate", nargs="+", required=True, help="aggregate .ndjson file(s)")
+    ap.add_argument("--out-dir", default=os.path.join(HERE, "results", "bundle"))
+    ap.add_argument("--schema", default="",
+                    help="override with one schema for all EP docs; blank selects v3-v5 per doc")
+    ap.add_argument("--source-run-id", default=os.environ.get("GITHUB_RUN_ID", ""))
+    ap.add_argument("--source-sha", default=os.environ.get("GITHUB_SHA", ""))
+    ap.add_argument("--source-run-url", default="")
+    ap.add_argument("--source-workflow", default=os.environ.get("GITHUB_WORKFLOW", ""))
+    ap.add_argument("--matrix", default="", help="resolved matrix_full.json for exact case coverage")
+    a = ap.parse_args()
+
+    schema = None if a.schema else vr.load_schema_registry()
+    if a.schema:
+        with open(a.schema) as fh:  # close the override schema instead of leaking the handle
+            schema = json.load(fh)
+    names = [os.path.basename(path) for path in a.aggregate]
+    if len({*names, "manifest.json", "SHA256SUMS"}) != len(names) + 2:  # copies land flat in out-dir
+        raise SystemExit(f"bundle: aggregate file names collide inside the bundle: {sorted(names)}")
+    docs: list[dict] = []
+    for path in a.aggregate:
+        if not os.path.exists(path):
+            raise SystemExit(f"bundle: aggregate not found: {path}")
+        docs.extend(_load_ndjson(path))
+    if not docs:
+        raise SystemExit("bundle: aggregate is empty — nothing to publish")
+
+    validation = validate(docs, schema)
+    matrix_coverage = None
+    if a.matrix:
+        with open(a.matrix) as fh:
+            matrix_coverage = validate_expected_coverage(docs, json.load(fh))
+    os.makedirs(a.out_dir, exist_ok=True)
+    files = [os.path.join(a.out_dir, name) for name in names]
+    for src, dst in zip(a.aggregate, files):
+        shutil.copyfile(src, dst)
+    manifest = {
+        "bundle_format": BUNDLE_FORMAT,
+        "generated_at": _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "source": {"run_id": a.source_run_id or None, "sha": a.source_sha or None,
+                   "run_url": a.source_run_url or None, "workflow": a.source_workflow or None},
+        "docs": len(docs),
+        "validation": validation,
+        "coverage": {**coverage(docs), **({"matrix": matrix_coverage} if matrix_coverage else {})},
+        "files": {os.path.basename(p): {"sha256": _sha256(p), "bytes": os.path.getsize(p)}
+                  for p in files},
+    }
+    mpath = os.path.join(a.out_dir, "manifest.json")
+    with open(mpath, "w") as fh:
+        json.dump(manifest, fh, indent=2)
+    files.append(mpath)
+    with open(os.path.join(a.out_dir, "SHA256SUMS"), "w") as fh:
+        fh.writelines(f"{_sha256(p)}  {os.path.basename(p)}\n" for p in files)
+    print(f"bundle: {len(docs)} docs -> {a.out_dir} "
+          f"({', '.join(sorted(os.path.basename(p) for p in files))}, SHA256SUMS)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 0000000000..107c634037
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,7 @@
+# Matrix and strict artifact validation dependencies. GPU libraries are supplied by
+# each benchmark container and are intentionally not installed from this file.
+PyYAML>=6.0.2,<7
+jsonschema>=4.23,<5
+
+# Canonical workload serialization and the independent correctness oracle.
+numpy>=1.26,<3
diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep
new file mode 100644
index 0000000000..1d32937f98
--- /dev/null
+++ b/experimental/CollectiveX/results/.gitkeep
@@ -0,0 +1,2 @@
+# Transient CollectiveX shard, aggregate, and bundle output lands here.
+# Keep this file so the directory exists before the first run.
diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh
new file mode 100644
index 0000000000..bc734bfdac
--- /dev/null
+++ b/experimental/CollectiveX/runtime/common.sh
@@ -0,0 +1,322 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
+cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }
+
+# Runner-local deployment settings are deliberately kept outside the checkout.
+# The file is trusted shell input owned by the runner operator.
+cx_load_operator_config() {
+  [ -n "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" ] && return 0
+  local config="${COLLECTIVEX_OPERATOR_CONFIG:-${XDG_CONFIG_HOME:-${HOME}/.config}/inferencex/collectivex.env}"
+  if [ -r "$config" ]; then
+    # shellcheck disable=SC1090
+    source "$config"
+  fi
+  export COLLECTIVEX_OPERATOR_CONFIG_LOADED=1
+}
+
+cx_require_vars() {
+  local name
+  local -a missing=()
+  for name in "$@"; do
+    [ -n "${!name:-}" ] || missing+=("$name")
+  done
+  [ "${#missing[@]}" -eq 0 ] || cx_die \
+    "missing runner-local configuration: ${missing[*]} (set them in COLLECTIVEX_OPERATOR_CONFIG)"
+}
+
+cx_require_single_node() {
+  [ "${CX_NODES:-1}" = "1" ] || cx_die "$1 supports one-node EP only"
+}
+
+cx_apply_timing_profile() {  # optional CX_TIMING="iters:trials:warmup" -> exports CX_ITERS/CX_TRIALS/CX_WARMUP
+  [ -n "${CX_TIMING:-}" ] || return 0  # unset/empty: leave any existing CX_ITERS/CX_TRIALS/CX_WARMUP alone
+  local iters trials warmup extra
+  IFS=: read -r iters trials warmup extra <<< "$CX_TIMING"  # 'extra' catches any fourth-and-later field
+  [[ "$iters" =~ ^[1-9][0-9]*$ && "$trials" =~ ^[1-9][0-9]*$ \
+    && "$warmup" =~ ^[1-9][0-9]*$ && -z "$extra" ]] \
+    || cx_die "CX_TIMING must be positive iters:trials:warmup"
+  export CX_ITERS="$iters" CX_TRIALS="$trials" CX_WARMUP="$warmup"
+}
+
+cx_load_operator_config
+
+# Allocate via salloc (--no-shell is appended) and echo the GRANTED Slurm job id, parsed from
+# salloc's OWN output. Use INSTEAD of `salloc ...; JOB_ID=$(squeue --name=<name> -h -o %A | head -1)`:
+# that lookup is not unique per allocation, so concurrent cells can resolve a sibling allocation.
+# Parsing salloc's own "Granted job allocation N" is race-free; raw scheduler output stays private.
+cx_salloc_jobid() {  # args: salloc options; stdout: the granted job id, or nothing if none was granted
+  local _t; _t="$(mktemp)"  # scratch file holding salloc's combined stdout+stderr
+  salloc "$@" --no-shell >"$_t" 2>&1 || true  # a salloc failure is tolerated here and shows up as empty output
+  sed -n 's/.*Granted job allocation \([0-9][0-9]*\).*/\1/p' "$_t" | head -n1  # first matching line only
+  rm -f "$_t"
+}
+
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built
+# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and
+# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected
+# multi-arch index digest is recorded for provenance/verification:
+CX_IMAGE_MULTIARCH_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly.)
+# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep.
+# The arch-specific deepseek-v4-{blackwell,grace-blackwell} images do bundle
+# DeepEP, but are not multi-arch and are not the default.
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
+
+# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and
+# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the
+# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest-
+# pinned yet, so it is not promotion-eligible.
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+cx_default_image() {
+  case "$1" in
+    mi355x*|mi325x*) echo "$CX_IMAGE_AMD_MORI" ;;
+    b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
+    *) cx_die "no default image for runner prefix: $1" ;;
+  esac
+}
+
+cx_default_image_digest() {
+  [ "$1" = "$CX_IMAGE_MULTIARCH" ] && printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST"
+}
+
+# cx_ensure_squash <squash_dir> <image>  ->  echoes the squash file path.
+# Imports via enroot only if a valid squash is not already present (flock-guarded,
+# mirroring runners/launch_b200-dgxc.sh).
+cx_ensure_squash() {  # <squash_dir> <image> -> stdout: squash path; dies (cx_die) on lock/import failure
+  local squash_dir="$1" image="$2"
+  mkdir -p "$squash_dir" 2>/dev/null || true
+  local key sq locks lock_fd
+  key="$(printf '%s' "$image" | tr '/:@#' '____')"  # tr, not sed: '#' inside [...] would end a '#'-delimited s command
+  sq="$squash_dir/${key}.sqsh"
+  locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true
+  { exec {lock_fd}>"$locks/${key}.lock"; } 2>/dev/null \
+    || cx_die "cannot open the configured squash lock"
+  flock -w 900 "$lock_fd" || cx_die "configured squash lock timed out"  # per-image lock, 900 s wait
+  if unsquashfs -l "$sq" >/dev/null 2>&1; then  # a listable squash counts as valid; anything else is re-imported
+    cx_log "container squash ready"
+  else
+    cx_log "importing configured container image"
+    rm -f "$sq" 2>/dev/null || cx_die "cannot replace the configured squash"
+    # </dev/null: never block on an interactive password prompt.
+    enroot import -o "$sq" "docker://$image" </dev/null >/dev/null 2>&1 \
+      || cx_die "configured container image import failed"
+    unsquashfs -l "$sq" >/dev/null 2>&1 \
+      || cx_die "configured container image produced an invalid squash"
+  fi
+  flock -u "$lock_fd"
+  exec {lock_fd}>&-
+  echo "$sq"  # the only stdout output, so callers can capture it
+}
+
+# cx_stage_repo <repo_root> <stage_dir>  ->  echoes the mount-source root.
+# Some deployments do not cross-mount the runner workspace to compute nodes. If
+# CX_STAGE_DIR is set, rsync the CollectiveX tree onto a compute-visible shared
+# filesystem and mount from there. No-op (echo repo_root) when
+# stage_dir is empty or equals repo_root.
+cx_stage_repo() {
+  local repo_root="$1" stage_dir="${2:-}"
+  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
+    echo "$repo_root"; return 0
+  fi
+  # Concurrency isolation. Under GHA the per-config concurrency fan-out runs many
+  # same-SKU dispatches at once, all staging into the SAME shared base dir; a
+  # shared dir + `rsync --delete` lets one job unlink/replace a file a peer is
+  # mid-read of -> "error reading input file: Stale file handle" on the next
+  # `srun ... run_in_container.sh`. Give each EXECUTING job its own subdir keyed on
+  # a workflow-provided execution id. Outside GHA, keep the single shared dir.
+  local tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-}}"
+  if [ -n "$tag" ]; then
+    stage_dir="$stage_dir/job_$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+  fi
+  mkdir -p "$stage_dir/experimental" 2>/dev/null \
+    || cx_die "cannot create the configured stage directory"
+  cx_log "staging CollectiveX on compute-visible storage"
+  rsync -a --delete --delete-excluded \
+    --exclude='__pycache__/' --exclude='results/' --exclude='.cx_workloads/' \
+    --exclude='configs/platforms.yaml' --exclude='private-infra.md' \
+    --exclude='goal.md' --exclude='notes.md' \
+    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >/dev/null 2>&1 \
+    || cx_die "staging CollectiveX failed"
+  echo "$stage_dir"
+}
+
+# cx_collect_results <mount_src> <repo_root>
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+cx_collect_results() {  # <mount_src> <repo_root>; always returns 0 unless mkdir fails under errexit
+  local mount_src="$1" repo_root="$2" dst
+  [ "$mount_src" = "$repo_root" ] && return 0  # no staging was used: results are already in the checkout
+  dst="$repo_root/experimental/CollectiveX/results"
+  mkdir -p "$dst"
+  cp "$mount_src/experimental/CollectiveX/results/"*.json "$dst/" 2>/dev/null || true  # best-effort; top-level *.json only
+  cx_log "collected staged results for artifact validation"  # logged even when cp copied nothing
+}
+
+# Return success only when a benchmark output is a complete JSON result object.
+# Callers use this before synthesizing a failed-case so an emitted invalid result
+# is not shadowed by a second record for the same attempt.
+cx_has_result_doc() {
+  local path="$1"
+  [ -f "$path" ] || return 1
+  python3 - "$path" <<'PY' >/dev/null 2>&1
+import json
+import sys
+
+try:
+    with open(sys.argv[1]) as fh:
+        doc = json.load(fh)
+except (OSError, json.JSONDecodeError):
+    raise SystemExit(1)
+
+is_result = (
+    isinstance(doc, dict)
+    and doc.get("schema_version") is not None
+    and doc.get("family") is not None
+    and any(key in doc for key in ("publication_status", "status", "record_type"))
+)
+raise SystemExit(0 if is_result else 1)
+PY
+}
+
+# A rank-zero result can be written before another rank or backend teardown fails. Preserve its
+# measurements, but make the distributed command's nonzero terminal status authoritative.
+cx_demote_result_doc() {
+  local path="$1" rc="$2"
+  python3 - "$path" "$rc" <<'PY'
+import json
+import os
+import sys
+
+path, rc_text = sys.argv[1:3]
+with open(path) as fh:
+    doc = json.load(fh)
+if not isinstance(doc, dict):
+    raise SystemExit(1)
+validity = doc.get("validity")
+if not isinstance(validity, dict):
+    validity = {}
+doc["validity"] = {**validity, "execution_status": "failed"}
+doc["publication_status"] = "failed"
+doc["status"] = "invalid"
+doc["post_emit_failure"] = {"return_code": int(rc_text)}
+tmp = f"{path}.tmp"
+with open(tmp, "w") as fh:
+    json.dump(doc, fh, indent=2)
+    fh.write("\n")
+os.replace(tmp, path)
+PY
+}
+
+# cx_emit_ep_failed_case <out> <backend> <phase> <return-code>
+# Preserve failures from rack launchers that invoke run_ep.py directly and therefore cannot use
+# run_in_container.sh's emitter. Case identity is read from the exported CX_* variables.
+cx_emit_ep_failed_case() {  # <out> <backend> <phase> <return-code>; never fails the caller (WARN on error)
+  local out="$1" backend="$2" phase="$3" rc="$4"
+  python3 - "$out" "$backend" "$phase" "$rc" <<'PY' || \
+    cx_log "WARN: could not preserve failed-case record"
+import datetime as dt
+import json
+import os
+import sys
+
+out, backend, phase, rc_text = sys.argv[1:5]
+rc = int(rc_text)
+
+
+def env(name, default=""):
+    return os.environ.get(name) or default  # empty counts as unset, like the shell's ${VAR:-default}
+
+
+def integer(name, default):
+    try:
+        return int(env(name, str(default)))
+    except ValueError:
+        return default
+
+
+def enabled(name):
+    return env(name).lower() in {"1", "true", "yes"}
+
+
+failure_mode = {  # exit status -> coarse failure class; anything else is "unknown"
+    5: "unsupported", 124: "timeout", 137: "timeout", 134: "deadlock",
+}.get(rc, "unknown")
+case = {  # scheduled identity of the attempt, read from the exported CX_* variables
+    "case_id": env("CX_CASE_ID") or None,
+    "suite": env("CX_SUITE") or None,
+    "workload": env("CX_WORKLOAD_NAME") or None,
+    "required_publication": env("CX_REQUIRED_PUBLICATION") or None,
+    "backend": backend,
+    "phase": phase,
+    "ep": integer("CX_EP", integer("CX_NGPUS", 1)),
+    "gpus_per_node": integer("CX_GPUS_PER_NODE", integer("CX_NGPUS", 1)),
+    "scale_up_domain": integer("CX_SCALE_UP_DOMAIN", integer("CX_NGPUS", 1)),
+    "dispatch_dtype": env("CX_DISPATCH_DTYPE", "bf16"),
+    "mode": env("CX_MODE", "normal"),
+    "contract": env("CX_MEASUREMENT_CONTRACT", "layout-and-dispatch-v1"),
+    "routing": env("CX_ROUTING", "uniform"),
+    "eplb": enabled("CX_EPLB"),
+    "combine_quant_mode": env("CX_COMBINE_QUANT_MODE", "none"),
+    "resource_mode": env("CX_RESOURCE_MODE", "tuned"),
+    "activation_profile": env("CX_ACTIVATION_PROFILE", "normal"),
+    "placement": env("CX_PLACEMENT", "packed"),
+    "routing_step": env("CX_ROUTING_STEP", "0"),
+    "uneven_tokens": env("CX_UNEVEN_TOKENS", "none"),
+    "tokens_ladder": env("CX_TOKENS_LADDER"),
+    "canonical": enabled("CX_CANONICAL"),
+    "sampling_contract": "fixed-512-v1",
+    "samples_per_point": integer("CX_SAMPLES_PER_POINT", 512),
+    "iters": integer("CX_ITERS", 8),
+    "trials": integer("CX_TRIALS", 64),
+    "warmup": integer("CX_WARMUP", 32),
+    "warmup_semantics": env(
+        "CX_WARMUP_SEMANTICS", "full-roundtrip-per-trial-point-v1"
+    ),
+}
+record = {  # failed-case document: no measurement rows, failure details under "failure"
+    "schema_version": 5,
+    "family": "moe",
+    "record_type": "failed-case",
+    "generated_by": "runtime/common.sh",
+    "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+    "attempt_id": env("CX_ATTEMPT_ID", "1"),
+    "case_id": case["case_id"],
+    "suite": case["suite"],
+    "workload_name": case["workload"],
+    "required_publication": case["required_publication"],
+    "runner": env("CX_RUNNER"),
+    "backend": backend,
+    "mode": case["mode"],
+    "phase": phase,
+    "ep_size": case["ep"],
+    "measurement_contract": case["contract"],
+    "resource_mode": case["resource_mode"],
+    "topology_class": env("CX_TOPO"),
+    "status": "failed",
+    "publication_status": "failed",
+    "rows": [],
+    "failure": {
+        "failure_mode": failure_mode,
+        "return_code": rc,
+        "case": case,
+        "evidence": "",
+    },
+}
+os.makedirs(os.path.dirname(out) or ".", exist_ok=True)  # a bare filename has an empty dirname
+with open(out, "w") as fh:
+    json.dump(record, fh, indent=2)
+print(f"preserved failed-case record ({failure_mode})")
+PY
+}
diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh
new file mode 100644
index 0000000000..135f32701c
--- /dev/null
+++ b/experimental/CollectiveX/runtime/run_in_container.sh
@@ -0,0 +1,578 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter
+# (launch_<sku>.sh). The SKU adapter handles allocation/container/transport-env;
+# this script selects one EP backend from CX_BENCH and writes result JSON under results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector: CX_BENCH = deepep | mori | uccl | nccl-ep | flashinfer | deepep-hybrid
+# EP knobs passed to tests/run_ep.py:
+#   CX_PHASE = decode | prefill | both (default decode)   <- picks the token sweep
+#   CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point)
+#   CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_MODE(normal|ll)
+#   CX_NUM_SMS (DeepEP comm SMs) CX_SEED CX_ITERS
+set -euo pipefail
+
+cd /ix/experimental/CollectiveX
+# shellcheck source=../runtime/common.sh
+source runtime/common.sh
+mkdir -p results
+
+: "${CX_RUNNER:?CX_RUNNER not set}"
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+CX_BENCH="${CX_BENCH:-deepep}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"
+ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json"
+
+cx_apply_timing_profile
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+python3 env_capture.py --redact --out "$ENVJSON" --timestamp "$CX_TS"
+
+# Resolve the source-tokens-per-rank sweep: explicit CX_TOKENS_LADDER wins; else
+# the legacy single-point CX_TOKENS_PER_RANK becomes a one-point ladder; else
+# blank => tests/run_ep.py picks the phase default (decode small / prefill large).
+cx_ep_ladder() {
+  if [ -n "${CX_TOKENS_LADDER:-}" ]; then printf '%s' "$CX_TOKENS_LADDER"
+  elif [ -n "${CX_TOKENS_PER_RANK:-}" ]; then printf '%s' "$CX_TOKENS_PER_RANK"
+  else printf ''; fi
+}
+
+# Canonical workload staging (goal P1 "official" cohort). make_workloads.py is DETERMINISTIC, so
+# every SKU/backend generates byte-identical serialized traces in-container => identical workload_id
+# + checksum => proven cross-hardware workload identity with NO shared filesystem. When CX_CANONICAL=1
+# (and CX_WORKLOAD_DIR not already provided) we generate the routing's traces for the run's ladder
+# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and
+# point run_ep at it. A canonical-serialized run with full GHA provenance is publication 'official'.
+cx_stage_canonical() {
+  [ "${CX_CANONICAL:-0}" = "1" ] || return 0
+  [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0
+  local dir="$PWD/.cx_workloads"
+  local ladder; ladder="$(cx_ep_ladder)"
+  # cover both phase ladders when none is given, so either phase finds its files.
+  [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+  cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')"
+  python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \
+    --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \
+    --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \
+    || { cx_log "ERROR: canonical workload staging failed"; return 1; }
+  export CX_WORKLOAD_DIR="$dir"
+  cx_log "canonical workloads staged at $dir"
+}
+
+# emit_failed_case <backend> <phase> <rc>: preserve a failed case with its full scheduled
+# identity instead of letting it vanish.
+# run_ep_suite <backend> (defined after it): one tests/run_ep.py invocation per phase
+# (decode/prefill/both); dispatch and combine are timed separately inside it. One JSON per (backend, phase).
+emit_failed_case() {  # backend phase rc
+  cx_emit_ep_failed_case \
+    "results/failed_${CX_RUNNER}_${1}_${2}_${CX_TS}.json" "$1" "$2" "$3" || true
+}
+
+run_ep_suite() {  # <backend>; returns 1 if staging or any phase failed, else 0
+  local backend="$1" phase phases ladder rc=0 rc_run
+  ladder="$(cx_ep_ladder)"
+  phases="${CX_PHASE:-decode}"
+  [ "$phases" = "both" ] && phases="decode prefill"
+  if ! cx_stage_canonical; then
+    for phase in $phases; do
+      emit_failed_case "$backend" "$phase" 2
+    done
+    return 1
+  fi
+  for phase in $phases; do
+    cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-<phase-default>}'"
+    local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+    local -a EPARGS=(--backend "$backend" --phase "$phase" --tokens-ladder "$ladder" --mode "${CX_MODE:-normal}"
+      --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}"
+      --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-uniform}"
+      --num-sms "${CX_NUM_SMS:-24}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-8}"
+      --trials "${CX_TRIALS:-64}" --warmup "${CX_WARMUP:-32}"
+      --measurement-contract "${CX_MEASUREMENT_CONTRACT:-layout-and-dispatch-v1}"
+      --resource-mode "${CX_RESOURCE_MODE:-normalized}" --sm-fraction "${CX_SM_FRACTION:-0.18}"
+      --activation-profile "${CX_ACTIVATION_PROFILE:-normal}" --placement "${CX_PLACEMENT:-packed}"
+      --gpus-per-node "${CX_GPUS_PER_NODE:-0}" --scale-up-domain "${CX_SCALE_UP_DOMAIN:-0}"
+      --routing-step "${CX_ROUTING_STEP:-0}" --uneven-tokens "${CX_UNEVEN_TOKENS:-none}"
+      --combine-dtype "${CX_COMBINE_DTYPE:-bf16}" --combine-quant-mode "${CX_COMBINE_QUANT_MODE:-none}"
+      --case-id "${CX_CASE_ID:-}" --suite "${CX_SUITE:-}" --workload-name "${CX_WORKLOAD_NAME:-}"
+      --required-publication "${CX_REQUIRED_PUBLICATION:-}"
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+      --env-json "$ENVJSON" --out "$out")
+    case "${CX_EPLB:-}" in 1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]) EPARGS+=(--eplb) ;; esac  # same truthiness as the failed-case record
+    [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR")
+    [ -n "${CX_WAIVE_ANOMALY:-}" ] && EPARGS+=(--waive-anomaly)
+    rc_run=0  # preset so a successful run is recorded as 0
+    timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+      torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}" || rc_run=$?  # '||' keeps errexit from aborting first
+    if [ "$rc_run" != 0 ]; then
+      cx_log "WARN: $backend $phase run failed/timed out rc=$rc_run (CX_RUN_TIMEOUT=${CX_RUN_TIMEOUT:-900}s)"
+      if cx_has_result_doc "$out"; then  # a result was written before the failure: keep it, but mark it failed
+        cx_demote_result_doc "$out" "$rc_run" \
+          || { rm -f "$out"; emit_failed_case "$backend" "$phase" "$rc_run"; }
+        cx_log "preserved benchmark output as a failed attempt"
+      else
+        emit_failed_case "$backend" "$phase" "$rc_run"
+      fi
+      rc=1  # remember the failure but keep running the remaining phases
+    fi
+  done
+  return "$rc"
+}
+
+# Legacy direct-env diagnostic only. This installs DeepEP main and still drives `Buffer`; it is not
+# PR #605 `ElasticBuffer` V2 evidence and is intentionally absent from workflows and v1 matrices.
+# Keep the low-level hook while the real adapter is developed; its output must not be promoted.
+cx_build_deepep_v2() {  # no args; returns 1 on clone/build/import failure, 0 when built or already built
+  # IDEMPOTENT: SHARD mode calls dispatch_bench (hence this) once PER CASE. Build once per allocation,
+  # then skip — else a 60-case shard re-runs the from-source build 60x (force-reinstall) and blows the
+  # slurm --time. Sentinel lives in the container fs (persists across the x86 in-container case loop).
+  [ -f /tmp/.cx_built_deepep_v2 ] && { cx_log "legacy DeepEP diagnostic already built — skip"; return 0; }
+  local arch="9.0"; case "$CX_RUNNER" in b200*|gb200*|b300*|gb300*) arch="10.0";; esac  # gb200* added: 'b200*' does not match it
+  cx_log "legacy DeepEP main diagnostic: building source (TORCH_CUDA_ARCH_LIST=$arch)"
+  # PEP 668: newer images (H200/B300) ship an externally-managed Python that refuses `pip install`.
+  # PIP_BREAK_SYSTEM_PACKAGES is honored by pip>=23.0.1 and silently ignored by older pip (H100),
+  # so this is safe across every image; --break-system-packages as a flag would error on old pip.
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q "nvidia-nccl-cu13>=2.30.4" >&2 2>&1 || cx_log "WARN: nvidia-nccl-cu13 install warning"
+  rm -rf /tmp/DeepEP_v2
+  git clone --depth 1 https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_v2 >&2 2>&1 \
+    || { cx_log "ERROR: legacy DeepEP diagnostic clone failed"; return 1; }
+  DEEPEP_COMMIT="legacy-main-$(git -C /tmp/DeepEP_v2 rev-parse --short HEAD 2>/dev/null || echo main)"
+  export DEEPEP_COMMIT
+  ( cd /tmp/DeepEP_v2 && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+      pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1 \
+    || { cx_log "ERROR: legacy DeepEP diagnostic build failed (arch=$arch)"; return 1; }
+  python3 -c "import deep_ep; print('built deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+    || { cx_log "ERROR: legacy DeepEP diagnostic import failed"; return 1; }
+  : > /tmp/.cx_built_deepep_v2   # sentinel: skip rebuild on subsequent cases in this allocation
+  cx_log "legacy DeepEP diagnostic ready ($DEEPEP_COMMIT; non-publication)"
+}
+
+# Build the DeepEP `hybrid-ep` branch (NVIDIA's TMA-based impl: HybridEPBuffer, intranode NVLink +
+# internode RDMA/NIXL). Three container-specific fixes, all probe-confirmed on the B300 sglang image:
+#   1. CUDA-13 moved cccl/libcudacxx headers to <cuda>/include/cccl/ (not on nvcc's default path) —
+#      its nvshmem_tensor.h #includes <cuda/std/tuple> -> add that dir via CPATH/NVCC_PREPEND_FLAGS.
+#   2. The final link wants -l:libnvshmem_host.so but the bundled nvshmem ships only .so.3 -> create
+#      the unversioned symlink.
+#   3. NVSHMEM_DIR set to the bundled nvshmem enables build; unset => intranode-only (internode/LL off).
+# Intranode HybridEPBuffer (single NVLink domain, <=8 ranks) needs no multi-node/NVSHMEM bring-up.
+cx_build_deepep_hybrid() {  # no args; returns 1 on clone/build/import failure, 0 when built or already built
+  [ -f /tmp/.cx_built_deepep_hybrid ] && { cx_log "hybrid-ep already built this allocation — skip rebuild"; return 0; }
+  local arch="9.0"; case "$CX_RUNNER" in b200*|gb200*|b300*|gb300*) arch="10.0";; esac  # gb200* added: 'b200*' does not match it
+  cx_log "DeepEP hybrid-ep: building NVIDIA TMA branch from source (TORCH_CUDA_ARCH_LIST=$arch)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  NVSHMEM_DIR="$(python3 -c 'import os,nvidia.nvshmem as n; print(os.path.dirname(n.__file__))' 2>/dev/null || echo /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem)"
+  export NVSHMEM_DIR
+  local cccl; cccl="$(echo /usr/local/cuda*/targets/*/include/cccl | awk '{print $1}')"  # first glob match, if any
+  [ -d "$cccl" ] && { export CPATH="$cccl:${CPATH:-}"; export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}"; }
+  [ -e "$NVSHMEM_DIR/lib/libnvshmem_host.so.3" ] && ln -sf libnvshmem_host.so.3 "$NVSHMEM_DIR/lib/libnvshmem_host.so" 2>/dev/null || true
+  export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib:${LD_LIBRARY_PATH:-}"
+  rm -rf /tmp/DeepEP_hybrid
+  git clone --depth 1 --branch hybrid-ep https://github.com/deepseek-ai/DeepEP /tmp/DeepEP_hybrid >&2 2>&1 \
+    || { cx_log "ERROR: hybrid-ep git clone failed"; return 1; }
+  DEEPEP_COMMIT="hybrid-$(git -C /tmp/DeepEP_hybrid rev-parse --short HEAD 2>/dev/null || echo hybrid-ep)"
+  export DEEPEP_COMMIT
+  # Install into site-packages so the package persists across separate srun shells in the named
+  # container. The shared backend-env handoff below carries process-local loader/provenance values.
+  if ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+        pip install -q --no-build-isolation --force-reinstall . ) >&2 2>&1; then
+    cx_log "hybrid-ep installed into site-packages (persists across srun steps)"
+  else
+    cx_log "WARN: hybrid-ep pip install failed — falling back to build_ext --inplace (EP4 single-node only)"
+    ( cd /tmp/DeepEP_hybrid && TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 python3 setup.py build_ext --inplace ) >&2 2>&1 \
+      || { cx_log "ERROR: hybrid-ep build failed (arch=$arch; cccl/nvshmem?)"; return 1; }
+    export PYTHONPATH="/tmp/DeepEP_hybrid:${PYTHONPATH:-}"
+  fi
+  python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+    || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; }
+  : > /tmp/.cx_built_deepep_hybrid   # sentinel: skip rebuild on subsequent cases in this allocation
+  cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT)"
+}
+
+# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13
+# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668
+# images need PIP_BREAK_SYSTEM_PACKAGES. Best-effort; failure to import fails loudly.
+cx_build_uccl() {
+  if [ -f /tmp/.cx_built_uccl ]; then
+    cx_log "UCCL EP already prepared this allocation — skip rebuild"
+    python3 -c "import torch; from uccl_deepep import Buffer" 2>/dev/null || return 1
+    return 0
+  fi
+  cx_log "UCCL EP: pip install uccl + cu12 runtime shim"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  pip install -q uccl >&2 2>&1 || { cx_log "ERROR: pip install uccl failed"; return 1; }
+  pip install -q nvidia-cuda-runtime-cu12 >&2 2>&1 || cx_log "WARN: nvidia-cuda-runtime-cu12 warning"
+  local cu12lib
+  cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+  [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"
+  UCCL_COMMIT="pkg-$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo uccl)"
+  export UCCL_COMMIT
+  # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+  # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+  python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+    || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+  # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+  # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+  # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+  # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+  rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg
+  # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+  # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+  # mismatches signatures. Match them.
+  _uccl_tag="v$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))' 2>/dev/null || echo 0.1.1)"
+  if { git clone --depth 1 --branch "$_uccl_tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+       || git clone --depth 1 https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1; } \
+     && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then
+    mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+    cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null
+    export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+    python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2 \
+      || { cx_log "ERROR: uccl_deepep wrapper import failed"; return 1; }
+    export CX_UCCL_WRAPPER=1
+  else
+    cx_log "ERROR: uccl deep_ep_wrapper not available"
+    return 1
+  fi
+  : > /tmp/.cx_built_uccl
+  cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+run_deepep_suite() {
+  cx_prepare_backend deepep || { cx_log "WARN: DeepEP preparation failed"; return 1; }
+  run_ep_suite deepep
+}
+
+run_mori_suite() {
+  cx_prepare_backend mori || { cx_log "WARN: MoRI preparation failed"; return 1; }
+  run_ep_suite mori
+}
+
+run_uccl_suite() {
+  cx_prepare_backend uccl || { cx_log "WARN: UCCL EP preparation failed"; return 1; }
+  run_ep_suite uccl
+}
+run_nccl_ep_suite() {
+  # Portable torch.distributed all-to-all reference; no build step.
+  run_ep_suite nccl-ep
+}
+run_deepep_hybrid_suite() {
+  cx_prepare_backend deepep-hybrid || { cx_log "WARN: Hybrid DeepEP preparation failed"; return 1; }
+  run_ep_suite deepep-hybrid
+}
+
+# Upgrade FlashInfer in-container to the latest wheel — the bundled 0.6.8.post1 lacks the
+# quantized-COMBINE OUTPUT path (moe_a2a_combine output_dtype/output_scales, added in a newer
+# release; confirmed in the main-branch source). A combine-quant run needs it; the dispatch path
+# (bf16/fp8/mxfp8/nvfp4) is unaffected and stays on whatever is installed. Best-effort: a failed
+# upgrade leaves the run on the bundled version (the combine-quant adapter then rejects loudly).
+cx_build_flashinfer_latest() {
+  [ -f /tmp/.cx_built_flashinfer ] && { cx_log "FlashInfer quant-combine build already done this allocation — skip"; return 0; }
+  cx_log "FlashInfer: upgrading to latest wheel for quantized-combine output (moe_a2a_combine output_dtype)"
+  export PIP_BREAK_SYSTEM_PACKAGES=1
+  # moe_a2a_combine output_dtype is on flashinfer MAIN but NOT in the latest PyPI release (0.6.13) —
+  # so `pip -U flashinfer-python` (PyPI) is insufficient. Install from the NIGHTLY wheel index
+  # (built from main): flashinfer-python (--no-deps; the container already has torch etc.) + the
+  # matching cubin + cu130 jit-cache. FLASHINFER_DISABLE_VERSION_CHECK=1 bypasses any residual
+  # sub-package skew. Falls back to a PyPI -U (which then asserts-out cleanly if it lacks output_dtype).
+  export FLASHINFER_DISABLE_VERSION_CHECK=1
+  local before after NIDX="https://flashinfer.ai/whl/nightly"
+  before="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  { pip install -q -U --pre flashinfer-python --index-url "$NIDX/" --no-deps >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-cubin --index-url "$NIDX/" >&2 2>&1 \
+      && pip install -q -U --pre flashinfer-jit-cache --index-url "$NIDX/cu130" >&2 2>&1; } \
+    || { cx_log "WARN: flashinfer nightly index failed — falling back to PyPI -U"; \
+         pip install -q -U flashinfer-python flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true; }
+  # The nightly (main) flashinfer's CuTe-DSL kernels import newer cutlass.cute symbols (e.g.
+  # OperandMajorMode) than the bundled nvidia-cutlass-dsl provides — upgrade it to match (PyPI).
+  pip install -q -U nvidia-cutlass-dsl >&2 2>&1 || cx_log "WARN: nvidia-cutlass-dsl upgrade warning"
+  # The cu130 nightly WHEEL (0.6.13.dev20260612) still predates the combine output_dtype PR — if it's
+  # absent, build flashinfer MAIN from source (the container has the cu130 toolchain that built
+  # deep_ep-v2 + hybrid-ep; cutlass-dsl 4.5.2 is now installed; JIT-first build, time-boxed).
+  if ! python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine))" 2>/dev/null; then
+    cx_log "FlashInfer nightly wheel lacks combine output_dtype — building flashinfer main from source"
+    # Uninstall the precompiled cubin + jit-cache FIRST: they ship the OLD 10-arg moe_a2a_combine
+    # kernel, which the main Python wrapper (14-arg, with output_dtype) then mis-calls ("Expected 10
+    # but got 14 arguments"). Removing them forces get_moe_alltoall_module() to JIT-compile the
+    # kernel FRESH from main's csrc at runtime (14-arg, matching the wrapper).
+    pip uninstall -y flashinfer-cubin flashinfer-jit-cache >&2 2>&1 || true
+    rm -rf /tmp/fi_main ~/.cache/flashinfer 2>/dev/null || true
+    if git clone --recursive --depth 1 https://github.com/flashinfer-ai/flashinfer.git /tmp/fi_main >&2 2>&1; then
+      ( cd /tmp/fi_main && timeout 2400 pip install -q --no-build-isolation . >&2 2>&1 ) \
+        || cx_log "WARN: flashinfer main source build failed/timed out"
+    else
+      cx_log "WARN: flashinfer main clone failed (compute-node network?)"
+    fi
+  fi
+  after="$(python3 -c 'import flashinfer;print(flashinfer.__version__)' 2>/dev/null || echo none)"
+  export FLASHINFER_COMMIT="pkg-$after"
+  cx_capture_flashinfer_identity
+  cx_log "FlashInfer upgrade (nightly): $before -> $after"
+  cx_log "FlashInfer stack: $CX_FLASHINFER_STACK"
+  python3 -c "import inspect, flashinfer.comm as c; assert 'output_dtype' in str(inspect.signature(c.MoeAlltoAll.combine)), 'combine still has no output_dtype'; print('combine output_dtype: present')" >&2 \
+    || { cx_log "ERROR: upgraded FlashInfer combine still lacks output_dtype — cannot quant-combine"; return 1; }
+  : > /tmp/.cx_built_flashinfer   # sentinel: skip rebuild on subsequent cases in this allocation
+}
+
+cx_capture_deepep_identity() {  # Export DEEPEP_COMMIT describing the installed deep_ep package.
+  local version  # package-metadata version, else deep_ep.__version__, else "unknown" when python fails
+  version="$(python3 - <<'PY' 2>/dev/null || echo unknown
+try:
+    import importlib.metadata as metadata
+    print(metadata.version("deep_ep"))
+except Exception:
+    import deep_ep
+    print(getattr(deep_ep, "__version__", "unknown"))
+PY
+)"
+  export DEEPEP_COMMIT="${DEEPEP_COMMIT:-pkg-$version}"  # a non-empty preset value wins; otherwise pkg-<version>
+}
+
+cx_capture_flashinfer_identity() {  # Export FLASHINFER_COMMIT and CX_FLASHINFER_STACK for the installed FlashInfer stack.
+  local version  # package-metadata version, else flashinfer.__version__, else "unknown" when python fails
+  version="$(python3 - <<'PY' 2>/dev/null || echo unknown
+try:
+    import importlib.metadata as metadata
+    print(metadata.version("flashinfer-python"))
+except Exception:
+    import flashinfer
+    print(getattr(flashinfer, "__version__", "unknown"))
+PY
+)"
+  export FLASHINFER_COMMIT="${FLASHINFER_COMMIT:-pkg-$version}"  # a non-empty preset value wins; otherwise pkg-<version>
+  CX_FLASHINFER_STACK="$(python3 - <<'PY' 2>/dev/null || echo capture-failed
+import importlib.metadata as metadata
+
+packages = ("flashinfer-python", "flashinfer-cubin", "flashinfer-jit-cache",
+            "nvidia-cutlass-dsl", "torch")
+def version(name):
+    try:
+        return metadata.version(name)
+    except Exception:
+        return "absent"
+print(" ".join(f"{name}={version(name)}" for name in packages))
+PY
+)"
+  export CX_FLASHINFER_STACK  # space-separated name=version list ("absent" per missing package), or capture-failed
+}
+
+# A rack build-only step and its rank steps are separate shells. Persist every backend-created
+# loader/import path and build identity in the named container, then source it from each rank.
+cx_persist_backend_env() {  # Write the listed backend variables as export lines to /tmp/.cx_backend_env; returns 1 on a write failure.
+  local path=/tmp/.cx_backend_env name
+  local -a names=(LD_LIBRARY_PATH PYTHONPATH NVSHMEM_DIR DEEPEP_COMMIT FLASHINFER_COMMIT
+    CX_FLASHINFER_STACK FLASHINFER_DISABLE_VERSION_CHECK UCCL_COMMIT CX_UCCL_WRAPPER)
+  : > "$path" || return 1  # truncate/create first so entries from an earlier write never survive
+  for name in "${names[@]}"; do
+    if declare -p "$name" >/dev/null 2>&1; then  # skip names that are not declared in this shell
+      printf 'export %s=%q\n' "$name" "${!name}" >> "$path" || return 1  # %q shell-quotes the value so the file can be sourced
+    fi
+  done
+}
+
+# Prepare and probe one backend without running a benchmark. The same hook is used
+# by normal in-container runs and by rack launchers' persistent build-only step.
+cx_prepare_backend() {  # $1 = backend name; a non-zero return means the backend could not be prepared or imported.
+  local backend="${1:-}"
+  [ -f /tmp/.cx_backend_env ] && source /tmp/.cx_backend_env  # reuse the env file written by cx_persist_backend_env, if present
+  case "$backend" in
+    deepep)
+      if [ "${CX_DEEPEP_V2:-0}" = "1" ]; then
+        cx_build_deepep_v2 || return 1
+      fi
+      if ! python3 -c "from deep_ep import Buffer" 2>/dev/null; then  # build only when the import probe fails
+        command -v rebuild-deepep.sh >/dev/null 2>&1 || {
+          cx_log "WARN: DeepEP is unavailable and rebuild-deepep.sh is missing"
+          return 1
+        }
+        cx_log "building normal DeepEP"
+        rebuild-deepep.sh >&2 || return 1
+      fi
+      python3 -c "from deep_ep import Buffer" 2>/dev/null || return 1  # re-probe: the import must succeed by now
+      cx_capture_deepep_identity
+      ;;
+    deepep-hybrid)
+      cx_build_deepep_hybrid || return 1
+      ;;
+    flashinfer)
+      if { [ -n "${CX_COMBINE_DTYPE:-}" ] && [ "${CX_COMBINE_DTYPE}" != "bf16" ]; } \
+          || [ "${CX_FLASHINFER_UPGRADE:-}" = "1" ]; then  # upgrade only for a non-bf16 combine dtype or an explicit request
+        cx_build_flashinfer_latest || return 1
+      fi
+      python3 -c "import flashinfer.comm" 2>/dev/null || return 1
+      cx_capture_flashinfer_identity
+      ;;
+    uccl)
+      cx_build_uccl || return 1
+      ;;
+    mori)  # import probe only; nothing is built here
+      python3 -c "import mori" 2>/dev/null || return 1
+      ;;
+    nccl-ep)  # no preparation or probe is performed for this backend
+      ;;
+    *)
+      cx_log "ERROR: unknown backend preparation request"
+      return 1
+      ;;
+  esac
+}
+
+run_flashinfer_suite() {  # Prepare FlashInfer, then hand off to run_ep_suite; returns 1 when preparation fails.
+  cx_prepare_backend flashinfer || { cx_log "WARN: FlashInfer preparation failed"; return 1; }
+  run_ep_suite flashinfer
+}
+
+# dispatch_bench runs the CURRENT CX_BENCH (+ CX_* config env) once. The sweep workflow runs many
+# of these per allocation (SHARD mode below), reusing this single container + its built backend.
+dispatch_bench() {  # Run the suite selected by $CX_BENCH; returns 0 on success and 1 when that suite fails.
+  local rc=0  # flipped to 1 when the selected suite returns non-zero
+  case "$CX_BENCH" in
+    deepep)      run_deepep_suite || rc=1 ;;
+    mori)        run_mori_suite || rc=1 ;;
+    uccl)        run_uccl_suite || rc=1 ;;
+    nccl-ep)     run_nccl_ep_suite || rc=1 ;;
+    flashinfer)  run_flashinfer_suite || rc=1 ;;
+    deepep-hybrid) run_deepep_hybrid_suite || rc=1 ;;
+    *)           cx_die "unknown CX_BENCH=$CX_BENCH (want deepep|mori|uccl|nccl-ep|flashinfer|deepep-hybrid)" ;;
+  esac
+  return $rc
+}
+
+rc=0
+# Structured v1 shards never run the legacy DeepEP-main diagnostic, even if a self-hosted runner
+# happens to inherit the old environment variable. Direct manual invocations without a shard remain.
+[ -n "${CX_SHARD_FILE:-}" ] && unset CX_DEEPEP_V2
+# Build-only mode: rack launchers run the shared backend preparation hook once per
+# node inside a persistent named container, then direct rank processes reuse it.
+if [ -n "${CX_BUILD_ONLY:-}" ]; then
+  if cx_prepare_backend "${CX_BENCH:-}"; then
+    cx_persist_backend_env || rc=1
+  else
+    rc=1
+  fi
+  cx_log "backend preparation: bench=${CX_BENCH:-unknown} rc=$rc"
+  exit "$rc"
+fi
+if [ -n "${CX_SHARD_FILE:-}" ] && [ -f "${CX_SHARD_FILE:-/nonexistent}" ]; then
+  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
+  # All cases share (sku, backend, nodes) so the backend build (cx_build_*) is paid once and cached
+  # for the rest. Each case overrides its own mode/resource_mode/dtype/contract/routing/phase/eplb/
+  # workload, then reuses the same per-config path (dispatch_bench); the shard key is
+  # (sku,backend,nodes). The shard path reaches Python via argv, never spliced into its source.
+  ncases="$(python3 -c 'import json,sys;print(len(json.load(open(sys.argv[1])).get("cases",[])))' "$CX_SHARD_FILE" 2>/dev/null || echo 0)"
+  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
+  _cx_ts_base="$CX_TS"   # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
+                         # cases sharing backend+phase overwrite each other at the same timestamp).
+  ci=0
+  failed_cases=0
+  while [ "$ci" -lt "$ncases" ]; do
+    CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
+    export CX_TS
+    # Map case[ci] fields -> CX_* env (shell-quoted). The setup job pre-resolved hidden/topk/experts
+    # + the token ladder into each case, so the loop is config-only (no workloads.yaml lookup here).
+    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
+import json, sys, shlex
+c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
+def g(k, d=""):
+    v = c.get(k, d); return "" if v is None else str(v)
+env = {
+  "CX_BENCH": g("backend"), "CX_MODE": g("mode", "normal"),
+  "CX_DISPATCH_DTYPE": g("dtype", "bf16"),
+  "CX_MEASUREMENT_CONTRACT": g("contract", "layout-and-dispatch-v1"),
+  "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
+  "CX_RESOURCE_MODE": g("resource_mode", "normalized"),
+  "CX_ACTIVATION_PROFILE": g("activation_profile", "normal"),
+  "CX_PLACEMENT": g("placement", "packed"), "CX_ROUTING_STEP": g("routing_step", "0"),
+  "CX_UNEVEN_TOKENS": g("uneven_tokens", "none"),
+  "CX_EP": g("ep", "1"),
+  "CX_EPLB": "1" if c.get("eplb") else "",
+  "CX_COMBINE_QUANT_MODE": g("combine_quant_mode", "none"),
+  "CX_CASE_ID": g("case_id"), "CX_SUITE": g("suite"), "CX_WORKLOAD_NAME": g("workload"),
+  "CX_REQUIRED_PUBLICATION": g("required_publication"),
+  "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
+  "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
+}
+lines = [f"export {k}={shlex.quote(v)}" for k, v in env.items()]
+# Per-case timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32 everywhere); a missing
+# timing or an empty field falls back to the harness default, so UNSET it rather than export-empty
+# (an empty CX_ITERS would defeat the 8-iter default and break the run_ep argparse) or keep the
+# previous case value. NOTE no apostrophes here: bash command-substitution scanning chokes on them.
+timing = g("timing")
+if timing:
+    parts = (timing.split(":") + ["", "", ""])[:3]
+    for k, v in zip(("CX_ITERS", "CX_TRIALS", "CX_WARMUP"), parts):
+        lines.append(f"export {k}={shlex.quote(v)}" if v
+                     else f"unset {k} 2>/dev/null || true")
+else:
+    lines.append("unset CX_ITERS CX_TRIALS CX_WARMUP 2>/dev/null || true")
+print("\n".join(lines))
+PY
+)"
+    eval "$_exports"
+    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
+    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
+    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
+    # (FileNotFoundError .cx_workloads/<wid>.manifest.json). Unset so every case re-stages its own.
+    unset CX_WORKLOAD_DIR 2>/dev/null || true
+    cx_log "  [$((ci+1))/$ncases] $CX_BENCH $CX_PHASE $CX_DISPATCH_DTYPE/$CX_MODE/${CX_MEASUREMENT_CONTRACT/-v1/} rt=$CX_ROUTING eplb=${CX_EPLB:-0}"
+    # flashinfer's MoeAlltoAll MNNVL barrier INTERMITTENTLY deadlocks on h100 ('Rank N timed out waiting
+    # for completion flag' -> CUDA unspecified launch failure): ~half of cases, scattered across T/routing,
+    # the SAME config both crashes AND passes (a transient, not config/pidfd). Upgrade to flashinfer 0.6.14
+    # + a between-case shm-drop settle were both TESTED and did NOT fix it (the settle made it worse). Since
+    # it's intermittent, RETRY: each fresh torchrun is another independent attempt. Every attempt gets
+    # a unique identity and filename; a later success must not erase the earlier failure evidence.
+    attempts=1; [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))
+    _cx_case_ts="$CX_TS"
+    a=1
+    while :; do
+      CX_TS="${_cx_case_ts}-a$(printf '%02d' "$a")"
+      export CX_ATTEMPT_ID="$a" CX_TS
+      if dispatch_bench; then
+        break
+      fi
+      # A failed CASE does NOT fail the shard job. The failed-case record + the summary table are
+      # the signal (the doctrine is judge-by-data, and the conclusion should match it): expected
+      # per-case failures — the empty-rank diagnostic on HybridEP/UCCL Hopper, a flashinfer
+      # intermittent that survived its retries — used to flip 200+-correct-point jobs red. The job
+      # now fails only when the harness itself is unhealthy (summarize.py: NO valid results at all).
+      # Known DETERMINISTIC whole-shard walls never even dispatch (capability RUNNER_WALLS/aarch64).
+      [ "$a" -ge "$attempts" ] && { failed_cases=$((failed_cases+1)); cx_log "  [$((ci+1))/$ncases] $CX_BENCH case FAILED after $a attempt(s) — failed-case record preserved; shard continues"; break; }
+      cx_log "  [$((ci+1))/$ncases] $CX_BENCH attempt $a/$attempts failed — retry (intermittent MNNVL barrier)"
+      a=$((a+1))
+    done
+    export CX_TS="$_cx_case_ts"
+    ci=$((ci + 1))
+  done
+  [ "${failed_cases:-0}" -gt 0 ] && cx_log "SHARD done: $failed_cases/$ncases case(s) failed (records preserved — see the summary table + failed_*.json)" || true
+  # RESTORE the base timestamp: the loop re-exported CX_TS per case (…-cNNN), so leaving the LAST
+  # case's ts in place made the final summarize below filter to that ONE case — and when the last
+  # case happened to be a failing diagnostic (empty-rank sorts last), summarize saw "no result
+  # files" and flipped an otherwise-complete shard red (h200 run 28577792572: 39/40 good cases,
+  # conclusion failure). The base ts is a substring of every per-case filename, so summarize then
+  # gates on the WHOLE shard's results, as intended.
+  export CX_TS="$_cx_ts_base"
+else
+  # Single-bench (workflow_dispatch) path gets the SAME flashinfer retry as SHARD mode — the
+  # combine-quant runs (flashinfer-combine-* -> CX_BENCH=flashinfer) come through here and are
+  # subject to the same intermittent h100 MNNVL-barrier deadlock; one attempt dies ~50% of the
+  # time. Non-flashinfer benches run once (their failures are deterministic — retry wastes time).
+  attempts=1; [ "$CX_BENCH" = "flashinfer" ] && attempts=$(( ${CX_FLASHINFER_RETRIES:-3} + 1 ))
+  _cx_single_ts="$CX_TS"
+  a=1
+  while :; do
+    CX_TS="${_cx_single_ts}-a$(printf '%02d' "$a")"
+    export CX_ATTEMPT_ID="$a" CX_TS
+    if dispatch_bench; then
+      break
+    fi
+    [ "$a" -ge "$attempts" ] && { rc=1; break; }
+    cx_log "$CX_BENCH attempt $a/$attempts failed — retry (intermittent MNNVL barrier)"
+    a=$((a+1))
+  done
+fi
+
+# Summary table for the log; also fails the job if no valid results were produced.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1
+exit "$rc"
diff --git a/experimental/CollectiveX/schemas/ep-result-v4.schema.json b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
new file mode 100644
index 0000000000..d37e25bcb2
--- /dev/null
+++ b/experimental/CollectiveX/schemas/ep-result-v4.schema.json
@@ -0,0 +1,219 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/ep-result-v4.schema.json",
+  "title": "CollectiveX EP dispatch/combine result (v4)",
+  "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v4 adds multi-dimensional validity + machine-derived publication_status, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. v3 docs load via compatibility (publication_status absent => treated as legacy/experimental). record_type=failed-case marks an intentionally preserved failure skeleton (judge-by-data doctrine): empty rows + a failure block, exempt from the full-sweep requirements.",
+  "type": "object",
+  "required": ["schema_version", "family", "runner", "backend", "publication_status", "rows"],
+  "if": {"properties": {"record_type": {"const": "failed-case"}}, "required": ["record_type"]},
+  "then": {
+    "required": ["failure"],
+    "properties": {
+      "publication_status": {"const": "failed"},
+      "rows": {"maxItems": 0}
+    }
+  },
+  "else": {
+    "required": ["mode", "phase", "ep_size", "measurement_contract", "shape",
+                 "validity", "workload", "reproduction",
+                 "backend_provenance", "comparison_key"],
+    "properties": {
+      "rows": {"minItems": 1}
+    }
+  },
+  "properties": {
+    "schema_version": {"type": "integer", "minimum": 3},
+    "family": {"const": "moe"},
+    "runner": {"type": "string"},
+    "record_type": {"type": "string", "enum": ["failed-case"]},
+    "failure": {
+      "type": "object",
+      "required": ["failure_mode", "return_code", "case"],
+      "properties": {
+        "failure_mode": {"type": "string"},
+        "return_code": {"type": "integer"},
+        "case": {"type": "object"},
+        "evidence": {"type": "string"}
+      }
+    },
+    "backend": {"type": "string", "enum": ["deepep", "deepep-hybrid", "mori", "aiter", "uccl", "flashinfer", "nccl-ep"]},
+    "mode": {"type": "string", "enum": ["normal", "ll"]},
+    "phase": {"type": "string", "enum": ["decode", "prefill"]},
+    "ep_size": {"type": "integer", "minimum": 1},
+    "world_size": {"type": "integer", "minimum": 1},
+    "nodes": {"type": "integer", "minimum": 1},
+    "topology_class": {"type": "string"},
+    "transport": {"type": "string"},
+    "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]},
+    "measurement_contract": {"type": "string",
+      "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1",
+               "mori-quant-combine-v1"]},
+    "publication_status": {"type": "string",
+      "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]},
+    "validity": {
+      "type": "object",
+      "required": ["execution_status", "semantic_correctness", "workload_identity",
+                   "measurement_conformance", "resource_conformance", "provenance_complete"],
+      "properties": {
+        "execution_status": {"type": "string", "enum": ["complete", "failed"]},
+        "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]},
+        "workload_identity": {"type": "string"},
+        "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]},
+        "resource_conformance": {"type": "string"},
+        "provenance_complete": {"type": "boolean"},
+        "anomaly_free": {"type": "boolean"}
+      }
+    },
+    "workload": {
+      "type": "object",
+      "required": ["source", "trace_signature", "cross_rank_consistent"],
+      "properties": {
+        "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "workload_id": {"type": ["string", "null"]},
+        "manifest_checksums": {"type": ["object", "null"]},
+        "trace_signature": {"type": "string"},
+        "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}},
+        "cross_rank_consistent": {"type": "boolean"},
+        "activation_profile": {"type": "string"},
+        "activation_identity": {"type": ["string", "null"]}
+      }
+    },
+    "shape": {
+      "type": "object",
+      "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"],
+      "properties": {
+        "hidden": {"type": "integer"}, "topk": {"type": "integer"},
+        "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"},
+        "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"]},
+        "routing": {"type": "string"},
+        "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"},
+        "kernel_gen": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "quant": {
+          "type": "object",
+          "properties": {
+            "combine_input_dtype": {"type": "string"},
+            "combine_accum_dtype": {"type": "string"},
+            "combine_output_dtype": {"type": "string"},
+            "combine_quant_mode": {"type": "string"},
+            "scale_layout": {"type": ["string", "null"]}
+          }
+        }
+      }
+    },
+    "reproduction": {
+      "type": "object",
+      "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract"],
+      "properties": {
+        "command": {"type": "string"},
+        "image": {"type": ["string", "null"]},
+        "image_digest": {"type": ["string", "null"]},
+        "image_arch": {"type": ["string", "null"]},
+        "squash_sha256": {"type": ["string", "null"]},
+        "git_run": {"type": ["object", "null"]},
+        "fp8_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dequant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dtype": {"type": "string"}, "combine_quant_mode": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "routing_step": {"type": "integer"}, "uneven_tokens": {"type": "string"},
+        "waive_anomaly": {"type": "boolean"}, "roundtrip_anomaly_threshold": {"type": "number"}
+      }
+    },
+    "backend_provenance": {"type": "object"},
+    "phase_profile": {"type": "object"},
+    "source_allocation": {
+      "type": "object",
+      "properties": {
+        "mode": {"type": "string", "enum": ["none", "linear", "empty-rank"]},
+        "routing_step": {"type": "integer"}
+      }
+    },
+    "placement": {
+      "type": "object",
+      "properties": {
+        "kind": {"type": "string", "enum": ["packed", "striped", "runtime-native", "adversarial"]},
+        "nodes": {"type": "integer"}, "gpus_per_node": {"type": "integer"},
+        "scale_up_domain": {"type": "integer"}, "ranks": {"type": "integer"}
+      }
+    },
+    "eplb": {
+      "type": "object",
+      "properties": {
+        "enabled": {"type": "boolean"},
+        "num_logical_experts": {"type": "integer"}, "num_physical_experts": {"type": "integer"},
+        "imbalance_before": {"type": "number"}, "imbalance_after": {"type": "number"},
+        "mapping_hash": {"type": ["string", "null"]}
+      }
+    },
+    "anomalies": {"type": "array", "items": {"type": "object"}},
+    "anomaly_summary": {
+      "type": "object",
+      "properties": {
+        "count": {"type": "integer"}, "waived": {"type": "boolean"},
+        "types": {"type": "array", "items": {"type": "string"}}
+      }
+    },
+    "rows": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip",
+                     "isolated_sum", "samples_pooled", "byte_contracts", "correct"],
+        "properties": {
+          "tokens_per_rank": {"type": "integer", "minimum": 1},
+          "global_tokens": {"type": "integer", "minimum": 1},
+          "dispatch": {"$ref": "#/definitions/percentiles"},
+          "combine": {"$ref": "#/definitions/percentiles"},
+          "roundtrip": {"$ref": "#/definitions/percentiles"},
+          "isolated_sum": {"type": "object"},
+          "samples_pooled": {"type": "integer", "minimum": 1},
+          "percentile_interpolation": {"type": "string"},
+          "per_rank_dispatch_us": {"type": "object"},
+          "raw_samples": {"type": "object"},
+          "byte_contracts": {
+            "type": "object",
+            "required": ["token_rank_payload_copies", "token_expert_payload_copies",
+                         "dispatch_bytes", "combine_bytes"],
+            "properties": {
+              "token_rank_payload_copies": {"type": "integer"},
+              "token_expert_payload_copies": {"type": "integer"},
+              "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"}
+            }
+          },
+          "roundtrip_tokens_per_second": {"type": ["number", "null"]},
+          "bandwidth": {
+            "type": "object",
+            "properties": {
+              "logical_payload_rate_gbps": {"type": "object"},
+              "backend_buffer_rate_gbps": {"type": "object"},
+              "algorithm_bandwidth_gbps": {"type": ["number", "null"]},
+              "bus_bandwidth_gbps": {"type": ["number", "null"]},
+              "wire_utilization": {"type": ["number", "null"]}
+            }
+          },
+          "fanout_hist": {"type": "array"},
+          "rank_load_hist": {"type": "array"},
+          "expert_load_cv": {"type": "number"}, "rank_load_cv": {"type": "number"},
+          "hotspot_ratio": {"type": "number"},
+          "dest_rank_load_max": {"type": "integer"}, "dest_rank_load_mean": {"type": "number"},
+          "empty_expert_count": {"type": "integer"}, "empty_rank_count": {"type": "integer"},
+          "source_token_stats": {"type": ["object", "null"]},
+          "anomalies": {"type": "array", "items": {"type": "object"}},
+          "correct": {"type": "boolean"}
+        }
+      }
+    }
+  },
+  "definitions": {
+    "percentiles": {
+      "type": "object",
+      "required": ["p50", "p90", "p95", "p99"],
+      "properties": {
+        "p50": {"type": "number"}, "p90": {"type": "number"},
+        "p95": {"type": "number"}, "p99": {"type": "number"}
+      }
+    }
+  }
+}
diff --git a/experimental/CollectiveX/schemas/ep-result-v5.schema.json b/experimental/CollectiveX/schemas/ep-result-v5.schema.json
new file mode 100644
index 0000000000..9c1f76c721
--- /dev/null
+++ b/experimental/CollectiveX/schemas/ep-result-v5.schema.json
@@ -0,0 +1,230 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/ep-result-v5.schema.json",
+  "title": "CollectiveX EP dispatch/combine result (v5)",
+  "description": "One (backend, phase, dtype, mode, contract, routing) sweep. v5 adds the required fixed-512-v1 sampling contract (8 timed iterations x 64 trials with 32 warmups on every SKU/backend) to v4's multi-dimensional validity, measured roundtrip, dual byte contracts, per-rank diagnostics, raw-sample histograms, and workload identity. record_type=failed-case marks an intentionally preserved failure skeleton (judge-by-data doctrine): empty rows + a failure block, exempt from the full-sweep requirements.",
+  "type": "object",
+  "required": ["schema_version", "family", "runner", "backend", "publication_status", "rows"],
+  "if": {"properties": {"record_type": {"const": "failed-case"}}, "required": ["record_type"]},
+  "then": {
+    "required": ["failure"],
+    "properties": {
+      "publication_status": {"const": "failed"},
+      "rows": {"maxItems": 0}
+    }
+  },
+  "else": {
+    "required": ["mode", "phase", "ep_size", "measurement_contract", "shape",
+                 "validity", "workload", "reproduction",
+                 "backend_provenance", "comparison_key"],
+    "properties": {
+      "rows": {"minItems": 1}
+    }
+  },
+  "properties": {
+    "schema_version": {"const": 5},
+    "family": {"const": "moe"},
+    "runner": {"type": "string"},
+    "case_id": {"type": ["string", "null"], "pattern": "^cxv1-[0-9a-f]{20}$"},
+    "record_type": {"type": "string", "enum": ["failed-case"]},
+    "failure": {
+      "type": "object",
+      "required": ["failure_mode", "return_code", "case"],
+      "properties": {
+        "failure_mode": {"type": "string"},
+        "return_code": {"type": "integer"},
+        "case": {"type": "object"},
+        "evidence": {"type": "string"}
+      }
+    },
+    "backend": {"type": "string", "enum": ["deepep", "deepep-hybrid", "mori", "aiter", "uccl", "flashinfer", "nccl-ep"]},
+    "mode": {"type": "string", "enum": ["normal", "ll"]},
+    "phase": {"type": "string", "enum": ["decode", "prefill"]},
+    "ep_size": {"type": "integer", "minimum": 1},
+    "world_size": {"type": "integer", "minimum": 1},
+    "nodes": {"type": "integer", "minimum": 1},
+    "topology_class": {"type": "string"},
+    "transport": {"type": "string"},
+    "resource_mode": {"type": "string", "enum": ["normalized", "tuned", "default"]},
+    "measurement_contract": {"type": "string",
+      "enum": ["layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1",
+               "mori-quant-combine-v1"]},
+    "publication_status": {"type": "string",
+      "enum": ["official", "comparable-experimental", "diagnostic", "invalid", "failed"]},
+    "validity": {
+      "type": "object",
+      "required": ["execution_status", "semantic_correctness", "workload_identity",
+                   "measurement_conformance", "sampling_conformance", "resource_conformance",
+                   "provenance_complete"],
+      "properties": {
+        "execution_status": {"type": "string", "enum": ["complete", "failed"]},
+        "semantic_correctness": {"type": "string", "enum": ["pass", "fail"]},
+        "workload_identity": {"type": "string"},
+        "workload_source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "measurement_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]},
+        "sampling_conformance": {"type": "string", "enum": ["conformant", "nonconformant"]},
+        "resource_conformance": {"type": "string"},
+        "provenance_complete": {"type": "boolean"},
+        "anomaly_free": {"type": "boolean"}
+      }
+    },
+    "workload": {
+      "type": "object",
+      "required": ["source", "trace_signature", "cross_rank_consistent"],
+      "properties": {
+        "source": {"type": "string", "enum": ["canonical-serialized", "seeded-runtime"]},
+        "workload_id": {"type": ["string", "null"]},
+        "manifest_checksums": {"type": ["object", "null"]},
+        "trace_signature": {"type": "string"},
+        "distinct_per_T_hashes": {"type": "array", "items": {"type": "string"}},
+        "cross_rank_consistent": {"type": "boolean"},
+        "activation_profile": {"type": "string"},
+        "activation_identity": {"type": ["string", "null"]}
+      }
+    },
+    "shape": {
+      "type": "object",
+      "required": ["hidden", "topk", "experts", "experts_per_rank", "dispatch_dtype", "routing"],
+      "properties": {
+        "hidden": {"type": "integer"}, "topk": {"type": "integer"},
+        "experts": {"type": "integer"}, "experts_per_rank": {"type": "integer"},
+        "dispatch_dtype": {"type": "string", "enum": ["bf16", "fp8", "fp8-pertoken", "fp8-directcast", "mxfp8", "mxfp4", "nvfp4"]},
+        "routing": {"type": "string"},
+        "eplb": {"type": "boolean"}, "num_logical_experts": {"type": "integer"},
+        "kernel_gen": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "quant": {
+          "type": "object",
+          "properties": {
+            "combine_input_dtype": {"type": "string"},
+            "combine_accum_dtype": {"type": "string"},
+            "combine_output_dtype": {"type": "string"},
+            "combine_quant_mode": {"type": "string"},
+            "scale_layout": {"type": ["string", "null"]}
+          }
+        }
+      }
+    },
+    "reproduction": {
+      "type": "object",
+      "required": ["command", "seed", "warmup", "iters", "trials", "measurement_contract",
+                   "sampling_contract", "samples_per_point", "warmup_semantics"],
+      "properties": {
+        "command": {"type": "string"},
+        "image": {"type": ["string", "null"]},
+        "image_digest": {"type": ["string", "null"]},
+        "image_arch": {"type": ["string", "null"]},
+        "squash_sha256": {"type": ["string", "null"]},
+        "git_run": {"type": ["object", "null"]},
+        "warmup": {"const": 32},
+        "iters": {"const": 8},
+        "trials": {"const": 64},
+        "warmup_semantics": {"const": "full-roundtrip-per-trial-point-v1"},
+        "fp8_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_quant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dequant_in_timing": {"type": ["boolean", "null"]},
+        "combine_dtype": {"type": "string"}, "combine_quant_mode": {"type": "string"},
+        "activation_profile": {"type": "string"},
+        "routing_step": {"type": "integer"}, "uneven_tokens": {"type": "string"},
+        "sampling_contract": {"const": "fixed-512-v1"},
+        "samples_per_point": {"const": 512},
+        "waive_anomaly": {"type": "boolean"}, "roundtrip_anomaly_threshold": {"type": "number"}
+      }
+    },
+    "backend_provenance": {"type": "object"},
+    "phase_profile": {"type": "object"},
+    "source_allocation": {
+      "type": "object",
+      "properties": {
+        "mode": {"type": "string", "enum": ["none", "linear", "empty-rank"]},
+        "routing_step": {"type": "integer"}
+      }
+    },
+    "placement": {
+      "type": "object",
+      "properties": {
+        "kind": {"type": "string", "enum": ["packed", "striped", "runtime-native", "adversarial"]},
+        "nodes": {"type": "integer"}, "gpus_per_node": {"type": "integer"},
+        "scale_up_domain": {"type": "integer"}, "ranks": {"type": "integer"}
+      }
+    },
+    "eplb": {
+      "type": "object",
+      "properties": {
+        "enabled": {"type": "boolean"},
+        "num_logical_experts": {"type": "integer"}, "num_physical_experts": {"type": "integer"},
+        "imbalance_before": {"type": "number"}, "imbalance_after": {"type": "number"},
+        "mapping_hash": {"type": ["string", "null"]}
+      }
+    },
+    "anomalies": {"type": "array", "items": {"type": "object"}},
+    "anomaly_summary": {
+      "type": "object",
+      "properties": {
+        "count": {"type": "integer"}, "waived": {"type": "boolean"},
+        "types": {"type": "array", "items": {"type": "string"}}
+      }
+    },
+    "rows": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["tokens_per_rank", "global_tokens", "dispatch", "combine", "roundtrip",
+                     "isolated_sum", "samples_pooled", "byte_contracts", "correct"],
+        "properties": {
+          "tokens_per_rank": {"type": "integer", "minimum": 1},
+          "global_tokens": {"type": "integer", "minimum": 1},
+          "dispatch": {"$ref": "#/definitions/percentiles"},
+          "combine": {"$ref": "#/definitions/percentiles"},
+          "roundtrip": {"$ref": "#/definitions/percentiles"},
+          "isolated_sum": {"type": "object"},
+          "samples_pooled": {"const": 512},
+          "trials": {"const": 64},
+          "percentile_interpolation": {"type": "string"},
+          "per_rank_dispatch_us": {"type": "object"},
+          "raw_samples": {"type": "object"},
+          "byte_contracts": {
+            "type": "object",
+            "required": ["token_rank_payload_copies", "token_expert_payload_copies",
+                         "dispatch_bytes", "combine_bytes"],
+            "properties": {
+              "token_rank_payload_copies": {"type": "integer"},
+              "token_expert_payload_copies": {"type": "integer"},
+              "dispatch_bytes": {"type": "integer"}, "combine_bytes": {"type": "integer"}
+            }
+          },
+          "roundtrip_tokens_per_second": {"type": ["number", "null"]},
+          "bandwidth": {
+            "type": "object",
+            "properties": {
+              "logical_payload_rate_gbps": {"type": "object"},
+              "backend_buffer_rate_gbps": {"type": "object"},
+              "algorithm_bandwidth_gbps": {"type": ["number", "null"]},
+              "bus_bandwidth_gbps": {"type": ["number", "null"]},
+              "wire_utilization": {"type": ["number", "null"]}
+            }
+          },
+          "fanout_hist": {"type": "array"},
+          "rank_load_hist": {"type": "array"},
+          "expert_load_cv": {"type": "number"}, "rank_load_cv": {"type": "number"},
+          "hotspot_ratio": {"type": "number"},
+          "dest_rank_load_max": {"type": "integer"}, "dest_rank_load_mean": {"type": "number"},
+          "empty_expert_count": {"type": "integer"}, "empty_rank_count": {"type": "integer"},
+          "source_token_stats": {"type": ["object", "null"]},
+          "anomalies": {"type": "array", "items": {"type": "object"}},
+          "correct": {"type": "boolean"}
+        }
+      }
+    }
+  },
+  "definitions": {
+    "percentiles": {
+      "type": "object",
+      "required": ["p50", "p90", "p95", "p99"],
+      "properties": {
+        "p50": {"type": "number"}, "p90": {"type": "number"},
+        "p95": {"type": "number"}, "p99": {"type": "number"}
+      }
+    }
+  }
+}
diff --git a/experimental/CollectiveX/schemas/workload-v1.schema.json b/experimental/CollectiveX/schemas/workload-v1.schema.json
new file mode 100644
index 0000000000..1416e52cd5
--- /dev/null
+++ b/experimental/CollectiveX/schemas/workload-v1.schema.json
@@ -0,0 +1,51 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://semianalysis/collectivex/schemas/workload-v1.schema.json",
+  "title": "CollectiveX canonical MoE routing workload manifest",
+  "description": "Manifest for a serialized routing trace (tests/workload.py). The <workload_id>.npz holds topk_idx/topk_weights; this manifest carries the identity, dimensions, routing profile, and SHA-256 checksums that gate cross-hardware comparison.",
+  "type": "object",
+  "additionalProperties": false,
+  "required": ["schema_version", "workload_id", "generator_version", "gate_weight_format",
+               "dims", "routing_profile", "seed", "checksums"],
+  "properties": {
+    "schema_version": {"const": 1},
+    "workload_id": {"type": "string", "pattern": "^[0-9a-f]{16}$",
+      "description": "Immutable id = sha256(generator|routing|hidden|topk|experts|gt|seed)[:16]."},
+    "generator_version": {"type": "string",
+      "description": "Routing generator identity; bump when numerics change so stale files can't masquerade."},
+    "gate_weight_format": {"type": "string"},
+    "dims": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["hidden", "topk", "experts", "global_tokens", "experts_per_rank"],
+      "properties": {
+        "hidden": {"type": "integer", "minimum": 1},
+        "topk": {"type": "integer", "minimum": 1},
+        "experts": {"type": "integer", "minimum": 1},
+        "global_tokens": {"type": "integer", "minimum": 1},
+        "experts_per_rank": {"type": "integer", "minimum": 1}
+      }
+    },
+    "routing_profile": {"type": "string",
+      "description": "Current producers emit uniform, balanced, balanced-rank-local, zipf, or hotspot-single. The three named Zipf-strength aliases remain only to read historical manifests.",
+      "enum": ["uniform", "balanced", "balanced-rank-local", "zipf",
+               "zipf-mild", "zipf-moderate", "zipf-heavy", "hotspot-single"]},
+    "seed": {"type": "integer"},
+    "checksums": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["topk_idx", "topk_weights", "trace"],
+      "properties": {
+        "topk_idx": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "topk_weights": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
+        "trace": {"type": "string", "pattern": "^[0-9a-f]{64}$"}
+      }
+    },
+    "routing_stats": {"type": "object",
+      "description": "Realized fan-out / load / locality stats (advisory; not identity-defining)."},
+    "activation_profile": {"type": "string",
+      "description": "Value distribution of expert inputs (e.g. 'normal'); reserved for the value-sensitivity rig."},
+    "activation_identity": {"type": ["string", "null"],
+      "description": "Deterministic descriptor hash of the activation distribution (profile|seed|dims). Becomes a byte-hash once activations are serialized (model-trace)."}
+  }
+}
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 0000000000..7c6d16acdf
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""Summarize CollectiveX EP results for logs or a GitHub job summary.
+
+Plain-text mode is also the shard health gate: it fails when no complete EP result
+was produced. Markdown mode is reporting-only and always exits successfully.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+
+def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]:
+    """Load EP result / failed-case docs (``family == "moe"``) from ``results_dir``; ``env_*`` files,
+    names not matching ``runner``/``ts``, and unreadable or unparsable files are skipped."""
+    docs = []
+    # glob.escape: ``[``, ``*`` or ``?`` in the directory path must match literally.
+    for path in sorted(glob.glob(os.path.join(glob.escape(results_dir), "*.json"))):
+        base = os.path.basename(path)
+        mismatch = (runner and not base.startswith(f"{runner}_")) or (ts and ts not in base)
+        if base.startswith("env_") or mismatch:
+            continue
+        # One garbled artifact must not abort the whole summary, so bad files are skipped.
+        try:
+            with open(path, encoding="utf-8") as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            continue
+        if isinstance(doc, dict) and doc.get("family") == "moe":
+            docs.append(doc)
+    return docs
+
+
+def _fnum(value, fmt: str) -> str:  # anything that is not an int/float renders as "-"
+    return "-" if not isinstance(value, (int, float)) else format(value, fmt)
+
+
+def _doc_status(doc: dict) -> str:  # first truthy of publication_status, status; else "unknown"
+    return str(next(filter(None, (doc.get("publication_status"), doc.get("status"))), "unknown"))
+
+
+def _execution_valid(doc: dict) -> bool:  # status is "valid" and the doc is not a failed-case record
+    return doc.get("status") == "valid" and doc.get("record_type") != "failed-case"
+
+
+def _completed(docs: list[dict]) -> list[dict]:  # non-failed-case docs by (backend, phase, ep_size)
+    def _key(doc: dict) -> tuple:  # normalized so null / non-numeric fields cannot raise in sorted()
+        ep, text = doc.get("ep_size"), lambda name: str(doc.get(name) or "")
+        return (text("backend"), text("phase"), ep if isinstance(ep, (int, float)) else 0)
+    return sorted((doc for doc in docs if doc.get("record_type") != "failed-case"), key=_key)
+
+
+def _failed(docs: list[dict]) -> list[dict]:  # failed-case docs by (backend, phase, attempt_id)
+    def _key(doc: dict) -> tuple:  # normalized so null / mixed int-str fields cannot raise in sorted()
+        att, text = doc.get("attempt_id"), lambda name: str(doc.get(name) or "")
+        return (text("backend"), text("phase"), isinstance(att, str), att if isinstance(att, (int, float, str)) else 0)
+    return sorted((doc for doc in docs if doc.get("record_type") == "failed-case"), key=_key)
+
+
+def _shape_label(doc: dict) -> str:
+    """Render ``doc["shape"]`` as ``H<hidden> top<topk> E<experts> <dtype> <routing>``; ``?`` if absent."""
+    shape = doc.get("shape") or {}
+    hidden, topk, experts, dtype, routing = (
+        shape.get(name, "?") for name in ("hidden", "topk", "experts", "dispatch_dtype", "routing")
+    )
+    return f"H{hidden} top{topk} E{experts} {dtype} {routing}"
+
+
+def _sweep_table(doc: dict) -> list[str]:
+    """Markdown lines for one doc's per-token-point sweep table; empty list when it has no rows."""
+    rows = doc.get("rows") or []
+    if not rows:
+        return []
+    title = (f"\n**`{doc.get('backend')}` · {doc.get('phase')} · ep{doc.get('ep_size')} · "
+             f"{_shape_label(doc)}**\n")
+    out = [
+        title,
+        "| tokens/rank | fan-out | dispatch p50 us | combine p50 us | roundtrip p50 us | tokens/s | recv max | correct |",
+        "|--:|--:|--:|--:|--:|--:|--:|:--:|",
+    ]
+    numeric = (("fanout_mean", ".2f"), ("dispatch_us_p50", ".2f"), ("combine_us_p50", ".2f"),
+               ("roundtrip_us_p50", ".2f"), ("roundtrip_tokens_per_second", ".3e"))
+    for row in rows:
+        cells = [str(row.get("tokens_per_rank"))]
+        cells += [_fnum(row.get(key), fmt) for key, fmt in numeric]
+        cells += [str(row.get("recv_tokens_max", "-")), "yes" if row.get("correct") else "no"]
+        out.append("| " + " | ".join(cells) + " |")
+    return out
+
+
+def render_plain(docs: list[dict]) -> str:
+    """Plain-text report: fixed-width table of completed docs, then one line per failed attempt."""
+    layout = "  {:<16}{:<9}{:>3} {:<24}{:>5}{:>19}  {}"
+    lines = ["CollectiveX EP results", "======================"]
+    completed, failures = _completed(docs), _failed(docs)
+    if completed:
+        lines.append(layout.format(
+            "backend", "phase", "ep", "publication", "T*", "roundtrip p99 us", "correct",
+        ))
+    for doc in completed:
+        metrics = doc.get("metrics") or {}
+        lines.append(layout.format(
+            str(doc.get("backend", "")),
+            str(doc.get("phase", "")),
+            str(doc.get("ep_size", "")),
+            _doc_status(doc),
+            str(metrics.get("headline_tokens_per_rank", "")),
+            _fnum(metrics.get("roundtrip_us_p99"), ".1f"),
+            (doc.get("correctness") or {}).get("passed"),
+        ))
+    if failures:
+        lines.append("\nFailed EP attempts:")
+    for doc in failures:
+        failure = doc.get("failure") or {}
+        origin = f"{doc.get('backend', '?')}/{doc.get('phase', '?')}"
+        attempt = f"case={doc.get('case_id') or 'manual'} attempt={doc.get('attempt_id', '1')}"
+        cause = f"mode={failure.get('failure_mode', 'unknown')} rc={failure.get('return_code', '?')}"
+        lines.append(f"  {origin} {attempt} {cause}")
+    return "\n".join(lines)
+
+
+def render_markdown(docs: list[dict]) -> str:
+    """GitHub-summary markdown: an overview table of completed docs, one sweep table per completed
+    doc, a failed-attempts table, and a note when ``docs`` is empty."""
+    def row(*cells) -> str:
+        return "| " + " | ".join(str(cell) for cell in cells) + " |"
+
+    completed, failures = _completed(docs), _failed(docs)
+    lines = ["## CollectiveX EP results"]
+    if completed:
+        lines.append("")
+        lines.append("| backend | phase | mode | dtype | resource | ep | routing | publication | T* | roundtrip p50 us | roundtrip p99 us | correct |")
+        lines.append("|---|---|---|---|---|--:|---|---|--:|--:|--:|:--:|")
+    # Overview: one row per completed doc.
+    for doc in completed:
+        metrics = doc.get("metrics") or {}
+        shape = doc.get("shape") or {}
+        passed = (doc.get("correctness") or {}).get("passed")
+        lines.append(row(
+            f"`{doc.get('backend', '')}`", doc.get("phase", ""), doc.get("mode", ""),
+            shape.get("dispatch_dtype", "-"), doc.get("resource_mode", ""), doc.get("ep_size", ""),
+            shape.get("routing", "-"), _doc_status(doc), metrics.get("headline_tokens_per_rank", "-"),
+            _fnum(metrics.get("roundtrip_us_p50"), ".1f"), _fnum(metrics.get("roundtrip_us_p99"), ".1f"),
+            "yes" if passed else "no",
+        ))
+    for doc in completed:
+        lines.extend(_sweep_table(doc))
+    # Failed-case records are listed in their own table.
+    if failures:
+        lines.append("\n### Failed attempts\n")
+        lines.append("| backend | phase | case | attempt | failure | rc |")
+        lines.append("|---|---|---|--:|---|--:|")
+    for doc in failures:
+        failure = doc.get("failure") or {}
+        lines.append(row(
+            f"`{doc.get('backend', '')}`", doc.get("phase", ""),
+            f"`{doc.get('case_id') or 'manual'}`", doc.get("attempt_id", "1"),
+            failure.get("failure_mode", "unknown"), failure.get("return_code", "-"),
+        ))
+    if not docs:
+        lines.append("\n> No EP result files found.")
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """CLI entry point. Markdown mode always returns 0; plain mode returns 1 when no doc passes
+    ``_execution_valid`` (the shard health gate), else 0."""
+    cli = argparse.ArgumentParser(description="CollectiveX EP result summary")
+    cli.add_argument("--results-dir", default="results")
+    cli.add_argument("--runner")
+    cli.add_argument("--ts")
+    cli.add_argument("--markdown", action="store_true",
+                     help="emit reporting-only GitHub summary markdown")
+    opts = cli.parse_args()
+
+    docs = load_results(opts.results_dir, opts.runner, opts.ts)
+    if opts.markdown:
+        print(render_markdown(docs))
+        return 0
+    print(render_plain(docs))
+    if any(_execution_valid(doc) for doc in docs):
+        return 0
+    print("ERROR: no complete, valid EP result was produced.")
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py
new file mode 100644
index 0000000000..ac00f7f8c0
--- /dev/null
+++ b/experimental/CollectiveX/sweep_matrix.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""CollectiveX — sweep matrix resolver (the `setup` job of collectivex-sweep.yml).
+
+Resolves the requested suites into the GHA matrix of shards. A shard is one allocation that sweeps
+many cases sharing (sku, backend, node count). Large shards are chunked. Each case is enriched with
+model dims (hidden/topk/experts from workloads.yaml) + token ladder + canonical flag, so the in-
+container shard loop (run_in_container.sh SHARD mode) needs no further config lookup.
+
+Knobs: --backends sweeps every EP library in one matrix; --backend remaps the DeepEP matrix onto a
+single other library (capability-filtered). Emits a JSON matrix for ``fromJSON`` in the workflow.
+
+  python3 sweep_matrix.py --suites all --out matrix.json
+  python3 sweep_matrix.py --suites all --backend uccl --max-cases 12 --out matrix.json
+"""
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import sys
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+sys.path.insert(0, os.path.join(HERE, "tests"))
+import yaml  # noqa: E402
+import generate_matrix as gm  # noqa: E402
+import capability as cap  # noqa: E402
+import ep_harness  # noqa: E402
+
+EP_TIMING_PROFILE = (f"{ep_harness.TIMED_ITERS_PER_TRIAL}:"
+                     f"{ep_harness.TRIALS_PER_POINT}:"
+                     f"{ep_harness.WARMUP_ITERS_PER_TRIAL}")
+
+
+def _dims(wl_cfg, name):
+    """Return (hidden, topk, experts) for workload ``name``; (None, None, None) when undefined."""
+    sections = (wl_cfg.get(sec) or {} for sec in ("synthetic", "model_derived"))
+    spec = next((found for found in (sec.get(name) for sec in sections) if found), None)
+    experts = spec.get("experts", spec.get("routed_experts")) if spec else None
+    return (spec.get("hidden"), spec.get("topk"), experts) if spec else (None, None, None)
+
+
+def _union_ladder(a, b):
+    """Merge two space-separated token ladders into one ascending, de-duplicated ladder. '' stands
+    for the harness phase-default FULL ladder, so merging anything with '' yields ''."""
+    if "" in (a, b):
+        return ""
+    return " ".join(str(point) for point in sorted(set(map(int, a.split() + b.split()))))
+
+
+def _ladder(suite_cfg, phase):
+    """Pick the suite's token ladder for ``phase`` as a space-separated string: the non-empty
+    ``token_points_<phase>`` list if set, else ``token_points``, else '' (no explicit ladder)."""
+    candidates = ["token_points"]
+    if phase in ("decode", "prefill"):
+        candidates.insert(0, f"token_points_{phase}")
+    points = next((suite_cfg[key] for key in candidates if suite_cfg.get(key)), None)
+    return " ".join(str(point) for point in points) if points else ""
+
+
+def _resolved_ladder(ladder, phase, backend, routing, platform):
+    """Apply mori-only limits after expansion; every other backend gets ``ladder`` back unchanged.
+    None means drop the case: an excluded mi355x prefill routing, or no point <= 512 remains."""
+    if backend != "mori":
+        return ladder
+    restricted = platform == "mi355x" and phase == "prefill"
+    if restricted and routing not in {"uniform", "balanced", "balanced-rank-local"}:
+        return None
+    fallback = ep_harness.DECODE_LADDER if phase == "decode" else ep_harness.PREFILL_LADDER
+    kept = [pt for pt in (map(int, ladder.split()) if ladder else list(fallback)) if pt <= 512]
+    return " ".join(str(pt) for pt in kept) if kept else None
+
+
+def _case_id(sku, case):
+    """Derive the ``cxv1-`` case id: first 20 hex chars of SHA-256 over compact, key-sorted JSON."""
+    canonical = json.dumps({"sku": sku, **case}, separators=(",", ":"), sort_keys=True).encode()
+    return "cxv1-" + hashlib.sha256(canonical).hexdigest()[:20]
+
+
+def main() -> int:
+    """Resolve the requested suites/backends into the shard matrix; print it, optionally write --out."""
+    ap = argparse.ArgumentParser(description="CollectiveX sweep matrix resolver")
+    ap.add_argument("--suites", default="all", help="'all' or comma-list of suite names")
+    backend_names = ",".join(cap.SWEEP_BACKENDS)
+    ap.add_argument("--backend", default="",
+                    help=f"select exactly one EP backend ({backend_names})")
+    ap.add_argument("--backends", default="",
+                    help=f"combined matrix: 'all' or a comma-list ({backend_names}); "
+                         "capability-filtered and overrides --backend")
+    ap.add_argument("--only-sku", default="", help="restrict to one workflow sku value")
+    ap.add_argument("--min-nodes", type=int, default=0,
+                    help="keep only shards whose tray count (nodes, blank=1) is >= this; "
+                         "e.g. 2 = rack-scale EP8 only (skip the single-tray EP4 cells)")
+    ap.add_argument("--max-nodes", type=int, default=0,
+                    help="keep only shards whose tray count (nodes, blank=1) is <= this; "
+                         "e.g. 1 = single-tray EP4 only (skip the rack-scale EP8 cells)")
+    ap.add_argument("--max-cases", type=int, default=128, help="chunk shards larger than this into sub-cells (128 = effectively no chunking for current suites; each shard's cases run consecutively in ONE allocation, amortizing runner/enroot/build startup)")
+    ap.add_argument("--out", default="")
+    a = ap.parse_args()
+    if a.max_cases < 1:  # 0 breaks range() below; a negative step would silently emit no cases
+        ap.error("--max-cases must be >= 1")
+
+    with open(os.path.join(HERE, "configs", "workloads.yaml"), encoding="utf-8") as fh:
+        wl_cfg = yaml.safe_load(fh)
+    with open(os.path.join(HERE, "configs", "suites.yaml"), encoding="utf-8") as fh:
+        suites_cfg = yaml.safe_load(fh)["suites"]
+    suite_names = list(suites_cfg) if a.suites == "all" else [s.strip() for s in a.suites.split(",") if s.strip()]
+    unknown_suites = sorted(set(suite_names) - set(suites_cfg))
+    if unknown_suites:
+        raise SystemExit(f"unknown --suites values {unknown_suites}; have {list(suites_cfg)}")
+
+    # --backends "all"|comma-list emits every requested implementation in one matrix.
+    all_backends = list(cap.SWEEP_BACKENDS)
+    if a.backends:
+        names = all_backends if a.backends == "all" else [x.strip() for x in a.backends.split(",") if x.strip()]
+        unknown = sorted(set(names) - set(all_backends))
+        if unknown:
+            raise SystemExit(f"unknown --backends values {unknown}; have {all_backends}")
+        targets = names
+    else:
+        target = a.backend or "deepep"
+        if target not in all_backends:
+            raise SystemExit(f"unknown --backend value {target!r}; have {all_backends}")
+        targets = [target]
+
+    # collect enriched cases, deduped on the full case signature (which includes the suite)
+    seen = {}
+    shards: dict = {}
+    for sname in suite_names:
+        scfg = suites_cfg[sname]
+        for c in gm.generate(sname)["cases"]:
+            if int(c["samples_per_point"]) != ep_harness.TIMED_SAMPLES_PER_POINT:
+                raise SystemExit(f"case from {sname} violates fixed-512-v1: {c['samples_per_point']}")
+            if c.get("timing") != EP_TIMING_PROFILE:
+                raise SystemExit(f"case from {sname} has timing={c.get('timing')!r}; "
+                                 f"fixed-512-v1 requires {EP_TIMING_PROFILE}")
+            if c.get("warmup_semantics") != ep_harness.WARMUP_SEMANTICS:
+                raise SystemExit(f"case from {sname} has warmup_semantics={c.get('warmup_semantics')!r}; "
+                                 f"expected {ep_harness.WARMUP_SEMANTICS!r}")
+            plat = c["platform"]
+            beng0 = c["backend"]
+            if beng0 not in ("deepep", "mori"):
+                continue
+            sku = plat
+            if a.only_sku and sku != a.only_sku:
+                continue
+            phase, rmode = c["phase"], c["resource_mode"]
+            lad = _ladder(scfg, phase)
+            h, t, e = _dims(wl_cfg, c["workload"])
+            # Derive physical topology from the public platform contract. Keep nodes explicit in
+            # every matrix cell even though manual launchers default a blank value to one node.
+            gpus_per_node = int(cap.PLATFORMS[plat]["gpus_per_node"])
+            scale_up_domain = int(cap.PLATFORMS[plat]["scale_up_domain"])
+            nodes = str(max(1, (int(c.get("ep") or gpus_per_node) + gpus_per_node - 1)
+                            // gpus_per_node))
+            # The base registry uses DeepEP to enumerate NVIDIA shapes and MoRI for AMD shapes.
+            # Apply the requested backend filter here; the portable NCCL/RCCL reference spans both.
+            if beng0 == "mori":
+                case_targets = [name for name in targets if name in ("mori", "nccl-ep")]
+            else:
+                case_targets = [name for name in targets if name != "mori"]
+            for beng in case_targets:
+                ok, _r = cap.resolve(
+                    plat, beng, mode=c["mode"], dtype=c["dtype"], contract=c["contract"],
+                    combine_quant_mode=c.get("combine_quant_mode", "none"), routing=c["routing"],
+                    eplb=bool(c.get("eplb")), activation_profile=c.get("activation_profile", "normal"))
+                if not ok:
+                    continue
+                lad_i = _resolved_ladder(lad, phase, beng, c["routing"], plat)
+                if lad_i is None:
+                    continue
+                case = {
+                    "suite": c["suite"], "workload": c["workload"],
+                    "required_publication": c.get("required_publication"),
+                    "backend": beng, "mode": c["mode"],
+                    "dtype": c["dtype"], "contract": c["contract"], "routing": c["routing"],
+                    "phase": phase, "ep": int(c["ep"]), "eplb": bool(c.get("eplb")),
+                    "combine_quant_mode": c.get("combine_quant_mode", "none"), "resource_mode": rmode,
+                    "activation_profile": c.get("activation_profile", "normal"),
+                    "placement": c.get("placement", "packed"),
+                    "routing_step": str(c.get("routing_step", 0)), "uneven_tokens": c.get("uneven_tokens", "none"),
+                    "hidden": "" if h in (None, 7168) else str(h),
+                    "topk": "" if t in (None, 8) else str(t),
+                    "experts": "" if e in (None, 256) else str(e),
+                    "samples_per_point": int(c["samples_per_point"]),
+                    "warmup_semantics": c["warmup_semantics"], "ladder": lad_i,
+                    "timing": c["timing"], "canonical": bool(c.get("canonical")), "nodes": nodes,
+                    "gpus_per_node": gpus_per_node, "scale_up_domain": scale_up_domain,
+                }
+                case["case_id"] = _case_id(sku, case)
+                sig = (
+                    sku, case["suite"], case["workload"], beng, c["mode"], c["dtype"], c["contract"],
+                    c["routing"], phase, case["ep"], case["eplb"], case["combine_quant_mode"], rmode,
+                    case["activation_profile"], case["placement"], case["routing_step"],
+                    case["uneven_tokens"], case["hidden"], case["topk"], case["experts"],
+                    case["samples_per_point"], case["warmup_semantics"], nodes, gpus_per_node, scale_up_domain, c["timing"],
+                )
+                if sig in seen:  # merge ladders, then re-derive case_id because it hashes the ladder
+                    kept = seen[sig]
+                    kept["ladder"] = _union_ladder(kept["ladder"], lad_i)
+                    kept["case_id"] = _case_id(sku, {k: v for k, v in kept.items() if k != "case_id"})
+                    continue
+                seen[sig] = case
+                # One allocation/build per (SKU, backend, tray count).
+                shards.setdefault((sku, beng, nodes), []).append(case)
+
+    # Per-backend chunk size. Fast backends run a whole build group
+    # in ONE allocation (max_cases, ~no chunking). flashinfer is SLOW (~3.2 min/case, heavy per-case MNNVL
+    # workspace setup) and intermittently hits `CUDA error: unspecified launch failure` under rapid
+    # back-to-back cases — so chunk it small: bounded, PARALLEL jobs, fewer successive setups per
+    # allocation. UCCL is not chunked because its current promoted shard fits comfortably.
+    SLOW_MAX_CASES = {"flashinfer": 12}   # 12 (not 16): flashinfer cases retry up to 3x for the intermittent
+                                          # MNNVL-barrier deadlock, so smaller chunks keep a chunk within --time.
+    include = []
+    for (sku, beng, nodes), cases in sorted(shards.items()):
+        if a.min_nodes and max(1, int(nodes or 1)) < a.min_nodes:
+            continue   # --min-nodes: skip single-tray (EP4) shards, keep only rack-scale (EP8+)
+        if a.max_nodes and max(1, int(nodes or 1)) > a.max_nodes:
+            continue   # --max-nodes: skip rack-scale (EP8+) shards, keep only single-tray (EP4)
+        mc = min(a.max_cases, SLOW_MAX_CASES.get(beng, a.max_cases))
+        for ci in range(0, len(cases), mc):
+            chunk = cases[ci:ci + mc]
+            part = ci // mc
+            sid = f"{sku}-{beng}" + (f"-n{nodes}" if nodes else "") + (f"-p{part}" if len(cases) > mc else "")
+            include.append({
+                "id": sid, "sku": sku, "backend": beng,
+                "launcher": cap.PLATFORMS[sku]["launcher"],
+                "gpus_per_node": cap.PLATFORMS[sku]["gpus_per_node"],
+                "scale_up_domain": cap.PLATFORMS[sku]["scale_up_domain"],
+                "nodes": nodes, "n": len(chunk), "cases": chunk,
+            })
+
+    n_cases = sum(x["n"] for x in include)
+    matrix = {"include": include}
+    if a.out:
+        with open(a.out, "w", encoding="utf-8") as fh:
+            json.dump(matrix, fh)
+    print(f"resolved {len(include)} shard-cells, {n_cases} cases "
+          f"(suites={len(suite_names)} backends={a.backends or a.backend or 'deepep'})",
+          file=sys.stderr)
+    # stdout = the matrix JSON (for `$(...)` capture in the workflow)
+    print(json.dumps(matrix))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 0000000000..14b4ac041b
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal mode.
+
+The harness owns the deterministic shared routing trace, the comm-only timing, and
+the doc; this file owns only DeepEP's API calls and its correctness reference.
+`make_problem` materializes the harness-provided rank slice (no RNG here), so every
+SKU runs the identical routed workload.
+
+Correctness (per DeepEP's intranode test): a pure dispatch->combine round trip with no
+expert compute reconstructs x only after dividing by the number of ranks each token was
+sent to, so the harness expects combined ≈ x * is_token_in_rank.sum(dim=1).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    from deep_ep import Buffer  # type: ignore
+    import deep_ep  # for version/provenance
+except Exception as exc:  # pragma: no cover - needs the built DeepEP
+    print("ERROR: deep_ep import failed — DeepEP must be present/built at job setup. "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _deepep_version() -> str:
+    try:
+        import importlib.metadata as _md
+        return _md.version("deep_ep")
+    except Exception:
+        return getattr(deep_ep, "__version__", "unknown")
+
+
+# DeepEP's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token
+# block-128 scale (deep_ep 1.2.1 ships NO helper for this — utils is empty — so we
+# implement the exact convention its kernels expect: scales [T, H//128] float32, e4m3,
+# 448 = e4m3 max). By default both casts run OUTSIDE the timed window (cast in make_problem,
+# dequant in stage); under runtime-visible-v1 both run INSIDE the timed dispatch instead.
+_FP8_MAX = 448.0
+_FP8_BLOCK = 128
+
+
+def _per_token_cast_to_fp8(x):
+    # PER-BLOCK-128 scale layout (DeepEP default): one scale per 128-elem block per token.
+    # x: [T, H] (H % 128 == 0) -> (x_fp8 [T,H] e4m3fn, scales [T, H//128] f32)
+    T, H = x.shape
+    xv = x.float().view(T, H // _FP8_BLOCK, _FP8_BLOCK)
+    amax = xv.abs().amax(dim=2).clamp(min=1e-4)               # [T, H//128]
+    x_fp8 = (xv * (_FP8_MAX / amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(T, H)
+    return x_fp8, (amax / _FP8_MAX).contiguous()
+
+
+def _per_token_cast_to_fp8_pertoken(x):
+    # PER-TOKEN scale layout: ONE amax per token (over all H), broadcast across the H//128 blocks.
+    # Coarser than block-128 (slightly higher quant error) but the same scale transport cost.
+    T, H = x.shape
+    amax = x.float().abs().amax(dim=1, keepdim=True).clamp(min=1e-4)       # [T, 1]
+    x_fp8 = (x.float() * (_FP8_MAX / amax)).to(torch.float8_e4m3fn)
+    scales = (amax / _FP8_MAX).expand(T, H // _FP8_BLOCK).contiguous()     # broadcast per-token
+    return x_fp8, scales
+
+
+def _directcast_to_fp8(x):
+    # DIRECT-CAST: clamp to the e4m3 range and cast with NO learned scale (unit scale). Carries no
+    # scale metadata (zero scale-transport overhead) but truncates activations above e4m3 max — the
+    # recipe MoRI PR311 replaced for accuracy. scales=ones so _per_block_dequant is the plain cast-back.
+    T, H = x.shape
+    x_fp8 = x.float().clamp(-_FP8_MAX, _FP8_MAX).to(torch.float8_e4m3fn)
+    scales = torch.ones((T, H // _FP8_BLOCK), dtype=torch.float32, device=x.device)
+    return x_fp8, scales
+
+
+# dispatch_dtype value -> (scale_layout label, cast fn). All feed DeepEP's same (fp8, scales) kernel
+# input; they differ only in the quant recipe, so they are distinct OPERATING POINTS, not dtypes.
+_FP8_RECIPES = {
+    "fp8": ("per-block-128", _per_token_cast_to_fp8),
+    "fp8-pertoken": ("per-token", _per_token_cast_to_fp8_pertoken),
+    "fp8-directcast": ("direct-cast", _directcast_to_fp8),
+}
+
+
+def _per_block_dequant(x_fp8, scales):
+    # inverse of the above: [R,H] e4m3 + [R, H//128] f32 -> [R,H] bf16
+    R, H = x_fp8.shape
+    xv = x_fp8.float().view(R, H // _FP8_BLOCK, _FP8_BLOCK)
+    return (xv * scales.unsqueeze(2)).view(R, H).to(torch.bfloat16)
+
+
+def _per_block_dequant_3d(x_fp8, scales):
+    # LL recv layout: [E, S, H] e4m3 + [E, S, H//128] f32 -> [E, S, H] bf16
+    E, S, H = x_fp8.shape
+    xv = x_fp8.float().view(E, S, H // _FP8_BLOCK, _FP8_BLOCK)
+    return (xv * scales.unsqueeze(-1)).view(E, S, H).to(torch.bfloat16)
+
+
+def _mnnvl_buffer_kwargs() -> dict:
+    """Cross-tray (NVL72/MNNVL) Buffer kwargs.
+
+    DeepEP V2's `Buffer` added `allow_mnnvl` (default False); when it is False DeepEP itself sets
+    `NVSHMEM_DISABLE_MNNVL=1` and the legacy buffer falls onto the intranode-only CUDA-IPC peer path,
+    which faults across NVL72 trays (cudaErrorIllegalAddress at csrc/legacy/buffer.hpp). On a real
+    multi-tray MNNVL allocation (the rack launcher exports CX_ALLOW_MNNVL=1) request allow_mnnvl=True
+    so the NVLink buffer spans trays over the fabric API. The bundled V1 `Buffer` predates the param
+    (its NVL buffer already spans MNNVL trays), so only pass it when the installed Buffer accepts it —
+    keeping x86 single-node and bundled-V1 rack paths byte-for-byte unchanged.
+    """
+    if os.environ.get("CX_ALLOW_MNNVL") != "1":
+        return {}
+    try:
+        import inspect
+        if "allow_mnnvl" in inspect.signature(Buffer.__init__).parameters:
+            return {"allow_mnnvl": True}
+    except (ValueError, TypeError):
+        pass
+    return {}
+
+
+class DeepEPBackend:
+    name = "deepep"
+    combine_needs_redispatch = False  # DeepEP combine reuses the handle (its own bench does too)
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    #   normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink.
+    #   ll mode: low_latency_dispatch/combine — verified RUNNING intranode over NVLink via
+    #   allow_nvlink_for_low_latency_mode (IBGDA not required intranode) on 8xH100.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast"}
+    SUPPORTED_MODES = {"normal", "ll"}
+    # Three contracts (review #3 + goal P1 runtime-visible):
+    #   layout-and-dispatch-v1     — times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
+    #                                OUTSIDE (preprocessing mirrors a producer handing quantized x).
+    #   cached-layout-comm-only-v1 — layout hoisted out (untimed); dispatch = pure comm (DeepEP's
+    #                                own benchmark boundary). normal mode only.
+    #   runtime-visible-v1         — the serving-realistic boundary: dispatch INCLUDES the fp8
+    #                                quant (cast) + layout + comm + the recv-dequant that makes
+    #                                expert input consumable; combine starts from bf16 expert
+    #                                outputs. (normal mode; LL already times all of this in-kernel.)
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.ll = (args.mode == "ll")
+        self.contract = args.measurement_contract
+        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
+        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
+        # runtime-visible-v1: the fp8 cast + recv-dequant move INSIDE the timed dispatch (normal
+        # mode). LL already times cast+layout+comm in its single kernel, so it's runtime-visible
+        # by construction — the flag only changes normal mode's boundary.
+        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
+        # element at ~1/16 (3 mantissa bits); bf16 round-trip is ~5e-3. Tolerance is
+        # recorded in the artifact so the looser fp8 gate is explicit, not hidden.
+        self.fp8 = args.dispatch_dtype.startswith("fp8")
+        # fp8 scale-layout recipe (per-block-128 default / per-token / direct-cast) — all use the
+        # same DeepEP fp8 kernel; only the cast differs. Recorded so they're distinct operating points.
+        self.fp8_recipe, self._fp8_cast = _FP8_RECIPES.get(
+            args.dispatch_dtype, ("per-block-128", _per_token_cast_to_fp8))
+        self.scale_layout = self.fp8_recipe if self.fp8 else None
+        # direct-cast truncates above e4m3 (no scale) -> a touch looser gate than scaled recipes.
+        self.tolerance = ((1.5e-1 if self.fp8_recipe == "direct-cast" else 1.25e-1)
+                          if self.fp8 else 5e-2)
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_version()
+        if self.ll:
+            self._init_ll(args, dev_sms, ver)
+        else:
+            self._init_normal(args, rank, dev_sms, ver)
+
+    def _init_normal(self, args, rank, dev_sms, ver):
+        """Build the normal-mode Buffer, apply the SM budget, and record provenance (`rank` is unused)."""
+        # fp8 cast: UNTIMED (make_problem) under layout-and-dispatch / cached-layout; TIMED (inside
+        # dispatch) under runtime-visible-v1. So fp8_in_timing tracks the contract honestly.
+        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)
+        self.combine_needs_redispatch = False  # normal combine reuses the handle
+        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points
+        # (review: a phase-dependent 2/4 GiB made the shared T=128 point differ between
+        # the decode and prefill sweeps). 4 GiB holds T up to 4096 (validated).
+        num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(4 * 1024 * 1024 * 1024)))
+        mnnvl_kw = _mnnvl_buffer_kwargs()
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0, **mnnvl_kw)
+        rm, tuned_src = args.resource_mode, None
+        if rm == "normalized":
+            # ~same device fraction as MoRI, snapped to an EVEN count >= 2 (Buffer.set_num_sms asserts even).
+            num_sms = max(2, 2 * round(args.sm_fraction * dev_sms / 2))
+        elif rm == "tuned":
+            # Best-available for the installed DeepEP: its OWN default SM count (Buffer.num_sms — the
+            # library's analytic choice; it deliberately uses fewer SMs). get_dispatch_config(num_ranks)
+            # returns the recommended Config but hides num_sms from Python; the default already reflects it.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+            tuned_src = "deepep-default-num_sms"
+        else:  # default — the bring-up budget
+            num_sms = args.num_sms
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            raise RuntimeError(f"DeepEP did not apply requested num_sms={num_sms}: {exc!r}") from exc
+        applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+        if applied_num_sms != num_sms:
+            raise RuntimeError(
+                f"DeepEP num_sms mismatch: requested={num_sms} applied={applied_num_sms}")
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "normal", "resource_mode": rm, "requested_num_sms": num_sms,
+            "num_sms": applied_num_sms, "device_sms": dev_sms,
+            "sm_fraction": (applied_num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
+            "num_nvl_bytes": num_nvl_bytes, "allow_mnnvl": bool(mnnvl_kw),
+            "fp8_recipe": self.fp8_recipe if self.fp8 else "n/a",
+            "scale_layout": self.scale_layout,
+        }
+
+    def _init_ll(self, args, dev_sms, ver):
+        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink
+        # via allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch
+        # so for fp8 the quantization IS inside the timed window (recorded honestly). The
+        # buffer is sized for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical),
+        # so LL is a decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop).
+        # set_num_sms does NOT apply (the LL kernel picks its own occupancy) — recorded n/a.
+        self.fp8_in_timing = (True if self.fp8 else None)
+        self.combine_needs_redispatch = True   # re-dispatch (untimed) before each timed combine
+        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
+        self.experts = args.experts
+        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+            self.num_max, args.hidden, self.world_size, args.experts)
+        # one QP per local expert is the DeepEP convention for LL
+        self.num_qps = max(1, args.experts // self.world_size)
+        mnnvl_kw = _mnnvl_buffer_kwargs()
+        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
+                             num_qps_per_rank=self.num_qps,
+                             allow_nvlink_for_low_latency_mode=True, **mnnvl_kw)
+        self.backend_provenance = {
+            "deepep_version": ver,
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{ver}",
+            "mode": "ll", "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
+            "num_max_dispatch_tokens_per_rank": self.num_max,
+            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
+            "low_latency_mode": True, "use_fp8": self.fp8, "allow_mnnvl": bool(mnnvl_kw),
+        }
+
+    def buffer_cap(self, args):
+        # LL is sized for a fixed num_max; cap the sweep there (reported, not silent).
+        return self.num_max if self.ll else None
+
+    def make_problem(self, T, idx, weights, x):
+        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared trace slice.
+        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                  topk_weights=weights.to(torch.float32), layout=None)
+        if self.fp8 and not self.ll and not self.runtime_visible:
+            # layout-and-dispatch / cached-layout: cast with the selected fp8 recipe (self._fp8_cast),
+            # UNTIMED (preprocessing, mirrors a real producer handing over already-quantized activations).
+            # runtime-visible does NOT pre-cast (the cast is timed inside dispatch); LL casts in-kernel.
+            p.x_fp8, p.x_scales = self._fp8_cast(x)
+        if self.cache_layout:
+            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed)
+            # so the timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None
+            # and dispatch computes it inside the timed window.)
+            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+            p.layout = (ntr, ntpe, itir)
+        return p
+
+    def dispatch(self, p):
+        if self.ll:
+            return self._dispatch_ll(p)
+        if p.layout is not None:                       # cached-layout-comm-only-v1
+            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
+        else:                                          # layout-and-dispatch / runtime-visible (timed layout)
+            (num_tokens_per_rank, _, num_tokens_per_expert,
+             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        ref_fp8 = ref_scales = None
+        if self.fp8:
+            if self.runtime_visible:
+                # runtime-visible: the selected fp8 recipe cast (self._fp8_cast) is INSIDE the timed dispatch.
+                x_fp8, x_scales = self._fp8_cast(p.x)
+                ref_fp8, ref_scales = x_fp8, x_scales      # for the correctness reference
+            else:
+                x_fp8, x_scales = p.x_fp8, p.x_scales      # pre-cast (untimed)
+            x_in = (x_fp8, x_scales)
+        else:
+            x_in = p.x
+        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)
+        out = types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
+        if self.fp8 and self.runtime_visible:
+            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
+            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
+            # then no-ops for this contract.
+            recv_fp8, recv_scales = recv_x
+            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+            out.rv_staged = True
+        return out
+
+    def _dispatch_ll(self, p):
+        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the
+        # cast IS inside this timed op — fp8_in_timing=True). recv is the expert-major
+        # 3D layout [num_local_experts, num_max*world, hidden] (+scales when fp8).
+        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
+            p.x, p.topk_idx, self.num_max, self.experts,
+            use_fp8=self.fp8, return_recv_hook=False)
+        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)
+
+    def stage(self, p, h):
+        # comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv
+        # to bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in
+        # both precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)
+        if getattr(h, "rv_staged", False):
+            return None   # runtime-visible already produced bf16 combine_input inside dispatch (timed)
+        if self.ll:
+            if self.fp8:
+                recv_fp8, recv_scales = h.recv_x
+                h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales)
+            else:
+                h.combine_input = h.recv_x
+        elif self.fp8:
+            recv_fp8, recv_scales = h.recv_x
+            h.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+        else:
+            h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        if self.ll:
+            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
+            combined_x, _event, _hook = self.buffer.low_latency_combine(
+                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
+            return combined_x
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
+                                               topk_weights=h.recv_topk_weights)
+        return combined_x
+
+    def expected(self, p, h):
+        if self.ll:
+            # LL combine reduces each token's topk expert copies weighted by topk_weights;
+            # with no expert compute each copy is (the kernel's fp8 cast of) x, so
+            # combined ≈ x * sum(topk_weights). fp8 quant error is covered by self.tolerance.
+            wsum = p.topk_weights.sum(dim=1, keepdim=True)
+            return p.x.float() * wsum, p.T
+        # normal: round trip with no expert compute reconstructs x*(#destination ranks);
+        # for fp8 compare against the dequantized cast that was actually sent.
+        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        ref = p.x.float()
+        if self.fp8:
+            # runtime-visible cast lives on the handle (no pre-cast on p); else use the pre-cast.
+            x_fp8 = getattr(h, "ref_fp8", None)
+            x_scales = getattr(h, "ref_scales", None)
+            if x_fp8 is None:
+                x_fp8, x_scales = p.x_fp8, p.x_scales
+            ref = _per_block_dequant(x_fp8, x_scales).float()
+        return ref * ranks_per_token, p.T
+
+    def recv_tokens(self, h):
+        if self.ll:
+            return int(h.recv_count.sum().item())  # token-copies received across local experts
+        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
+        return int(rx.shape[0])
+
+    def finalize(self, rc):
+        try:  # best-effort teardown: sync ranks, drop the process group, pass rc through unchanged
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:  # never mask rc, but leave a trace instead of swallowing silently
+            print(f"[deepep] finalize: process-group teardown failed: {exc!r}", file=sys.stderr)
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_hybrid.py b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
new file mode 100644
index 0000000000..ab39be1a86
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP `hybrid-ep` branch (NVIDIA TMA-based HybridEPBuffer).
+
+The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA +
+warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer`
+(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72
+rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter drives the single-
+NVLink-domain path (`num_of_hybrid_ep_ranks_per_nvlink_domain == world_size`, <=8 ranks). That domain
+is ONE node on x86 — but on a GB200/GB300 NVL72 the MNNVL fabric makes multiple trays a single NVLink
+domain, so the SAME path spans trays: gb300 EP8 (8 ranks / 2 trays) is validated `transport=mnnvl`,
+decode 8/8 + prefill 6/6 (run 28480519588). The container build is done by runtime/run_in_container.sh
+`cx_build_deepep_hybrid` (CUDA-13 cccl include + libnvshmem symlink fixes; pip-installed so it persists
+across the EP8 multi-srun's separate srun steps).
+
+API (pinned on B300, branch e0a5b1d):
+  HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...)
+  .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle)
+  .combine(hidden, handle=) -> [T, hidden]
+
+CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is
+reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8
+round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses
+the SAME "ranks" factor as ep_flashinfer (per-rank-sum combine, no gate re-weight). bf16 tol 5e-2.
+
+STATUS: bf16 / normal / layout-and-dispatch-v1. Single-NVLink-domain path (<=8 ranks) validated on x86
+single-node AND across GB300 NVL72 trays at EP8 via MNNVL (one NVLink domain, run 28480519588). fp8 and
+the cross-RACK (>1 NVL72, IBGDA/RDMA) path are further lift.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import deep_ep
+    HybridEPBuffer = deep_ep.HybridEPBuffer
+except Exception as exc:  # pragma: no cover - needs the hybrid-ep build
+    print("ERROR: deep_ep.HybridEPBuffer import failed — the hybrid-ep branch must be built at job "
+          "setup (cx_build_deepep_hybrid). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _deepep_hybrid_version() -> str:  # DEEPEP_COMMIT when set AND non-empty, else the package version
+    return os.environ.get("DEEPEP_COMMIT") or getattr(deep_ep, "__version__", "hybrid-ep")
+
+
+class DeepEPHybridBackend:
+    name = "deepep-hybrid"
+    # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed
+    # before a timed combine); the harness times dispatch and combine separately (like ep_deepep).
+    combine_needs_redispatch = False
+    # Capabilities — run_ep.py REJECTS anything outside these before construction.
+    SUPPORTED_PRECISIONS = {"bf16"}        # fp8 = use_fp8 path, further lift
+    SUPPORTED_MODES = {"normal"}
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+    SUPPORTED_COMBINE_DTYPES = {"bf16"}
+    SUPPORTED_COMBINE_QUANT_MODES = {"none"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.contract = args.measurement_contract
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        self.tolerance = 5e-2
+        self.fp8_in_timing = None
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        self.hidden = int(args.hidden)
+        self.local_experts = max(1, self.num_experts // world_size)
+        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
+        self.max_tokens = int(os.environ.get("CX_HYBRIDEP_MAX_TOKENS", "4096"))
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_hybrid_version()
+
+        # Construct the HybridEPBuffer treating all ranks as ONE NVLink domain (default
+        # num_of_hybrid_ep_ranks_per_nvlink_domain == world_size). On x86 that domain is one node; on a
+        # GB200/GB300 NVL72 the MNNVL fabric makes 2 trays one NVLink domain, so EP8 (8 ranks) is covered
+        # by this same path (validated transport=mnnvl). SM counts default.
+        try:
+            self.buffer = HybridEPBuffer(
+                self.group, hidden_dim=self.hidden,
+                max_num_of_tokens_per_rank=self.max_tokens,
+                num_local_experts=self.local_experts, use_fp8=False)
+        except Exception as exc:
+            raise RuntimeError(
+                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
+                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
+        if rank == 0:
+            print(f"[deepep-hybrid] HybridEPBuffer constructed (single NVLink domain, world={world_size}, "
+                  f"local_experts={self.local_experts}, hidden={self.hidden})", file=sys.stderr)
+
+        self.backend_provenance = {
+            "deepep_commit": ver, "branch": "hybrid-ep",
+            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
+            "mode": "normal", "transport": "nvlink-domain",  # one node (x86) or one NVL72 MNNVL domain (gb300 EP8)
+            "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
+            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts, "local_experts": self.local_experts,
+            "routing_factor": "ranks",
+        }
+
+    def buffer_cap(self, args):
+        return self.max_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        # HybridEPBuffer.dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) ->
+        #   (recv_hidden [n_recv, H], recv_x2, None, handle).
+        out = self.buffer.dispatch(p.x, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+                                   num_of_experts=self.num_experts)
+        recv = out[0] if isinstance(out, (tuple, list)) else out
+        handle = None
+        if isinstance(out, (tuple, list)):
+            for o in out:
+                if isinstance(o, tuple):
+                    handle = o
+        return types.SimpleNamespace(recv=recv, recv_payload=recv, handle=handle, combine_input=None)
+
+    def stage(self, p, h):
+        # Identity expert: the recv hidden IS the "expert output". combine reduces it per source token.
+        h.combine_input = h.recv_payload
+        return None
+
+    def combine(self, p, h):
+        # combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks").
+        comb = self.buffer.combine(h.combine_input, handle=h.handle)
+        return comb[0] if isinstance(comb, (tuple, list)) else comb
+
+    def expected(self, p, h):
+        # Round trip, identity expert, per-RANK-sum combine (no gate weights): each source token is
+        # x * (distinct ranks among its top_k experts) — same as ep_flashinfer's "ranks" factor.
+        ref = p.x.float()
+        epr = max(1, self.num_experts // self.world_size)
+        ranks = (p.topk_idx.long() // epr).clamp_(0, self.world_size - 1)        # [T, topk]
+        present = torch.zeros(ranks.shape[0], self.world_size, device=ranks.device, dtype=torch.float32)
+        present.scatter_(1, ranks, 1.0)
+        factor = present.sum(dim=1, keepdim=True)                                # [T, 1] distinct ranks
+        return ref * factor, p.T
+
+    def recv_tokens(self, h):
+        rp = h.recv_payload
+        if torch.is_tensor(rp) and rp.dim() >= 1:
+            return int(rp.shape[0])
+        return 0
+
+    def finalize(self, rc):
+        try:  # best-effort teardown: sync ranks, drop the process group, pass rc through unchanged
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as exc:  # never mask rc, but leave a trace instead of swallowing silently
+            print(f"[deepep-hybrid] finalize: process-group teardown failed: {exc!r}", file=sys.stderr)
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_flashinfer.py b/experimental/CollectiveX/tests/ep_flashinfer.py
new file mode 100644
index 0000000000..5d81b29a73
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_flashinfer.py
@@ -0,0 +1,815 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — FlashInfer EP (NVIDIA), normal mode.
+
+This file owns ONLY FlashInfer's MoE-AllToAll API calls + its correctness reference;
+the harness (ep_harness.py) owns the deterministic shared routing trace, the comm-only
+timing, the correctness gate, and the provenance-tagged doc. The adapter protocol
+(make_problem / dispatch / stage / combine / expected / buffer_cap / recv_tokens /
+finalize + backend_provenance + SUPPORTED_*) mirrors ep_deepep.py exactly.
+
+WHAT FLASHINFER PROVIDES (flashinfer 0.6.8.post1, NVIDIA container):
+  * `flashinfer.comm.MoeAlltoAll(mapping, max_num_tokens, top_k, num_experts)` — a class
+    holding an MNNVL symmetric workspace, with
+      .dispatch(token_selected_experts, input_payloads: list[Tensor],
+                runtime_max_tokens_per_rank, ...)  -> recv payload(s)
+      .combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False) -> combined
+  * module-level `flashinfer.comm.trtllm_moe_alltoall` and the lower-level
+    `moe_a2a_dispatch` / `moe_a2a_combine` / `moe_a2a_initialize` /
+    `get_workspace_size_per_rank` — the TensorRT-LLM one-sided path. Env CX_FLASHINFER_TRTLLM=1
+    only tags provenance (trtllm=True; MoeAlltoAll is constructed either way); covers the goal's
+    "TensorRT-LLM NVLink one-sided AllToAll EP".
+
+The exact kwarg names for dispatch/combine and the Mapping constructor differ across
+FlashInfer point releases. This adapter has NO GPU to validate against, so EVERY
+FlashInfer API call is wrapped to fail LOUD + SPECIFIC (the call site, the kwargs
+tried, and the underlying error) so the parent's GHA smoke shows precisely what to fix
+rather than a bare TypeError. See `_call_variants` and `_build_mapping`.
+
+CORRECTNESS (`expected`): FlashInfer's MoeAlltoAll is expert-centric (TensorRT-LLM MoE
+A2A): `dispatch` sends each token to its top_k selected experts; `combine` gathers the
+per-expert results back and reduces the top_k copies for each SOURCE token. With an
+identity expert (the harness does NO expert compute) and a combine that does NOT apply
+the gate weights (the public `combine(payload, ...)` takes no topk_weights — gate
+weighting is the MoE epilogue, not the comm), the round trip yields:
+      combined ≈ x * top_k          (sum of top_k identical copies of x)
+This is structurally DeepEP-LL-like (per-expert reduce) but WITHOUT LL's weight multiply.
+The alternative (combine applies softmax gate weights, like DeepEP LL) would give
+`x * sum(topk_weights)`. Both are documented; the parent's GHA validates which one FlashInfer
+implements and flips ONE constant (_ROUTING_FACTOR — its shipped default is "ranks", not "topk").
+Tolerance: bf16 5e-2 (bf16 payloads end-to-end); quantized dispatch uses the looser _QUANT_TOL.
+
+STATUS: normal / layout-and-dispatch-v1. Dispatch precisions: bf16; fp8/fp8-pertoken/
+fp8-directcast (e4m3, DeepEP convention); mxfp8/mxfp4/nvfp4 (OCP-microscaling via
+FlashInfer's native quantizers — the A2A moves [q, scale_factor] as a payload LIST, dequant
+in stage()). Combine is bf16 on 0.6.8.post1 (no output_dtype); fp8/nvfp4 combine needs a newer wheel.
+The MoeAlltoAll workspace bootstraps inside the single torch.distributed NCCL group of
+same-user ranks (MNNVL symmetric memory) — the launcher/image owns CAP_SYS_PTRACE / FABRIC
+plumbing; the H200 runner denies the ptrace capability required by MNNVL fd sharing.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import flashinfer  # for version/provenance
+    import flashinfer.comm as fi_comm  # MoeAlltoAll / trtllm_moe_alltoall / moe_a2a_* live here
+except Exception as exc:  # pragma: no cover - needs the FlashInfer wheel on the container
+    print("ERROR: flashinfer import failed — FlashInfer must be present on the container at job "
+          "setup (cx_build_flashinfer: `pip install flashinfer-python`). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _flashinfer_version() -> str:
+    """Installed FlashInfer version: distribution metadata first, module attribute as the last resort."""
+    for dist_name in ("flashinfer-python", "flashinfer"):
+        try:
+            import importlib.metadata as _md
+            return _md.version(dist_name)
+        except Exception:
+            continue
+    # Neither distribution name resolved: fall back to flashinfer.__version__, else "unknown".
+    return getattr(flashinfer, "__version__", "unknown")
+
+
+# --- The round-trip routing factor (see module docstring), read once at import from the env var
+# below. "topk" = sum of top_k identical copies (combine does NOT weight); "weight-sum" = combine
+# applies the gate weights, so the reference becomes x * sum(topk_weights); "ranks" is the default
+# (presumably x * distinct destination ranks — confirm). The ONE knob the parent edits after GHA. ---
+_ROUTING_FACTOR = os.environ.get("CX_FLASHINFER_ROUTING_FACTOR", "ranks")  # "ranks" | "topk" | "weight-sum"
+
+
+def _loud(where: str, attempted, exc: Exception) -> RuntimeError:
+    """Return (not raise) a RuntimeError naming the failed FlashInfer call site, what was attempted,
+    the underlying error and the installed version, plus a hint on how to inspect the real API."""
+    ver = _flashinfer_version()
+    return RuntimeError(f"FlashInfer EP adapter: {where} failed against flashinfer {ver}. "
+                        f"Attempted: {attempted}. Underlying error: {exc!r}. "
+                        "FIX: inspect the installed flashinfer.comm signatures "
+                        "(python3 -c 'import flashinfer.comm as c; help(c.MoeAlltoAll)') and adjust the "
+                        "kwarg names / Mapping construction in tests/ep_flashinfer.py.")
+
+
+def _call_variants(where: str, fn, variants):
+    """Try a sequence of (args, kwargs) plausible signatures for one FlashInfer call.
+    Returns (result, chosen_index). Raises a LOUD error listing EVERY attempt if all fail.
+    Used so a renamed kwarg surfaces as a precise, actionable message in GHA — not a
+    silent fallback (the harness contract forbids faking) and not a bare TypeError."""
+    errors = []
+    for i, (args, kwargs) in enumerate(variants):
+        try:
+            return fn(*args, **kwargs), i
+        except TypeError as exc:        # wrong kwarg name / arity — try the next signature
+            errors.append(f"  variant[{i}] args={_shape_repr(args)} kwargs={list(kwargs)} -> {exc!r}")
+        # any non-TypeError (e.g. a real CUDA/runtime error) is NOT a signature problem — re-raise
+        # immediately, wrapped and explicitly chained (`from exc`) so the traceback names the cause.
+        except Exception as exc:
+            raise _loud(where, _shape_repr(args) + f" kwargs={list(kwargs)}", exc) from exc
+    raise _loud(where, "all signature variants exhausted:\n" + "\n".join(errors),
+                TypeError("no matching signature"))
+
+
+def _shape_repr(args):
+    """Compact repr of positional args: tensors as Tensor(shape):dtype, lists/tuples bracketed, else repr()."""
+    def _one(item):
+        return f"Tensor{tuple(item.shape)}:{item.dtype}" if torch.is_tensor(item) else repr(item)
+
+    def _render(arg):
+        if isinstance(arg, (list, tuple)):
+            return "[" + ",".join(_one(elem) for elem in arg) + "]"
+        return _one(arg)
+
+    return "(" + ", ".join(_render(arg) for arg in args) + ")"
+
+
+def _build_mapping(world_size, rank, gpus_per_node):
+    """Construct the FlashInfer Mapping for PURE EP. FlashInfer's Mapping REQUIRES
+    world_size == tp_size*pp_size*cp_size, and realizes MoE-EP as a VIEW over the TP dimension
+    (moe_ep_size ranks taken from the tp ranks). So pure EP across all ranks =
+    tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1 (pp=cp=1). The kwarg set varies
+    across releases, so try the plausible constructors defensively. Returns (mapping, index of the
+    variant that worked; the caller logs it at rank 0). Raises a LOUD error if none construct."""
+    Mapping = getattr(fi_comm, "Mapping", None) or getattr(flashinfer, "Mapping", None)
+    if Mapping is None:
+        raise _loud("Mapping lookup",
+                    "flashinfer.comm.Mapping / flashinfer.Mapping not found",
+                    AttributeError("Mapping"))
+    # tp_size=world_size so the world_size==tp*pp*cp invariant holds; moe_ep_size=world_size = full EP.
+    variants = [
+        ((), dict(world_size=world_size, rank=rank, gpus_per_node=gpus_per_node,
+                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
+        ((), dict(world_size=world_size, rank=rank, gpus_per_node=gpus_per_node,
+                  tp_size=world_size, moe_ep_size=world_size)),
+        ((), dict(world_size=world_size, rank=rank, gpus_per_node=gpus_per_node,
+                  tp_size=world_size)),
+    ]
+    # Omitting gpus_per_node is only topology-equivalent for a one-node world. Rack runs must fail
+    # on an older Mapping API instead of silently describing all ranks as one physical node.
+    if gpus_per_node == world_size:
+        variants += [
+            ((), dict(world_size=world_size, rank=rank,
+                  tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1)),
+            ((), dict(world_size=world_size, rank=rank, tp_size=world_size,
+                      moe_ep_size=world_size)),
+            ((), dict(world_size=world_size, rank=rank, moe_ep_size=world_size, moe_tp_size=1,
+                  tp_size=world_size)),  # NOTE(review): same kwargs as this group's first variant — looks redundant
+            ((), dict(world_size=world_size, rank=rank, tp_size=world_size)),
+            ((world_size, rank), dict(tp_size=world_size, moe_ep_size=world_size,
+                                      moe_tp_size=1)),
+        ]
+    mapping, idx = _call_variants("Mapping(...)", Mapping, variants)
+    return mapping, idx
+
+
+# --------------------------------------------------------------------------------------
+# Quantized dispatch recipes. FlashInfer's MoE A2A dispatch takes input_payloads as a LIST
+# of [local_num_tokens, *] tensors and moves them as bytes (dtype-agnostic) — so a quantized
+# dispatch = pass [q, scale_factor] as the payload list, recv [recv_q, recv_sf], then DEQUANT
+# in stage() (UNTIMED, outside the comm window — the quant/dequant mirrors a producer handing
+# already-quantized activations, exactly like ep_deepep's layout-and-dispatch-v1 contract).
+#
+# Two families:
+#   * e4m3 block-128 / per-token / direct-cast — pure-torch (identical convention to ep_deepep,
+#     so FlashInfer-fp8 and DeepEP-fp8 are the SAME operating point on different transports).
+#   * mxfp8 / mxfp4 / nvfp4 — FlashInfer's native OCP-microscaling quantizers (mxfp8_quantize,
+#     mxfp4_quantize, nvfp4_quantize) + their matching dequantizers. These check goal's
+#     "MXFP8 / MXFP4 / NVFP4 dispatch" — reachable here precisely because the A2A is a byte
+#     mover and FlashInfer ships the quantize/dequantize kernels (flashinfer 0.6.8.post1).
+# The comm-correctness gate compares against the DEQUANTIZED cast that was actually sent
+# (ref = dequant(quant(x)) * factor), so it verifies the COMM, not the quantizer — same as
+# ep_deepep.expected(). Tolerance per format (4-bit fp4 is far looser than 8-bit fp8).
+_FP8_MAX = 448.0   # magnitude the e4m3 casts below scale amax to (and the direct-cast clamp bound)
+_FP8_BLOCK = 128   # hidden elements covered by one scale entry (scales are [T, H // _FP8_BLOCK])
+
+
+def _e4m3_block128_cast(x):
+    """PER-BLOCK-128 e4m3 cast (DeepEP default convention). Returns (x_fp8 [T, H], scales [T, H//128] f32)."""
+    rows, hidden = x.shape
+    blocked = x.float().view(rows, hidden // _FP8_BLOCK, _FP8_BLOCK)
+    block_amax = blocked.abs().amax(dim=2).clamp(min=1e-4)
+    quantized = (blocked * (_FP8_MAX / block_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return quantized.view(rows, hidden), (block_amax / _FP8_MAX).contiguous()
+
+
+def _e4m3_pertoken_cast(x):
+    """Per-token e4m3 cast: one amax per row, its scale broadcast across the [T, H//128] scale grid."""
+    n_tok, hidden = x.shape
+    row_amax = x.float().abs().amax(dim=1, keepdim=True).clamp(min=1e-4)
+    scale_grid = (row_amax / _FP8_MAX).expand(n_tok, hidden // _FP8_BLOCK).contiguous()
+    return (x.float() * (_FP8_MAX / row_amax)).to(torch.float8_e4m3fn), scale_grid
+
+
+def _e4m3_directcast(x):
+    """Direct cast: clamp to +/-_FP8_MAX and cast to e4m3; the scales are all-ones [T, H//128] f32."""
+    n_tok, hidden = x.shape
+    unit_scales = torch.ones((n_tok, hidden // _FP8_BLOCK), dtype=torch.float32, device=x.device)
+    return x.float().clamp(-_FP8_MAX, _FP8_MAX).to(torch.float8_e4m3fn), unit_scales
+
+
+def _e4m3_dequant_nd(x_fp8, scales):
+    """Dequantize block-128 e4m3 to bf16. Works for [R,H]+[R,H//128] (2D) and [E,S,H]+[E,S,H//128]
+    (3D recv): the last dim is H and each scale entry repeats over one 128-element block."""
+    *lead_dims, hidden = x_fp8.shape
+    n_blocks = hidden // _FP8_BLOCK
+    scaled = x_fp8.float().reshape(*lead_dims, n_blocks, _FP8_BLOCK) * scales.reshape(*lead_dims, n_blocks, 1)
+    return scaled.reshape(*lead_dims, hidden).to(torch.bfloat16)
+
+
+class _MicroscaleRecipe:
+    """FlashInfer-native mxfp8 / mxfp4 / nvfp4 quantizers plus the matching dequant (manual torch
+    for mxfp8/mxfp4, e2m1_and_ufp8sf_scale_to_float for nvfp4). Quantize on a flat [N, H] view (the
+    A2A moves per-token payloads), keep the LINEAR (is_sf_swizzled_layout=False) scale-factor as a
+    SECOND payload, dequant the recv by flattening its leading dims to [N, *] (the SF is per-row),
+    then reshaping back. Imports flashinfer lazily so a wheel without these kernels fails LOUD."""
+
+    _MX_BLOCK = 32   # mxfp8 e8m0 block size
+    _NV_VEC = 16     # nvfp4 e4m3 scale block size (sf_vec_size)
+
+    _MXFP4_VEC = 32  # mxfp4 e8m0 block size (sf_vec_size)
+    # OCP e2m1 magnitudes indexed by (exp<<1)|mant (3 low bits); bit3 = sign.
+    _E2M1_MAG = (0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0)
+
+    def __init__(self, kind):
+        self.kind = kind  # "mxfp8" | "nvfp4" | "mxfp4"
+        # mxfp4 is reachable after all: mxfp4_quantize() forces a tile-padded SWIZZLED SF, but the
+        # lower-level fp4_quantize(sf_vec_size=32, sf_use_ue8m0=True, is_sf_swizzled_layout=False)
+        # emits e2m1 + e8m0 in a LINEAR per-token layout (movable through the A2A). dequant is a manual
+        # e2m1 LUT * 2^(e8m0-127) (no flashinfer linear-mxfp4 dequant exists; mxfp4_dequantize wants
+        # swizzled). The dispatch gate is consistency-based, so this validates the comm honestly.
+        import flashinfer as _fi
+        self._fi = _fi
+        need = {"mxfp8": ("mxfp8_quantize",),
+                "nvfp4": ("fp4_quantize", "e2m1_and_ufp8sf_scale_to_float"),
+                "mxfp4": ("fp4_quantize",)}[kind]
+        for fn in need:
+            if not hasattr(_fi, fn):
+                raise _loud(f"{kind} quantizer lookup", f"flashinfer.{fn} not found",
+                            AttributeError(fn))
+
+    def cast(self, x):
+        # Returns (q, sf) — BOTH per-token (first-dim == T) so the A2A moves them as a payload list.
+        # mxfp8: q [T,H] e4m3, sf [T, H/32] e8m0(uint8), LINEAR (is_sf_swizzled_layout=False).
+        # nvfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/16] uint8 (ufp8 e4m3), per-tensor global sf.
+        # mxfp4: q [T, H/2] uint8 (packed e2m1), sf [T, H/32] uint8 (e8m0), LINEAR — via fp4_quantize.
+        fi = self._fi
+        xt = x.contiguous()
+        T, H = xt.shape
+        if self.kind == "mxfp8":
+            q, sf = fi.mxfp8_quantize(xt, is_sf_swizzled_layout=False)
+            sf = sf.reshape(T, H // self._MX_BLOCK)
+        elif self.kind == "mxfp4":
+            q, sf = fi.fp4_quantize(xt, sf_vec_size=self._MXFP4_VEC, sf_use_ue8m0=True,
+                                    is_sf_swizzled_layout=False)
+            if sf.dim() == 1:
+                sf = sf.reshape(T, -1)
+        else:  # nvfp4: global_scale maps amax -> the max representable (e4m3max * e2m1max = 448*6);
+               # dequant divides by it. (the reciprocal — amax/(448*6) — yields ~0 output, relerr~1.)
+            gsf = ((_FP8_MAX * 6.0) / xt.float().abs().amax().clamp(min=1e-4)).reshape(1)
+            q, sf = fi.fp4_quantize(xt, global_scale=gsf, sf_vec_size=self._NV_VEC,
+                                    sf_use_ue8m0=False, is_sf_swizzled_layout=False)
+            self._gsf = gsf  # NOTE(review): scale of THIS rank's x; dequant_nd reuses it on recv payloads — confirm it matches the senders'
+            if sf.dim() == 1:
+                sf = sf.reshape(T, -1)
+        return q.contiguous(), sf.contiguous()
+
+    def dequant_nd(self, q, sf):
+        # q/sf are recv tensors — 2D [T,*] (the x_ref path) or 3D [E,S,*] (the stage recv path).
+        # Flatten leading dims to [N,*], dequant, reshape back. mxfp8/mxfp4 use torch ops on q.device only.
+        lead = q.shape[:-1]
+        N = 1
+        for d in lead:
+            N *= d
+        if self.kind == "mxfp8":
+            # Manual DEVICE e8m0 dequant (FlashInfer ships only a CPU mxfp8_dequantize_host, too slow
+            # in the timing loop): x ~= q_e4m3 * 2^(sf_uint8 - 127), per block-32. Verified to match
+            # mxfp8_dequantize_host on the runner (see cx_fi_quant_smoke).
+            H = q.shape[-1]
+            B = self._MX_BLOCK
+            qf = q.reshape(N, H // B, B).float()
+            sff = sf.reshape(N, H // B).float()
+            out = (qf * torch.pow(torch.tensor(2.0, device=q.device), sff - 127.0).unsqueeze(-1)).reshape(N, H)
+        elif self.kind == "mxfp4":
+            # Manual e2m1 (LUT) + e8m0 block-32 decode (no flashinfer linear-mxfp4 dequant exists).
+            Hp = q.shape[-1]
+            H = Hp * 2
+            qb = q.reshape(N, Hp)
+            lut = torch.tensor(self._E2M1_MAG, device=q.device, dtype=torch.float32)
+            def _dec(nib):  # nib uint8 [N,Hp] 0..15 -> signed e2m1 magnitude
+                sign = 1.0 - 2.0 * ((nib >> 3) & 1).float()
+                return sign * lut[(nib & 0x7).long()]
+            lo = _dec(qb & 0xF)
+            hi = _dec((qb >> 4) & 0xF)          # byte packs [v_lo, v_hi]
+            vals = torch.stack([lo, hi], dim=-1).reshape(N, H)
+            blk = H // self._MXFP4_VEC
+            scale = torch.pow(torch.tensor(2.0, device=q.device), sf.reshape(N, blk).float() - 127.0)
+            out = (vals.view(N, blk, self._MXFP4_VEC) * scale.view(N, blk, 1)).reshape(N, H)
+        else:  # nvfp4 — DEVICE dequant (e2m1 + ufp8 e4m3 scale + per-tensor global), linear layout.
+            qf = q.reshape(N, q.shape[-1]).contiguous()
+            sff = sf.reshape(N, sf.shape[-1]).contiguous()
+            # dequant divides by the global scale -> pass its RECIPROCAL (verified on the runner:
+            # quant gsf=(448*6)/amax + dequant 1/gsf -> relerr ~0.09 = the 4-bit nvfp4 floor).
+            gsf = getattr(self, "_gsf", None)
+            out = self._fi.e2m1_and_ufp8sf_scale_to_float(
+                qf, sff, global_scale_tensor=(1.0 / gsf).cpu() if gsf is not None else None,
+                sf_vec_size=self._NV_VEC, is_sf_swizzled_layout=False)
+        H = out.shape[-1]
+        # e2m1_and_ufp8sf_scale_to_float is said to return on CPU (TODO confirm); move to the payload's device.
+        return out.reshape(*lead, H).to(device=q.device, dtype=torch.bfloat16)
+
+
+# dispatch_dtype -> (label, kind). kind selects the cast/dequant path in make_problem/stage.
+# mxfp4 uses fp4_quantize(sf_use_ue8m0=True, is_sf_swizzled_layout=False) — a LINEAR e8m0 SF that
+# moves per-token through the A2A (mxfp4_quantize's tile-padded swizzled SF does NOT; that was the
+# old blocker). mxfp8/mxfp4/nvfp4 + the e4m3 fp8 recipes cover the OCP-microscaling dispatch goal.
+_QUANT_RECIPES = {
+    "fp8":            ("per-block-128", "e4m3"),
+    "fp8-pertoken":   ("per-token", "e4m3"),
+    "fp8-directcast": ("direct-cast", "e4m3"),
+    "mxfp8":          ("mxfp8-e8m0-block32", "mxfp8"),
+    "mxfp4":          ("mxfp4-e8m0-block32", "mxfp4"),
+    "nvfp4":          ("nvfp4-e4m3-block16", "nvfp4"),
+}
+_E4M3_CASTS = {"fp8": _e4m3_block128_cast, "fp8-pertoken": _e4m3_pertoken_cast,
+               "fp8-directcast": _e4m3_directcast}
+# Per-KIND comm-correctness tolerance (round trip of the dequantized cast); kinds with no entry get 5e-2.
+_QUANT_TOL = {"e4m3": 1.25e-1, "mxfp8": 1.5e-1, "mxfp4": 3.5e-1, "nvfp4": 3.0e-1}
+
+
+class FlashInferBackend:
+    name = "flashinfer"
+    # FlashInfer combine reuses the dispatch workspace/handle (no re-dispatch needed before
+    # a timed combine), mirroring DeepEP normal mode — combine consumes the recv payload.
+    # MoeAlltoAll is a stateful idle->dispatched->idle FSM (asserts "dispatch called twice without
+    # combine"). The harness times dispatch in isolation (loops it) AND combine in isolation. Setting
+    # this True makes the combine-timing loop run an untimed dispatch+stage (pre=) before each combine
+    # sample, so combine always sees a "dispatched" state; dispatch() resets the FSM to idle at its
+    # start so the dispatch-timing loop + the roundtrip (paired) timing all stay valid.
+    combine_needs_redispatch = True
+    # MoeAlltoAll's paired dispatch/combine FSM means isolated/looped dispatch timing corrupts the
+    # symmetric workspace (CUDA launch failure). Only the PAIRED roundtrip is measurable — the
+    # harness times the roundtrip and mirrors it into dispatch/combine (isolated_sum is N/A here).
+    # The roundtrip IS goal P0's headline metric, so this is the right measurement for this backend.
+    roundtrip_only = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel).
+    #   bf16            : MoeAlltoAll keeps bf16 payloads end-to-end (no quant round trip).
+    #   fp8*            : e4m3 dispatch (per-block-128 / per-token / direct-cast) — SAME convention
+    #                     as ep_deepep, so FlashInfer-fp8 == DeepEP-fp8 operating point, different
+    #                     transport (the TRT-LLM throughput A2A vs DeepEP NVLink).
+    #   mxfp8/mxfp4/nvfp4: OCP-microscaling dispatch via FlashInfer's native quantizers. The A2A
+    #                     moves [q, scale_factor] as a payload LIST (byte-agnostic), dequant in
+    #                     stage(). Covers goal's "MXFP8 / MXFP4 / NVFP4 dispatch" — reachable on
+    #                     this working path because FlashInfer ships the quantize/dequantize kernels.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8", "fp8-pertoken", "fp8-directcast",
+                            "mxfp8", "mxfp4", "nvfp4"}
+    SUPPORTED_MODES = {"normal"}
+    # Only the contract whose timing boundary FlashInfer can honor: layout (the dispatch
+    # send-counts) is computed inside dispatch and cannot be hoisted to a separate untimed
+    # step the way DeepEP's get_dispatch_layout can — so cached-layout-comm-only-v1 and
+    # runtime-visible-v1 (fp8) are NOT offered.
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+    # Combine path: bf16 (default) OR a quantized COMBINE OUTPUT via the newer flashinfer
+    # moe_a2a_combine output_dtype (fp8 e4m3 and nvfp4 are the dtypes __init__ accepts; the bundled
+    # 0.6.8.post1 has no output_dtype, so a combine-quant run upgrades FlashInfer first via
+    # cx_build_flashinfer_latest). Any other combine dtype is rejected in __init__ as "not wired".
+    SUPPORTED_COMBINE_DTYPES = {"bf16", "fp8", "nvfp4"}
+    SUPPORTED_COMBINE_QUANT_MODES = {"none", "fp8", "nvfp4"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Validate the requested dtype/mode, resolve the dispatch/combine quant recipes, build the
+        pure-EP Mapping and the MoeAlltoAll object, then record backend_provenance. local_rank is unused."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.contract = args.measurement_contract
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # Quant recipe (None for bf16). e4m3 = pure-torch cast (DeepEP convention); mx/nvfp4 =
+        # FlashInfer-native quantizer. dispatch passes [q, sf]; stage() dequants (UNTIMED).
+        self.dispatch_dtype = args.dispatch_dtype
+        self.quant_label, self.quant_kind = _QUANT_RECIPES.get(args.dispatch_dtype, (None, None))
+        self._micro = None
+        if self.quant_kind in ("mxfp8", "mxfp4", "nvfp4"):
+            self._micro = _MicroscaleRecipe(self.quant_kind)   # lazy flashinfer import, LOUD if absent
+        elif self.quant_kind == "e4m3":
+            self._e4m3_cast = _E4M3_CASTS[args.dispatch_dtype]
+        # bf16 round-trip error ~5e-3 (tol 5e-2); fp8 e4m3 ~1/16; fp4 (4-bit) far looser. Per-format
+        # tolerance recorded in the artifact so the looser quant gate is explicit, not hidden.
+        self.tolerance = _QUANT_TOL.get(self.quant_kind, 5e-2)
+        # The quant CAST + recv-DEQUANT run in make_problem/stage (OUTSIDE the timed comm window) —
+        # the layout-and-dispatch-v1 contract (producer hands quantized activations). Recorded honestly.
+        self.fp8_in_timing = False if self.quant_kind else None
+        self.scale_layout = self.quant_label
+
+        # Combine-side quant (SEPARATE axis from dispatch): a quantized COMBINE OUTPUT via the newer
+        # flashinfer moe_a2a_combine output_dtype (the bundled 0.6.8.post1 has NO output_dtype, so a
+        # combine-quant run upgrades FlashInfer first — cx_build_flashinfer_latest). The combine
+        # kernel emits the per-source-token reduction already as fp8 + per-token scales; we dequant
+        # (cached, untimed) for the correctness gate. The quantized reduction is what's TIMED.
+        self.combine_dtype = getattr(args, "combine_dtype", "bf16")
+        self.combine_quant = self.combine_dtype not in ("bf16", None, "")
+        self.combine_input_dtype = self.combine_dtype
+        self.combine_quant_mode = getattr(args, "combine_quant_mode", "none")
+        self.combine_quant_in_timing = True if self.combine_quant else None
+        self.combine_dequant_in_timing = False if self.combine_quant else None
+        self._qc_out_dtype = None
+        self._qc_scale_shape = None   # cached working output_scales shape (discovered on first combine)
+        if self.combine_quant:
+            import inspect as _inspect
+            # Probe the real parameter names, not the stringified signature (no substring false hits).
+            if "output_dtype" not in _inspect.signature(fi_comm.MoeAlltoAll.combine).parameters:
+                raise RuntimeError(
+                    "combine-quant requested but flashinfer.comm.MoeAlltoAll.combine has NO output_dtype — "
+                    "this wheel (likely 0.6.8.post1) predates PR3376/3643. The run must upgrade FlashInfer "
+                    "first (CX_COMBINE_DTYPE!=bf16 triggers cx_build_flashinfer_latest in run_in_container.sh).")
+            # fp8 -> e4m3 output + UE8M0 uint8 vec-32 scales (= MXFP8). nvfp4 -> uint8 packed-e2m1
+            # output + e4m3 vec-16 scales + a per-tensor output_scalar_scale (the fp4 path).
+            self._qc_out_dtype = {"fp8": torch.float8_e4m3fn, "nvfp4": torch.uint8}.get(self.combine_dtype)
+            if self._qc_out_dtype is None:
+                raise RuntimeError(f"combine_dtype={self.combine_dtype} not wired (fp8|nvfp4)")
+            # quantized-combine round-trip is looser than the bf16 reconstruction (fp8 ~1/16 +
+            # whatever the dispatch added); keep at least the dispatch tol.
+            self.tolerance = max(self.tolerance, 1.6e-1)
+
+        # TensorRT-LLM lineage: MoeAlltoAll LIVES IN flashinfer.comm.trtllm_moe_alltoall (the
+        # "throughput backend" — the TRT-LLM NVLink one-sided AllToAll over an MNNVL symmetric
+        # workspace). So this adapter's DEFAULT path IS the TRT-LLM one-sided EP; CX_FLASHINFER_TRTLLM
+        # only flips the provenance label (there is no separate functional path — both call the same
+        # moe_a2a_dispatch/combine kernels). Kept as a label so the artifact can be tagged trtllm.
+        self.trtllm = os.environ.get("CX_FLASHINFER_TRTLLM", "0") == "1"
+
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        # Workspace/buffer ceiling. The MoeAlltoAll symmetric workspace is sized for
+        # max_num_tokens per rank; the sweep is capped at this (buffer_cap) so a too-large T
+        # is dropped (reported) rather than overflowing. 4096 holds the prefill ladder top.
+        self.max_num_tokens = int(os.environ.get("CX_FLASHINFER_MAX_TOKENS", "4096"))
+
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _flashinfer_version()
+
+        # Build the pure-EP Mapping (defensive over kwarg variants; logs which worked).
+        gpus_per_node = int(args.gpus_per_node or world_size)
+        self.mapping, map_variant = _build_mapping(world_size, rank, gpus_per_node)
+        if rank == 0:
+            print(f"[flashinfer] Mapping constructed via variant #{map_variant} "
+                  f"(world={world_size} rank={rank} gpus_per_node={gpus_per_node} "
+                  f"tp={world_size} moe_ep={world_size} moe_tp=1)",
+                  file=sys.stderr)
+
+        # Construct the comm object. MoeAlltoAll (in flashinfer.comm.trtllm_moe_alltoall) IS the
+        # TRT-LLM throughput-backend one-sided A2A — it allocates its MNNVL symmetric workspace
+        # internally and calls the same moe_a2a_dispatch/combine kernels the functional API exposes.
+        # So we ALWAYS construct it; the trtllm flag only tags provenance (no separate path).
+        self.path = "trtllm_moe_alltoall" if self.trtllm else "moe_alltoall"
+        self.a2a = self.workspace = self.ws_size = None
+        self._init_moe_alltoall(ver)
+
+        self.backend_provenance = {
+            "flashinfer_version": ver,
+            "flashinfer_commit": os.environ.get("FLASHINFER_COMMIT") or f"pkg-{ver}",
+            # Exact instantiated library stack (flashinfer/cubin/jit-cache + cutlass-dsl + torch),
+            # captured after any upgrade and handed across rack srun steps.
+            "flashinfer_stack": os.environ.get("CX_FLASHINFER_STACK"),
+            "gpus_per_node": gpus_per_node,
+            "mode": "normal", "path": self.path, "trtllm": self.trtllm,
+            # MoeAlltoAll's home module — proves this EP path IS the TRT-LLM one-sided throughput A2A.
+            "backend_lineage": "flashinfer.comm.trtllm_moe_alltoall.MoeAlltoAll",
+            "transport": "trtllm-throughput-backend-onesided",
+            # quant provenance (None/bf16 path -> nulls). scale_layout + dispatch_dtype name the recipe.
+            "dispatch_dtype": self.dispatch_dtype, "quant_kind": self.quant_kind,
+            "scale_layout": self.scale_layout, "quant_in_timing": self.fp8_in_timing,
+            # combine-side quant (a SEPARATE axis): a quantized COMBINE OUTPUT (fp8 e4m3 or nvfp4) when set.
+            "combine_dtype": self.combine_dtype, "combine_quant": self.combine_quant,
+            "combine_quant_in_timing": self.combine_quant_in_timing,
+            "resource_mode": args.resource_mode,
+            # FlashInfer MoE A2A occupancy is fixed by the library (a symmetric-memory kernel, not
+            # an SM/CU budget we set) — like DeepEP LL. Recorded as a fixed-kernel run so the
+            # resource_profile maps it to resource_class=fixed-kernel (excluded from the Pareto).
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "fixed-kernel",
+            "max_num_tokens": self.max_num_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts,
+            "mapping_variant": map_variant,
+            "routing_factor": _ROUTING_FACTOR,
+            # MNNVL symmetric workspace — comm bootstrapped via torch.distributed (TorchDistBackend),
+            # NOT MPI, so it works under torchrun without mpi4py / an MPI launch.
+            "workspace": "mnnvl-symmetric", "mnnvl_comm": getattr(self, "_mnnvl_comm", "n/a"),
+        }
+
+    def _init_moe_alltoall(self, ver):
+        """Class path: build self.a2a via flashinfer.comm.MoeAlltoAll(mapping, max_num_tokens, top_k,
+        num_experts) and set self.path; `ver` is unused. Raises via _loud if MoeAlltoAll is missing."""
+        MoeAlltoAll = getattr(fi_comm, "MoeAlltoAll", None)
+        if MoeAlltoAll is None:
+            raise _loud("MoeAlltoAll lookup", "flashinfer.comm.MoeAlltoAll not found",
+                        AttributeError("MoeAlltoAll"))
+        # By default the MNNVL symmetric workspace bootstraps its cross-rank comm via MPI
+        # (MnnvlMemory.get_comm -> MpiComm().Split), which fails under torchrun (no mpi4py / no MPI
+        # launch). Wrapping FlashInfer's TorchDistBackend in an MnnvlConfig is the no-MPI path.
+        mnnvl_config = None
+        try:
+            from flashinfer.comm.mnnvl import MnnvlConfig, TorchDistBackend, MnnvlMemory
+            mnnvl_config = MnnvlConfig(comm_backend=TorchDistBackend(group=None))
+            # get_comm() returns the cached class-level comm if set, else MPI-Splits. Register the
+            # torch-dist comm explicitly so the workspace bootstrap NEVER touches MPI/mpi4py.
+            if MnnvlMemory.comm is None:
+                MnnvlMemory.set_comm_from_config(self.mapping, mnnvl_config)
+            if self.rank == 0:
+                print("[ep_flashinfer] MNNVL via TorchDistBackend (no MPI)", flush=True)
+        except Exception as exc:  # older flashinfer without TorchDistBackend -> fall back (will MPI-fail loudly)
+            if self.rank == 0:
+                print(f"[ep_flashinfer] WARN: no TorchDistBackend ({exc!r}); MoeAlltoAll will need MPI",
+                      flush=True)
+        # NOTE(review): a failure AFTER MnnvlConfig() succeeded leaves mnnvl_config set -> "torch-dist".
+        self._mnnvl_comm = "torch-dist" if mnnvl_config else "mpi-default"  # provenance built later
+        # kwarg names have drifted across releases, so several spellings are tried; hidden_size is REQUIRED
+        # (else MoeAlltoAll asserts "hidden_size must be provided if workspace_size_per_rank is not provided").
+        hs = int(self.args.hidden)
+        mc = dict(mnnvl_config=mnnvl_config) if mnnvl_config is not None else {}  # first variant only
+        variants = [
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs, **mc)),
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs)),
+            ((self.mapping,), dict(max_num_tokens=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs,
+                                   ep_size=self.world_size)),
+            ((self.mapping, self.max_num_tokens, self.top_k, self.num_experts, hs), {}),
+            ((self.mapping,), dict(max_num_tokens_per_rank=self.max_num_tokens, top_k=self.top_k,
+                                   num_experts=self.num_experts, hidden_size=hs)),
+        ]
+        self.a2a, idx = _call_variants("MoeAlltoAll(...)", MoeAlltoAll, variants)
+        self.path = "moe_alltoall"
+        if self.rank == 0:
+            print(f"[flashinfer] MoeAlltoAll constructed via variant #{idx}", file=sys.stderr)
+
+    def buffer_cap(self, args):
+        """Sweep ceiling: the symmetric workspace is sized for max_num_tokens per rank, so T is
+        capped there (the cap is reported by the harness, never silently truncated)."""
+        return self.max_num_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        """Wrap this rank's slice of the shared trace (idx[T,topk], weights[T,topk],
+        x[T,hidden] bf16) in a problem namespace.
+
+        The expert ids are stored twice: int64 for the harness/expected path and int32 for
+        the kernel call (token_selected_experts is commonly int32 in TensorRT-LLM kernels).
+        payloads is [x] for bf16 or [q, scale_factor] for a quantized dispatch; that cast
+        runs HERE, i.e. as UNTIMED preprocessing. x_ref is the dequantized cast, the COMM
+        correctness reference, so the gate verifies the all-to-all and not the quantizer.
+        """
+        if self.quant_kind == "e4m3":
+            q, sf = self._e4m3_cast(x)
+            payloads, x_ref = [q, sf], _e4m3_dequant_nd(q, sf)
+        elif self._micro is not None:
+            q, sf = self._micro.cast(x)
+            # 2D recv path (lead=(T,)) = source-token ref
+            payloads, x_ref = [q, sf], self._micro.dequant_nd(q, sf)
+        else:  # bf16: the payload is its own reference
+            payloads, x_ref = [x], x
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_idx_i32=idx.to(torch.int32),
+            topk_weights=weights.to(torch.float32),
+            payloads=payloads, x_ref=x_ref,
+        )
+
+    def _reset_moe_fsm(self):
+        """Best-effort: put the MoeAlltoAll FSM back to idle so a fresh dispatch is legal.
+        The harness loops dispatch in isolation (and re-dispatches before each combine); a pending
+        "dispatched" state from a prior un-combined dispatch would assert. Discarding it is fine
+        for timing (each dispatch re-populates the workspace)."""
+        state = getattr(getattr(self, "a2a", None), "_state", None)
+        if state is None or getattr(state, "phase", "idle") == "idle":
+            return
+        try:
+            state.phase = "idle"
+        except Exception:  # defensive: the internal attr may move
+            pass
+
+    def dispatch(self, p):
+        """Dispatch p.payloads ([x] bf16, or [q, scale_factor] when quantized) through
+        MoeAlltoAll.dispatch(token_selected_experts, input_payloads, runtime_max_tokens_per_rank),
+        which returns a LIST of recv tensors [ep_size, max_tokens, *] (one per payload, same order)."""
+        self._reset_moe_fsm()
+        idx_and_payloads = (p.topk_idx_i32, p.payloads)
+        attempts = [
+            (idx_and_payloads + (p.T,), {}),
+            (idx_and_payloads, dict(runtime_max_tokens_per_rank=p.T)),
+            (idx_and_payloads, dict(runtime_max_tokens=p.T)),
+            ((p.topk_idx, p.payloads, p.T), {}),                  # int64 idx fallback
+        ]
+        recv, which = _call_variants("MoeAlltoAll.dispatch(...)", self.a2a.dispatch, attempts)
+        parts = list(recv) if isinstance(recv, (list, tuple)) else [recv]
+        return types.SimpleNamespace(recv=recv, recv_q=parts[0],
+                                     recv_sf=parts[1] if len(parts) > 1 else None,
+                                     recv_payload=self._first_payload(recv),
+                                     dispatch_variant=which, combine_input=None)
+
+    @staticmethod
+    def _first_payload(recv):
+        """Return the first payload Tensor of a dispatch result (the routed x on this rank).
+
+        dispatch may hand back a Tensor, a list of payloads, or a (payloads, meta) tuple; anything
+        else is returned unchanged (recv_tokens guards with is_tensor)."""
+        seq = (list, tuple)
+        if torch.is_tensor(recv) or not (isinstance(recv, seq) and recv):
+            return recv
+        head = recv[0]
+        if isinstance(head, seq) and head:
+            head = head[0]
+        return head if torch.is_tensor(head) else recv
+
+    def stage(self, p, h):
+        """Set h.combine_input, the tensor combine will reduce; returns None (identity expert).
+
+        bf16: the recv IS the "expert output" as-is. combine reads back from the SAME workspace
+        dispatch populated, so recv_payload is handed over directly with NO clone (a clone of the
+        workspace-backed recv broke the layout and async-corrupted CUDA).
+        Quantized dispatch: recv_q + recv_sf are dequantized to a fresh bf16 tensor. Dequant is
+        UNTIMED preprocessing (layout-and-dispatch-v1), but FlashInfer is roundtrip_only, so stage()
+        runs INSIDE the timed dispatch->combine loop. The recv is DETERMINISTIC for a fixed problem,
+        so it is dequantized ONCE and cached on the problem: steady-state timing then measures comm
+        only, matching DeepEP-fp8's timing boundary and keeping the CPU-side nvfp4 dequant out of it.
+        """
+        if not self.quant_kind:
+            h.combine_input = h.recv_payload
+        else:
+            dequantized = getattr(p, "_combine_input_cache", None)
+            if dequantized is None:
+                if self.quant_kind == "e4m3":
+                    dequantized = _e4m3_dequant_nd(h.recv_q, h.recv_sf)
+                else:
+                    dequantized = self._micro.dequant_nd(h.recv_q, h.recv_sf)
+                p._combine_input_cache = dequantized
+            h.combine_input = dequantized
+        first_log = self.rank == 0 and not getattr(self, "_shape_logged", False)
+        if first_log and torch.is_tensor(h.combine_input):
+            self._shape_logged = True
+            print(f"[ep_flashinfer] dtype={self.dispatch_dtype} recv_q={tuple(h.recv_q.shape)}:{h.recv_q.dtype}"
+                  f" combine_input={tuple(h.combine_input.shape)}:{h.combine_input.dtype}", flush=True)
+        return None
+
+    def combine(self, p, h):
+        """Reduce h.combine_input to the per-source-token result on this rank ([T, hidden] bf16).
+
+        A quantized combine output is delegated to _combine_quant. Otherwise this calls
+        MoeAlltoAll.combine(payload, runtime_max_tokens_per_rank, payload_in_workspace=False),
+        trying the positional then the keyword spelling, each first with payload_in_workspace=False
+        and then without the flag. payload_in_workspace=True is never attempted: it enforces an
+        exact-pointer requirement and raised a RuntimeError (not a TypeError) when it was tried.
+        """
+        if self.combine_quant:
+            return self._combine_quant(p, h)
+        payload, n_tok = h.combine_input, p.T
+        attempts = [
+            ((payload, n_tok), dict(payload_in_workspace=False)),
+            ((payload, n_tok), {}),
+            ((payload,), dict(runtime_max_tokens_per_rank=n_tok, payload_in_workspace=False)),
+            ((payload,), dict(runtime_max_tokens_per_rank=n_tok)),
+        ]
+        combined, h.combine_variant = _call_variants("MoeAlltoAll.combine(...)", self.a2a.combine, attempts)
+        return self._as_tensor(combined)
+
+    _QC_VEC = 32   # fp8 combine output uses UE8M0 scales, vector size 32 (flashinfer main source)
+
+    def _combine_quant(self, p, h):
+        # Quantized COMBINE OUTPUT -> returns the bf16 dequant from _finish_qcombine. Pinned from the
+        # flashinfer-main source: combine(output_dtype=float8_e4m3fn) emits e4m3 + UE8M0 scale factors
+        # "packed in torch.uint8, vector size 32" (linear layout), i.e. MXFP8 (e4m3 + e8m0 block-32), so
+        # output_scales MUST be uint8 [T, H/32] (the kernel WRITES it; first run failed "float32 vs uint8").
+        # The fp8 reduction is what's TIMED; the dequant x = e4m3 * 2^(scale_uint8 - 127) is cached, UNTIMED.
+        # CX_QC_SCALE: "block32" (default) | "pertoken" [T,1] | "scalar" (direct-cast, output_scalar_scale only).
+        H = int(getattr(self, "hidden", 0)) or int(self.args.hidden)  # self.hidden if set and non-zero
+        T = p.T
+        if self.combine_dtype == "nvfp4":
+            # NVFP4 combine: uint8 packed-e2m1 output + e4m3 (float8) scales vec-16 + per-tensor scalar.
+            blocks = max(1, H // 16)
+            sc = torch.zeros(T, blocks, device=self.device, dtype=torch.float8_e4m3fn)
+            self._qc_scalar = float(os.environ.get("CX_QC_NVFP4_SCALAR", "1.0"))  # read back by _finish_qcombine
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype,
+                      output_scales=sc, output_scalar_scale=self._qc_scalar)
+            label = f"nvfp4 output_scales=e4m3[{T},{blocks}] scalar={self._qc_scalar}"
+        elif os.environ.get("CX_QC_SCALE") == "scalar":
+            # DIRECT-CAST fp8 combine: a single per-tensor output_scalar_scale, NO per-block
+            # output_scales (the unscaled/global-scaled e4m3 emit — goal "Direct-cast FP8 combine").
+            # The working mxfp8 path emits SCALED e4m3+e8m0; this probes whether the same kernel also
+            # supports the scalar-only mode. If the kernel REQUIRES per-block output_scales for fp8
+            # output, the call below raises and the run records that (the documented kernel limit).
+            sc = None
+            self._qc_scalar = float(os.environ.get("CX_QC_FP8_SCALAR", "1.0"))  # read back by _finish_qcombine
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype,
+                      output_scalar_scale=self._qc_scalar)
+            label = f"fp8-directcast output_scalar_scale={self._qc_scalar} (no per-block scales)"
+        else:
+            # MXFP8 combine: e4m3 output + UE8M0 uint8 scales vec-32 (the main-source spec).
+            mode = os.environ.get("CX_QC_SCALE", "block32")  # any value other than "pertoken" means block-32
+            blocks = 1 if mode == "pertoken" else max(1, H // self._QC_VEC)
+            sc = torch.zeros(T, blocks, device=self.device, dtype=torch.uint8)
+            kw = dict(payload_in_workspace=False, output_dtype=self._qc_out_dtype, output_scales=sc)
+            label = f"mxfp8 output_scales=uint8[{T},{blocks}]"
+        try:
+            out = self.a2a.combine(h.combine_input, T, **kw)
+        except Exception as exc:  # any failure is re-raised through _loud, tagged with the mode label
+            raise _loud(f"MoeAlltoAll.combine({label})",
+                        f"quant-combine call failed ({self.combine_dtype}; per the main-source spec)", exc)
+        if self.rank == 0 and not getattr(self, "_qc_logged", False):  # log once, on rank 0 only
+            self._qc_logged = True
+            oq = out[0] if isinstance(out, (tuple, list)) else out
+            print(f"[ep_flashinfer] combine-quant {label} OK out={tuple(oq.shape)}:{oq.dtype}", flush=True)
+        return self._finish_qcombine(p, out, sc, H)
+
+    def _finish_qcombine(self, p, out, sc, H):
+        """Dequantize a quantized combine output to bf16 for the correctness gate (UNTIMED).
+
+        Computed once, then cached on the problem as p._qc_dequant and returned as-is afterwards.
+          nvfp4:       e2m1_and_ufp8sf_scale_to_float(packed-e2m1, e4m3-scales, global=1/scalar), vec-16.
+          direct-cast: sc is None -> e4m3 * self._qc_scalar (single global scalar, no block scales).
+          mxfp8:       e4m3 * 2^(UE8M0_uint8 - 127), per block of H/blocks columns or per token.
+        """
+        q = out[0] if isinstance(out, (tuple, list)) else out
+        hit = getattr(p, "_qc_dequant", None)
+        if hit is not None:
+            return hit
+        T = q.shape[0]
+        if self.combine_dtype == "nvfp4":
+            import flashinfer as _fi   # nvfp4 dequant uses the flashinfer e2m1 decoder (linear layout, vec-16)
+            # the combine wrote the nvfp4 scales as float8_e4m3fn, but the e2m1 decoder wants the
+            # raw ufp8 bytes as uint8 — reinterpret (same 1-byte storage), don't cast.
+            sc_u8 = sc.reshape(T, -1).contiguous().view(torch.uint8)
+            gsf = torch.tensor([1.0 / max(1e-6, getattr(self, "_qc_scalar", 1.0))], dtype=torch.float32)
+            o = _fi.e2m1_and_ufp8sf_scale_to_float(
+                q.reshape(T, -1).contiguous(), sc_u8,
+                global_scale_tensor=gsf, sf_vec_size=16, is_sf_swizzled_layout=False)
+            hit = o.reshape(T, H).to(device=q.device, dtype=torch.bfloat16)
+        elif sc is None:
+            hit = (q.float() * float(getattr(self, "_qc_scalar", 1.0))).to(torch.bfloat16)
+        else:
+            of = q.float()
+            blocks = sc.shape[-1] if torch.is_tensor(sc) and sc.dim() >= 2 else 1
+            two = torch.tensor(2.0, device=of.device)
+            if blocks > 1 and (H % blocks) == 0:
+                scale = torch.pow(two, sc.float() - 127.0).view(T, blocks, 1)  # e8m0
+                hit = (of.view(T, blocks, H // blocks) * scale).reshape(T, H).to(torch.bfloat16)
+            else:
+                hit = (of * torch.pow(two, sc.float().reshape(T, 1) - 127.0)).to(torch.bfloat16)
+        p._qc_dequant = hit
+        return hit
+
+    @staticmethod
+    def _as_tensor(x):
+        """Return x if it is a Tensor, or x[0] if x is a non-empty list/tuple led by a Tensor."""
+        head = x[0] if isinstance(x, (list, tuple)) and x else x
+        if not torch.is_tensor(head):
+            raise _loud("combine result", f"expected a Tensor, got {type(x)}",
+                        TypeError("non-tensor combine result"))
+        return head
+
+    def expected(self, p, h):
+        """Correctness reference for the round trip with an identity expert -> (tensor, p.T).
+
+        FlashInfer combine takes NO gate weights and reduces the recv [ep_size, max_tokens, hidden]
+        over the ep_size (per-RANK) axis, so each source token comes back scaled by a factor chosen
+        by _ROUTING_FACTOR:
+          "ranks" (default) -> x * distinct ranks among its top_k experts (per-rank-sum combine,
+                               like DeepEP normal mode: combine does not re-weight)
+          "topk"            -> x * top_k              (if combine sums every expert copy)
+          "weight-sum"      -> x * sum(topk_weights)  (if combine applies the gate)
+        The base is p.x_ref when set (= dequant(quant(x)), so the gate verifies the COMM and not
+        the quantizer), else p.x. `h` is unused."""
+        ref = (p.x_ref if p.x_ref is not None else p.x).float()
+        if _ROUTING_FACTOR == "weight-sum":
+            return ref * p.topk_weights.sum(dim=1, keepdim=True), p.T        # factor is [T, 1]
+        if _ROUTING_FACTOR == "topk":
+            return ref * float(self.top_k), p.T
+        # "ranks": count the distinct ranks among each token's top_k experts (vectorized)
+        per_rank = max(1, self.num_experts // self.world_size)
+        owner = (p.topk_idx.long() // per_rank).clamp_(0, self.world_size - 1)   # [T, topk]
+        hit = torch.zeros(owner.shape[0], self.world_size, device=owner.device, dtype=torch.float32)
+        hit.scatter_(1, owner, 1.0)
+        return ref * hit.sum(dim=1, keepdim=True), p.T               # [T, 1] distinct ranks/token
+
+    def recv_tokens(self, h):
+        """Rows in this rank's recv buffer (padded; reported by the harness, not gated on): the
+        product of every recv_payload dim except the trailing feature dim. Returns 0 for a non-Tensor."""
+        rp = h.recv_payload
+        if not torch.is_tensor(rp) or rp.dim() < 1:
+            return 0
+        # dispatch's recv is [ep_size, max_tokens, *]: shape[0] alone is ep_size, not a row count
+        return int(rp.shape[:max(1, rp.dim() - 1)].numel())
+
+    def finalize(self, rc):  # best-effort teardown; rc is handed back unchanged
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:  # deliberately swallowed: a teardown failure does not alter rc
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 0000000000..0ae2c15133
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,1130 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`)
+implement a small duck-typed protocol; this module owns the source-tokens-per-rank
+sweep, the timing, the correctness gate, and the provenance-tagged JSON doc.
+
+Fair-comparison contract (see docs/methodology.md):
+  * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs +
+    gate weights are generated once from a fixed seed over the *global* batch and are
+    identical on every SKU; each rank materializes its slice. So every platform runs
+    the *same* problem (no per-rank/per-platform RNG in the adapters).
+  * **Explicit measurement contract** (review #3): adapters conform to a NAMED timing
+    boundary, they do not each choose their own. layout-and-dispatch-v1 times the
+    routing-layout step inside dispatch (the only contract MoRI can honor); cached-
+    layout-comm-only-v1 hoists it out (DeepEP). Combine excludes staging in both.
+    Serial = SUM of the two isolated medians (NOT a measured chained op).
+  * **Correct collective percentile**: each iteration's latency is reduced MAX across
+    ranks first (a collective finishes with its slowest rank), THEN percentiled —
+    `median_i(max_r)`, not `max_r(median_i)`.
+  * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and
+    `global_tokens = T * ep_size` are recorded for the weak/strong-scaling x toggle.
+
+stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported
+lazily inside run_sweep) so this file `py_compile`s without torch.
+
+Backend protocol:
+    name, mode, combine_needs_redispatch, backend_provenance(dict)
+    buffer_cap(args) -> int|None
+    make_problem(T, idx, weights, x) -> problem   # materialize this rank's trace slice
+    dispatch(problem) -> handle                   # pure dispatch comm (timed)
+    stage(problem, handle)                        # untimed expert-output placement
+    combine(problem, handle) -> tensor            # pure combine comm (timed)
+    expected(problem, handle) -> (tensor, n_cmp)  # correctness reference
+    recv_tokens(handle) -> int                    # realized tokens received this rank
+    finalize(rc) -> int|NoReturn
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+
+# v5 adds fixed-512-v1 sampling to the v4 result contract. Historical v4 artifacts retain their
+# original variable-sample semantics and validate against ep-result-v4.schema.json.
+SCHEMA_VERSION = 5
+
+# Every comparison-grade EP point uses the same literal timing profile on every SKU/backend.
+# Eight timed iterations keep each MoRI burst well below its sustained-iteration wedge, 64 trials
+# provide 512 observations per operation, and 32 warmups meet Blackwell's measured clock-ramp floor.
+SAMPLING_CONTRACT = "fixed-512-v1"
+TIMED_SAMPLES_PER_POINT = 512  # = TIMED_ITERS_PER_TRIAL * TRIALS_PER_POINT (8 * 64)
+TIMED_ITERS_PER_TRIAL = 8
+TRIALS_PER_POINT = 64
+WARMUP_ITERS_PER_TRIAL = 32
+WARMUP_SEMANTICS = "full-roundtrip-per-trial-point-v1"
+
+# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal
+# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a
+# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128]  # ends at 128, where PREFILL_LADDER starts
+PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096]
+
+# Dispatch-payload element size (bytes/element of hidden) for the derived-bandwidth estimate.
+# fp8/mxfp8 = 1B e4m3/e8m0; mxfp4/nvfp4 = 0.5B (4-bit e2m1, 2 values/byte) — the headline metric
+# is measured LATENCY (dtype-independent); only the secondary GB/s estimate uses this.
+_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp8": 1, "fp8-pertoken": 1, "fp8-directcast": 1,
+                "mxfp8": 1, "mxfp4": 0.5, "nvfp4": 0.5}
+
+# Phase profiles (goal P2 "decode/prefill representation"): decode/prefill are token-size REGIMES
+# that also carry distinct serving semantics — NOT merely ladder aliases. Emitted into the doc so a
+# T=128 point launched under "prefill" is never silently read as decode (the shared-T overlap is
+# the same kernel at the same T; the phase records what serving situation it stands in). Each point
+# is ONE MoE layer, ONE step, a SINGLE dispatch+combine collective pair — not a whole model or
+# several concurrent layers.
+PHASE_PROFILE = {
+    "decode": {"regime": "decode", "tokens_per_iter": "1 (or few) per active sequence",
+               "microbatch": "one decode step across the active sequences",
+               "routing_variability": "varies step-to-step; this static microbenchmark measures one step",
+               "represents": "one MoE layer · one decode step · one dispatch+combine collective"},
+    "prefill": {"regime": "prefill", "chunk": "chunked-prefill — many tokens/sequence per MoE layer",
+                "request_mixture": "tokens of one chunk entering a single MoE layer at once",
+                "represents": "one MoE layer · one prefill chunk · one dispatch+combine collective"},
+}
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """CLI args shared by every backend (the entrypoint adds --backend)."""
+    ap.add_argument("--phase", default="decode", choices=["decode", "prefill"],
+                    help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder")
+    ap.add_argument("--tokens-ladder", default="",
+                    help="space/comma-separated source-tokens-per-rank sweep; blank = phase default")
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)")
+    ap.add_argument("--dispatch-dtype", default="bf16",
+                    choices=["bf16", "fp8", "fp8-pertoken", "fp8-directcast",
+                             "mxfp8", "mxfp4", "nvfp4"])
+    # Combine-path precision/quant is a SEPARATE axis from dispatch (review: don't let
+    # dispatch_dtype=fp8 imply the whole EP path is quantized). Today every backend combines
+    # bf16 with no quant (combine_quant_mode=none); a future quantized combine (e.g. ROCm/MoRI
+    # PR311) sets these WITHOUT changing --dispatch-dtype. Defaults reproduce today exactly;
+    # capability.py gates unsupported values.
+    ap.add_argument("--combine-dtype", default="bf16", choices=["bf16", "fp8", "nvfp4"],
+                    help="combine OUTPUT precision (bf16 default; fp8=MXFP8 e4m3+e8m0, nvfp4=e2m1 — "
+                         "quantized combine via flashinfer-main moe_a2a_combine output_dtype)")
+    ap.add_argument("--combine-quant-mode", default="none",
+                    help="combine quantization mode; 'none' today. capability.py rejects unwired modes")
+    # Activation VALUE distribution of expert inputs (goal P2). normal = seeded N(0,1) (the only
+    # latency-relevant one under bf16 combine — bf16 is value-independent); the others stress a
+    # FUTURE quantized combine's scale computation (amax/outliers/saturation). routing.py owns
+    # the generators; capability.py gates which a backend/mode admits.
+    ap.add_argument("--activation-profile", default="normal",
+                    choices=["normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation"],
+                    help="value distribution of expert inputs (routing.ACTIVATION_PROFILES)")
+    # uniform = realistic top-k (fan-out ≈5.3 over EP8); balanced = load-equalized,
+    # one-expert-per-rank (fan-out = ep_size); balanced-rank-local = fan-out 1 (min
+    # comm) edge case; zipf = skewed; hotspot-single = adversarial receive concentration.
+    ap.add_argument("--routing", default="uniform",
+                    choices=["uniform", "balanced", "balanced-rank-local", "zipf",
+                             "hotspot-single"])
+    # Retained in legacy identity only. Synthetic temporal modes were removed because separate
+    # processes cannot measure adaptation across steps.
+    ap.add_argument("--routing-step", type=int, default=0,
+                    help="legacy field; promoted and manual synthetic runs require 0")
+    # Uneven source-token allocation (goal P2 "support uneven source-token allocation"): per-rank
+    # token counts vary (global may not divide EP); empty-source-rank case included. Default 'none'
+    # = every rank gets exactly the ladder T (perfectly even; source-token CV 0) — no behavior
+    # change for existing runs. 'linear' ramps counts ~0.5T..1.5T; 'empty-rank' zeroes rank 0.
+    ap.add_argument("--uneven-tokens", default="none", choices=["none", "linear", "empty-rank"],
+                    help="per-rank source-token allocation skew (records source_token_stats)")
+    # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical
+    # slots + balanced-place so per-rank load equalizes. A pure routing-trace transform
+    # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew.
+    ap.add_argument("--eplb", action="store_true",
+                    help="apply EPLB expert replication/placement to the routing trace")
+    ap.add_argument("--num-redundant-experts", type=int, default=32,
+                    help="EPLB: redundant physical expert slots (rounded up to a multiple of ep_size)")
+    # Canonical serialized workload (goal P1): consume pre-generated trace bytes instead of the
+    # seeded runtime generator, so a result is provably the SAME workload as another machine's
+    # (checksum match). Points at a dir of <workload_id>.npz/.manifest.json (make_workloads.py).
+    ap.add_argument("--workload-dir", default="",
+                    help="dir of canonical workload traces; empty = seeded runtime generation (dev)")
+    ap.add_argument("--case-id", default="")
+    ap.add_argument("--suite", default="")
+    ap.add_argument("--workload-name", default="")
+    ap.add_argument("--required-publication", default="")
+    ap.add_argument("--mode", default="normal", choices=["normal", "ll"],
+                    help="kernel path: normal or low-latency (LL); LL is backend-dependent")
+    # Measurement contract — the EXPLICIT timing boundary every adapter must conform to
+    # (review #3: adapters must not each decide their own boundary). Backends declare
+    # SUPPORTED_CONTRACTS; run_ep.py rejects an unsupported one.
+    #   layout-and-dispatch-v1   — dispatch timing INCLUDES routing-layout generation
+    #                              (the only contract MoRI can honor; its layout is
+    #                              computed inside the kernel and cannot be hoisted).
+    #   cached-layout-comm-only-v1 — layout computed ONCE untimed; dispatch times pure
+    #                              comm (DeepEP-only; matches DeepEP's own benchmark).
+    # Combine excludes staging in BOTH (staging is untimed for every backend).
+    #   runtime-visible-v1       — the serving-realistic boundary: dispatch starts from what the
+    #                              runtime has right after routing and INCLUDES required quant /
+    #                              scale creation / layout / packing / comm / sync; combine starts
+    #                              from expert outputs and ends when token outputs are consumable.
+    #                              (DeepEP-only today; the FP8 cast moves INSIDE the timed window.)
+    ap.add_argument("--measurement-contract", default="layout-and-dispatch-v1",
+                    choices=["layout-and-dispatch-v1", "cached-layout-comm-only-v1",
+                             "runtime-visible-v1"])
+    ap.add_argument("--num-sms", type=int, default=24,
+                    help="DeepEP comm-SM budget in 'default' resource-mode (MoRI uses block_num/warps)")
+    # Resource regime (review: budgets were neither normalized nor tuned):
+    #   normalized — each backend restricted to ~sm_fraction of its device's units
+    #                (DeepEP set_num_sms(frac·SMs); MoRI block_num≈frac·CUs). Fraction-
+    #                based, recorded — an approximate apples-to-apples, not identical work.
+    #   tuned      — each backend's recommended/auto launch config (best achievable).
+    #   default    — DeepEP --num-sms / MoRI 80 blocks (the bring-up budget).
+    ap.add_argument("--resource-mode", default="normalized",
+                    choices=["normalized", "tuned", "default"])
+    ap.add_argument("--sm-fraction", type=float, default=0.18,
+                    help="normalized mode: fraction of device SMs/CUs dedicated to comms (~24/132)")
+    ap.add_argument("--num-ep-groups", type=int, default=1,
+                    help="concurrent EP groups; >1 is REJECTED (real subgroup PGs unimplemented)")
+    ap.add_argument("--seed", type=int, default=67)
+    # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks +
+    # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us
+    # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within
+    # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless.
+    ap.add_argument("--warmup", type=int, default=WARMUP_ITERS_PER_TRIAL,
+                    help=f"untimed full roundtrips before each trial/point; fixed by "
+                         f"{SAMPLING_CONTRACT} to {WARMUP_ITERS_PER_TRIAL}")
+    ap.add_argument("--iters", type=int, default=TIMED_ITERS_PER_TRIAL,
+                    help=f"timed iterations per trial; fixed by {SAMPLING_CONTRACT} to "
+                         f"{TIMED_ITERS_PER_TRIAL}")
+    ap.add_argument("--trials", type=int, default=TRIALS_PER_POINT,
+                    help=f"timed trials; fixed by {SAMPLING_CONTRACT} to {TRIALS_PER_POINT}")
+    ap.add_argument("--allow-unknown-provenance", action="store_true",
+                    help="permit a run with unpinned backend commit/version (default: fail)")
+    # Anomaly waiver (goal P1: roundtrip/isolated_sum threshold -> diagnostic unless explicitly
+    # waived). Without this, a measured roundtrip implausibly larger/smaller than its components
+    # (e.g. the open LL-FP8 anomaly) demotes the result to 'diagnostic'. Pass to keep it
+    # comparable-experimental/official AFTER the cause is understood + documented.
+    ap.add_argument("--waive-anomaly", action="store_true",
+                    help="do not let a flagged timing anomaly demote publication_status to diagnostic")
+    ap.add_argument("--roundtrip-anomaly-threshold", type=float, default=3.0,
+                    help="roundtrip p99 > threshold x isolated_sum p99 is flagged as an anomaly")
+    # provenance / output
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--comparison-class", default="standardized")
+    # Structured placement metadata (goal P2 topology): GPUs/node + scale-up domain + placement
+    # kind let routing locality (local/same-node/cross-domain copy fractions) be computed and let
+    # packed/striped/adversarial be distinguished. gpus-per-node=0 -> single node (= ep_size).
+    ap.add_argument("--gpus-per-node", type=int, default=0)
+    ap.add_argument("--scale-up-domain", type=int, default=0, help="0 = gpus_per_node*ep (one domain)")
+    ap.add_argument("--placement", default="packed",
+                    choices=["packed", "striped", "runtime-native", "adversarial"])
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Resolve the tokens-per-rank sweep points and split them against a buffer cap.
+
+    A non-blank `spec` (integers separated by commas and/or whitespace) takes precedence;
+    otherwise DECODE_LADDER is used for phase "decode" and PREFILL_LADDER for any other
+    phase. Non-positive entries are discarded, duplicates collapse, and the result is
+    ascending.
+
+    Returns (kept, dropped): `kept` holds the points <= cap and `dropped` the points
+    above it, so the caller can report what was cut instead of truncating silently.
+    With cap=None nothing is dropped.
+    """
+    if not (spec and spec.strip()):
+        requested = DECODE_LADDER if phase == "decode" else PREFILL_LADDER
+    else:
+        requested = [int(token) for token in spec.replace(",", " ").split()]
+    points = sorted(set(t for t in requested if t > 0))
+    if cap is None:
+        return points, []
+    kept, dropped = [], []
+    for t in points:
+        (kept if t <= cap else dropped).append(t)
+    return kept, dropped
+
+
+def conditioning_ladder(ladder: list[int], gradual: bool) -> list[int]:
+    """Pick the untimed warm-up shapes for a sweep; none of them become scored points.
+
+    gradual=True  -> powers of two (1, 2, 4, ...) strictly below the last ladder entry,
+                     followed by that last entry itself.
+    gradual=False -> the ladder entries no larger than min(last entry, 128); if none
+                     qualify, just the first ladder entry.
+    An empty ladder yields an empty list.
+    """
+    if not ladder:
+        return []
+    top = ladder[-1]
+    if gradual:
+        ramp = []
+        step = 1
+        while step < top:
+            ramp.append(step)
+            step *= 2
+        ramp.append(top)
+        return ramp
+    ceiling = min(top, 128)
+    small = [shape for shape in ladder if shape <= ceiling]
+    return small if small else [ladder[0]]
+
+
+def sampling_contract_error(iters: int, trials: int, warmup: int) -> str | None:
+    """Check (iters, trials, warmup) against the fixed sampling profile.
+
+    Returns None when the triple equals (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT,
+    WARMUP_ITERS_PER_TRIAL); otherwise returns a user-facing message naming the
+    required and the observed values plus the resulting timed-sample count.
+    """
+    required = (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL)
+    if (iters, trials, warmup) == required:
+        return None
+    req_iters, req_trials, req_warmup = required
+    # A sample count is only meaningful when both factors are positive.
+    sample_count = iters * trials if iters > 0 and trials > 0 else 'invalid'
+    return (f"{SAMPLING_CONTRACT} requires exactly iters:trials:warmup="
+            f"{req_iters}:{req_trials}:{req_warmup} on every SKU/backend; got "
+            f"{iters}:{trials}:{warmup} "
+            f"({sample_count} timed samples)")
+
+
+def source_token_counts(nominal_T: int, ep_size: int, mode: str) -> list[int]:
+    """Per-rank source-token counts for the uneven-allocation study (goal P2).
+
+    The result is a pure function of the arguments, so every rank computes the same list.
+
+    mode == "none" (or ep_size <= 1): every rank gets nominal_T.
+    mode == "empty-rank": rank 0 gets 0; each remaining rank gets
+        max(1, nominal_T * ep_size // (ep_size - 1)). The integer division drops any
+        remainder, so the global total is only approximately nominal_T * ep_size.
+    any other mode ("linear"): a ramp from ~0.5*nominal_T on rank 0 to ~1.5*nominal_T on
+        the last rank (mean ≈ nominal_T), each entry rounded and floored at 1 token.
+
+    The total need not be divisible by ep_size.
+    """
+    # The single-rank (and degenerate ep_size <= 0) case is settled here, so the branches
+    # below may divide by (ep_size - 1) without further guards.
+    if mode == "none" or ep_size <= 1:
+        return [nominal_T] * ep_size
+    if mode == "empty-rank":
+        # rank 0 empty; spread ep_size*T across the remaining ranks (keeps ~global constant).
+        total = nominal_T * ep_size
+        per = max(1, total // (ep_size - 1))
+        return [0] + [per] * (ep_size - 1)
+    # linear ramp from ~0.5T to ~1.5T across ranks (mean ≈ T). At least 1 token/rank.
+    lo, hi = 0.5 * nominal_T, 1.5 * nominal_T
+    return [max(1, int(round(lo + (hi - lo) * r / (ep_size - 1)))) for r in range(ep_size)]
+
+
+def _stats_vec(xs: list[int]) -> dict:
+    """Summarize a per-rank count vector (source tokens or load) without dumping it.
+
+    Returns a dict with `min`, `mean` (3 decimals), `max`, `cv` (population standard
+    deviation divided by the mean, 4 decimals; 0.0 when the mean is not positive),
+    `empty_ranks` (number of zero entries), `total`, and `ranks` (len(xs)).
+
+    An empty vector yields all-zero fields, including ranks == 0.
+    """
+    ranks = len(xs)
+    if ranks == 0:
+        # Report the true length: a divide-by-zero guard must not leak out as ranks=1.
+        return {"min": 0, "mean": 0.0, "max": 0, "cv": 0.0,
+                "empty_ranks": 0, "total": 0, "ranks": 0}
+    total = sum(xs)
+    mean = total / ranks
+    var = sum((x - mean) ** 2 for x in xs) / ranks
+    cv = (var ** 0.5 / mean) if mean > 0 else 0.0
+    return {"min": min(xs), "mean": round(mean, 3),
+            "max": max(xs), "cv": round(cv, 4),
+            "empty_ranks": sum(1 for x in xs if x == 0), "total": total, "ranks": ranks}
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """Return the element of `xs` at the rounded position q/100 * (len - 1) of the sorted data.
+
+    The position is rounded with the built-in round() and clamped into the valid index
+    range, so q outside 0..100 selects the minimum or maximum. An empty input gives NaN.
+    """
+    if not xs:
+        return float("nan")
+    ordered = sorted(xs)
+    last = len(ordered) - 1
+    position = int(round(q / 100.0 * last))
+    return ordered[min(last, max(0, position))]
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """Per-iteration CUDA-event latencies (µs) for THIS rank.
+
+    Without `pre`: times `fn()`. With `pre`: runs `pre()` UNTIMED each iteration (sync
+    before the start event so its GPU work can't bleed in), then times `fn(pre_result)`
+    — how combine is isolated when it consumes the dispatch state and needs a fresh
+    untimed dispatch+stage before every sample. Returns the raw per-iteration series;
+    the caller reduces across ranks per iteration before percentiling.
+    """
+    # `torch` is handed in by the caller rather than imported here; this function only
+    # touches torch.cuda.Event and torch.cuda.synchronize.
+    def sample():
+        # One timed iteration: optional untimed pre(), then fn bracketed by two events.
+        arg = pre() if pre is not None else None
+        if pre is not None:
+            torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True)
+        e = torch.cuda.Event(enable_timing=True)
+        s.record()
+        fn(arg) if pre is not None else fn()
+        e.record()
+        # Synchronize before reading the elapsed time between the two events.
+        torch.cuda.synchronize()
+        return s.elapsed_time(e) * 1000.0  # ms -> us
+
+    # Untimed warm-up with the same call shape as a sample but no events. max(0, ...)
+    # makes a negative `warmup` behave like zero.
+    for _ in range(max(0, warmup)):
+        if pre is not None:
+            a = pre()
+            torch.cuda.synchronize()
+            fn(a)
+        else:
+            fn()
+        # sync EACH warmup iteration, not just once after the loop: the measured-roundtrip fn
+        # interleaves dispatch+combine on a backend's persistent comm buffer, so back-to-back
+        # un-synced warmup iterations let iter N+1's dispatch race iter N's combine (CUDA abort
+        # on a rank -> NCCL-watchdog SIGABRT). Cheap (warmup is small); timed samples already sync.
+        torch.cuda.synchronize()
+    # One latency per timed iteration, in execution order.
+    return [sample() for _ in range(iters)]
+
+
+def comparison_key(meta: dict) -> str:
+    """Derive the 16-hex-character key that decides which rows may share a curve.
+
+    Only the FIXED configuration feeds the key: op, backend, mode, phase, ep_size, nodes,
+    resource_mode (taken as "default" when absent), topology_class, comparison_class,
+    measurement_contract and the key-sorted JSON of `shape`. tokens_per_rank is the
+    x-axis and is deliberately left out, so rows that differ in any fixed field get
+    distinct keys and are never silently overlaid.
+    """
+    lead = [meta[name] for name in ("op", "backend", "mode", "phase")]
+    scale = [str(meta["ep_size"]), str(meta["nodes"])]
+    tail = [meta[name] for name in ("topology_class", "comparison_class", "measurement_contract")]
+    material = "|".join([
+        *lead,
+        *scale,
+        meta.get("resource_mode", "default"),
+        *tail,
+        json.dumps(meta["shape"], sort_keys=True),
+    ])
+    digest = hashlib.sha256(material.encode())
+    return digest.hexdigest()[:16]
+
+
+def _reduce_vec(torch, dist, device, vals, op):
+    """All-reduce a list of numbers element-wise across ranks with `op`; return floats.
+
+    The values travel as one float64 tensor on `device`.
+    """
+    buf = torch.tensor(vals, device=device, dtype=torch.float64)
+    dist.all_reduce(buf, op=op)
+    return list(map(float, buf.tolist()))
+
+
+def _reduce_int(torch, dist, device, v: int, op) -> int:
+    """All-reduce a single integer across ranks with `op` and return the result.
+
+    The value travels as a one-element int64 tensor on `device`.
+    """
+    cell = torch.tensor([int(v)], dtype=torch.int64, device=device)
+    dist.all_reduce(cell, op=op)
+    reduced = cell.item()
+    return int(reduced)
+
+
+def _allgather_floats(torch, dist, device, v: float) -> list[float]:
+    """Collect one scalar from every rank into a list with one entry per rank.
+
+    Used for per-rank diagnostics (straggler identification, rank spread), which an
+    all_reduce cannot provide because it collapses the per-rank values into one.
+    """
+    mine = torch.tensor([float(v)], device=device, dtype=torch.float64)
+    slots = [torch.zeros(1, device=device, dtype=torch.float64)
+             for _ in range(dist.get_world_size())]
+    dist.all_gather(slots, mine)
+    return [float(slot.item()) for slot in slots]
+
+
+def _histogram(xs: list[float], nbins: int = 40) -> dict:
+    """Compact equal-width histogram of pooled cross-rank-max samples.
+
+    Meant for p99-spike debugging without storing every sample.
+
+    Returns {"n": 0} for empty input. Otherwise returns `n`, `min` and `max` (3 decimals),
+    `bins`, and `counts`, where len(counts) == bins in every case. The largest sample is
+    placed in the last bin. When all samples are equal there is no width to divide, so
+    the result is a single bin (bins == 1) holding every sample.
+    """
+    if not xs:
+        return {"n": 0}
+    lo, hi = min(xs), max(xs)
+    if hi <= lo:
+        # Zero-width range: keep `bins` consistent with `counts` and round min/max exactly
+        # like the general case so consumers can treat both shapes uniformly.
+        return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3),
+                "bins": 1, "counts": [len(xs)]}
+    counts = [0] * nbins
+    span = hi - lo
+    for x in xs:
+        # x == hi maps to index nbins; clamp it into the last bin.
+        b = min(nbins - 1, int((x - lo) / span * nbins))
+        counts[b] += 1
+    return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts}
+
+
+def _provenance_unknown(prov: dict) -> list[str]:
+    """List the keys of `prov` whose string value is "unknown" (any case, padding ignored).
+
+    Non-string values are never flagged. Keys come back in the dict's iteration order.
+    """
+    flagged = []
+    for key, value in prov.items():
+        if not isinstance(value, str):
+            continue
+        if value.strip().lower() == "unknown":
+            flagged.append(key)
+    return flagged
+
+
+# Keys that must all be present and truthy in the run's `git_run` mapping before
+# _provenance_complete() treats provenance as complete.
+REQUIRED_GIT_RUN_FIELDS = ("repo", "run_id", "run_attempt", "ref", "source_sha", "job", "artifact")
+# Backend name -> provenance fields that must hold a resolved value; consulted by
+# backend_provenance_issues(). A backend that is not listed has no required fields.
+REQUIRED_BACKEND_PROVENANCE = {
+    "deepep": ("deepep_version", "deepep_commit"),
+    "deepep-hybrid": ("deepep_commit", "branch"),
+    "flashinfer": ("flashinfer_version", "flashinfer_commit", "flashinfer_stack"),
+    "uccl": ("uccl_version", "uccl_commit"),
+    "mori": ("mori_commit",),
+    "nccl-ep": ("nccl_version",),
+}
+
+
+def _resolved_provenance_value(field: str, value) -> bool:
+    """Decide whether a provenance value counts as pinned rather than a placeholder.
+
+    The value is stringified, stripped and lower-cased first. It is rejected when it is
+    None, blank, one of unknown / none / null / n/a / ?, or contains "capture-failed".
+    For fields whose name ends in "_commit", bare branch-like names (main, hybrid-ep,
+    uccl, pkg-uccl) and values ending in -unknown / -none / -main / -hybrid-ep are
+    rejected as well. Anything else is accepted.
+    """
+    if value is None:
+        return False
+    token = str(value).strip().lower()
+    placeholders = ("", "unknown", "none", "null", "n/a", "?")
+    if token in placeholders or "capture-failed" in token:
+        return False
+    if not field.endswith("_commit"):
+        return True
+    # A commit field has to name a commit, not a branch or a package label.
+    branch_names = ("main", "hybrid-ep", "uccl", "pkg-uccl")
+    branch_suffixes = ("-unknown", "-none", "-main", "-hybrid-ep")
+    return token not in branch_names and not token.endswith(branch_suffixes)
+
+
+def backend_provenance_issues(backend: str, prov: dict) -> list[str]:
+    """Return the required provenance fields of `backend` that are not resolved in `prov`.
+
+    Requirements come from REQUIRED_BACKEND_PROVENANCE; a backend with no entry there
+    has none, so the result is empty. Field order follows the requirement tuple.
+    """
+    required = REQUIRED_BACKEND_PROVENANCE.get(backend, ())
+    unresolved = []
+    for name in required:
+        if _resolved_provenance_value(name, prov.get(name)):
+            continue
+        unresolved.append(name)
+    return unresolved
+
+
+def _provenance_complete(prov: dict, args) -> bool:
+    """True only when every provenance requirement for the run is satisfied.
+
+    All of the following must hold: no value in `prov` is the string "unknown", the
+    backend named by args.backend has no unresolved required field, args.image_digest
+    is non-empty, and args.git_run carries a truthy value for every key in
+    REQUIRED_GIT_RUN_FIELDS. Missing attributes on `args` count as absent.
+    """
+    git_run = getattr(args, "git_run", None) or {}
+    if _provenance_unknown(prov):
+        return False
+    if backend_provenance_issues(getattr(args, "backend", ""), prov):
+        return False
+    if not getattr(args, "image_digest", ""):
+        return False
+    return all(git_run.get(key) for key in REQUIRED_GIT_RUN_FIELDS)
+
+
+def _resource_profile(prov: dict, args) -> dict:
+    """Translate backend-specific provenance into the backend-independent resource record.
+
+    The record (goal P3) carries the requested vs achieved comm-unit fraction, the
+    configured units / warps, and a resource + conformance classification. Units are
+    `num_sms` when the provenance has it, otherwise `block_num`; the device size comes
+    from `device_sms`, falling back to `device_cus`.
+
+    Classification precedence:
+      1. fixed-kernel  — `low_latency_mode` is set or `tuned_source` mentions
+                         "fixed-kernel": occupancy is not a normalized/tuned budget, so
+                         conformance is not-applicable.
+      2. floored       — `block_num_floored`: resource-constrained / minimum-functional.
+      3. normalized    — resource-constrained / resource-conforming.
+      4. tuned         — backend-tuned; best-known unless `tuned_source` mentions "default".
+      5. anything else — backend-default.
+    Only un-floored resource-constrained runs are marked pareto_eligible.
+    """
+    num_sms = prov.get("num_sms")
+    uses_sms = num_sms is not None
+    configured = num_sms if uses_sms else prov.get("block_num")
+    device_units = prov.get("device_sms") or prov.get("device_cus")
+
+    normalized = args.resource_mode == "normalized"
+    requested = args.sm_fraction if normalized else None
+    if configured and device_units:
+        achieved = configured / device_units
+    else:
+        achieved = None
+
+    floored = bool(prov.get("block_num_floored"))
+    tuned_source_text = str(prov.get("tuned_source", ""))
+    fixed_kernel = bool(prov.get("low_latency_mode")) or "fixed-kernel" in tuned_source_text
+
+    if fixed_kernel:
+        resource_class, conformance = "fixed-kernel", "not-applicable"
+    elif floored:
+        # The backend needed MORE units than the requested budget allowed.
+        resource_class, conformance = "resource-constrained", "minimum-functional"
+    elif normalized:
+        resource_class, conformance = "resource-constrained", "resource-conforming"
+    elif args.resource_mode == "tuned":
+        resource_class = "backend-tuned"
+        conformance = "backend-default" if "default" in tuned_source_text else "best-known"
+    else:
+        resource_class, conformance = "backend-default", "backend-default"
+
+    # Tolerance check applies to normalized runs only: did we land on the requested fraction?
+    tolerance = 0.10
+    if requested:
+        within_tol = achieved is not None and abs(achieved - requested) <= tolerance
+    else:
+        within_tol = None
+
+    persistent = (prov.get("num_nvl_bytes") or prov.get("num_rdma_bytes")
+                  or prov.get("heap_size"))
+    return {
+        "comm_units_kind": "sm" if uses_sms else "cu_block",
+        "requested_fraction": requested,
+        "configured_units": configured,
+        "device_units": device_units,
+        "achieved_fraction": round(achieved, 4) if achieved else None,
+        "warps_dispatch": prov.get("dispatch_warps"),
+        "warps_combine": prov.get("combine_warps"),
+        "qps_per_rank": prov.get("num_qps_per_rank"),
+        "persistent_bytes": persistent,
+        "tuned_source": prov.get("tuned_source"),
+        # One of: fixed-kernel | resource-constrained | backend-tuned | backend-default.
+        # Everything except resource-constrained is kept out of Pareto comparisons.
+        "resource_class": resource_class,
+        "conformance_class": conformance,
+        "tolerance": tolerance,
+        "target_achieved_within_tol": within_tol,
+        "nonconforming": floored,
+        "fixed_kernel": fixed_kernel,
+        "pareto_eligible": (resource_class == "resource-constrained" and not floored),
+    }
+
+
+def _derive_publication_status(v: dict) -> str:
+    """Compute the publication state from the validity dimensions (goal P1).
+
+    No caller may hand-label a result 'official'; every gate below has to be earned.
+
+    Gates, in order:
+      failed      — execution did not complete.
+      invalid     — semantic correctness failed, the measurement is non-conformant, or the
+                    workload identity is inconsistent.
+      diagnostic  — resource-nonconforming, a non-conformant (or missing) sampling basis,
+                    an un-waived anomaly, or a workload identity that is not some form
+                    of "consistent".
+      official    — sound, provenance complete and the workload is canonical-serialized.
+      comparable-experimental — sound but missing a publication requirement.
+    """
+    if v["execution_status"] != "complete":
+        return "failed"
+
+    hard_gates_pass = (v["semantic_correctness"] == "pass"
+                       and v["measurement_conformance"] == "conformant"
+                       and v["workload_identity"] != "inconsistent")
+    if not hard_gates_pass:
+        return "invalid"
+
+    # Correctness and measurement conformance are settled above, so soundness now rests
+    # on the workload identity alone.
+    sound = v["workload_identity"].startswith("consistent")
+
+    # Any of these keeps an otherwise valid run out of the comparable/official tiers:
+    # not a fair cross-platform point, a different sample basis, or a flagged timing
+    # anomaly (validity.anomaly_free already reflects an explicit waiver).
+    demoted = (v["resource_conformance"].endswith("nonconforming")
+               or v.get("sampling_conformance") != "conformant"
+               or not v.get("anomaly_free", True))
+    if demoted or not sound:
+        return "diagnostic"
+
+    if v["provenance_complete"] and v["workload_source"] == "canonical-serialized":
+        return "official"
+    return "comparable-experimental"   # measurement sound, missing a publication requirement
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+    """Drive the source-tokens-per-rank sweep for one fully-specified line."""
+    sampling_error = sampling_contract_error(args.iters, args.trials, args.warmup)
+    if sampling_error:
+        if rank == 0:
+            print(f"ERROR: {sampling_error}")
+        return 2
+    import routing  # torch-based; imported lazily so the module byte-compiles without torch
+    import eplb     # stdlib planner + torch remap (the EPLB transform)
+
+    ep_size = world_size  # num_ep_groups removed (was metadata-only; no real subgroups)
+    # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
+    # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
+    # experts then remapped to physical (build_trace), so the whole sweep runs over the
+    # balanced physical placement with no adapter change.
+    eplb_on = getattr(args, "eplb", False)
+    num_logical = getattr(args, "num_logical_experts", args.experts)
+    if args.experts % ep_size != 0:
+        if rank == 0:
+            print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})")
+        return 2
+    experts_per_rank = args.experts // ep_size
+    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)
+
+    # Provenance gate (review #1): refuse a comparison run with unpinned backend info.
+    unknown = _provenance_unknown(backend.backend_provenance)
+    if unknown and not args.allow_unknown_provenance:
+        if rank == 0:
+            print(f"ERROR: unpinned provenance {unknown} in {backend.backend_provenance}; "
+                  f"set the commit/version env or pass --allow-unknown-provenance.")
+        return 4
+
+    cap = backend.buffer_cap(args)
+    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+    if rank == 0 and dropped:
+        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+              f"(hidden={args.hidden}); not silently truncated.")
+    if not ladder:
+        if rank == 0:
+            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+        return 2
+    gradual_ramp = bool(getattr(backend, "needs_gradual_ramp", False))
+    # MoRI fp8 (e4m3fnuz direct-cast): the per-rank relErr gate is unstable at single-token
+    # granularity — run 28318788729 flipped a whole fp8 doc invalid on the T=1 point alone
+    # while the values were fine (rank-0 max_rel 3e-4).
+    # A requested T=1 is not scored/emitted at fp8. Warm-only ramp points never enter `ladder`.
+    unscored_T = set()
+    if (gradual_ramp
+            and str(getattr(args, "dispatch_dtype", "bf16")).startswith("fp8")):
+        unscored_T = {t for t in ladder if t < 2}
+        if rank == 0 and unscored_T:
+            print(f"NOTE: {backend.name} fp8: T<2 ramp points run UNSCORED "
+                  f"(single-token relErr instability)")
+
+    MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM
+    # temporal snapshot index — defined BEFORE the EPLB block (which builds a reference trace with
+    # step=routing_step); the EPLB path runs only when eplb_on, so a late definition raised an
+    # UnboundLocalError on zipf+eplb canonical runs (caught as a preserved failed-case).
+    routing_step = int(getattr(args, "routing_step", 0))
+    if routing_step != 0:
+        if rank == 0:
+            print("ERROR: nonzero routing-step requires a future stateful trace-replay benchmark")
+        return 2
+
+    # EPLB plan (once): estimate logical load from the global logical trace at the largest
+    # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB
+    # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps
+    # to physical when the plan is present; otherwise it's the identity (logical == physical).
+    eplb_plan = None
+    if eplb_on:
+        ref_idx, _ = routing.build_global_routing(max(ladder) * ep_size, num_logical, args.topk,
+                                                  args.routing, args.seed, num_logical // ep_size,
+                                                  step=routing_step)
+        load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
+        eplb_plan = eplb.build_plan(load, args.experts, ep_size)
+        if rank == 0:
+            print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
+                  f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
+                  f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
+                  f"replicated (hottest {eplb_plan['max_replicas']}x)")
+
+    canonical = bool(getattr(args, "workload_dir", ""))
+    uneven = getattr(args, "uneven_tokens", "none")
+    if canonical and uneven != "none":
+        if rank == 0:
+            print(f"ERROR: --uneven-tokens={uneven} is incompatible with --workload-dir "
+                  f"(canonical workloads are serialized at a fixed global-token count per id); "
+                  f"use seeded-runtime for the uneven-allocation study.")
+        return 2
+    loaded_workload_ids, loaded_checksums = [], {}
+    if canonical:
+        import workload as _wl
+
+    def build_trace(gt):
+        # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
+        # provably the SAME workload as any other consuming the same files. else: seeded gen.
+        if canonical:
+            wid = _wl.compute_workload_id(args.routing, args.hidden, args.topk, num_logical, gt,
+                                          args.seed, step=routing_step)
+            idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
+            idx_l = torch.from_numpy(idx_np).to(torch.int64)
+            w = torch.from_numpy(w_np).to(torch.float32)
+            if wid not in loaded_workload_ids:
+                loaded_workload_ids.append(wid)
+                loaded_checksums[wid] = man.get("checksums")
+        else:
+            idx_l, w = routing.build_global_routing(gt, num_logical, args.topk, args.routing,
+                                                    args.seed, num_logical // ep_size, step=routing_step)
+        return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w
+
+    # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
+    # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
+    # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
+    # and is also cold-jump-safe for MoRI.
+    warm_shapes = conditioning_ladder(ladder, gradual_ramp)
+    for wt in warm_shapes:
+        # Warm-only shapes need not have canonical manifests: they are never measured or emitted.
+        wi, ww = routing.build_global_routing(
+            wt * ep_size, num_logical, args.topk, args.routing, args.seed,
+            num_logical // ep_size, step=routing_step,
+        )
+        if eplb_plan is not None:
+            wi = eplb.remap_idx(wi, eplb_plan)
+        wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
+        wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16,
+                                      profile=args.activation_profile)
+        wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
+        for _ in range(8):
+            wh = backend.dispatch(wp)
+            backend.stage(wp, wh)
+            backend.combine(wp, wh)
+    torch.cuda.synchronize()
+    try:
+        dist.barrier()
+    except Exception:
+        pass
+    import random as _random
+    elem_dispatch = elem_bytes          # fp8=1 / bf16=2 (dispatch payload element size)
+    tol = getattr(backend, "tolerance", 5e-2)
+
+    # ---- Pass 1: build the per-T problem ONCE (deterministic trace + cached layout per
+    # contract), run the correctness gate ONCE. Timing is Pass 2 (pooled over trials). ----
+    problems, gate, gts = {}, {}, {}
+    routing_hashes = set()
+    for T in ladder:
+        # Per-rank source-token counts (goal P2 uneven allocation). mode 'none' => [T]*ep,
+        # gt = T*ep, offsets = 0,T,2T,... — byte-identical to the even path. Otherwise counts
+        # vary (global may not divide ep) and rank 0 may be empty.
+        counts = source_token_counts(T, ep_size, uneven)
+        offsets = [sum(counts[:r]) for r in range(ep_size)]
+        gt = sum(counts)
+        gts[T] = gt
+        idx_g, w_g = build_trace(gt)
+        rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
+        gpn = args.gpus_per_node or ep_size
+        # placement-aware locality (goal P2): packed/striped/adversarial change which physical
+        # node/domain a rank sits on, so the local/same-node/cross-domain copy fractions differ.
+        rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T),
+                                                      gpn, args.scale_up_domain or None,
+                                                      placement=args.placement)
+        rstats["source_token_stats"] = _stats_vec(counts)
+        routing_hashes.add(rstats["routing_hash"])
+        my_off, my_cnt = offsets[rank], counts[rank]
+        idx_s = idx_g[my_off:my_off + my_cnt].contiguous()
+        w_s = w_g[my_off:my_off + my_cnt].contiguous()
+        x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16,
+                                     profile=args.activation_profile)
+        problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x)
+        h = backend.dispatch(problem)
+        backend.stage(problem, h)
+        combined = backend.combine(problem, h)
+        torch.cuda.synchronize()
+        recv_local = backend.recv_tokens(h)
+        exp, n_cmp = backend.expected(problem, h)
+        # empty source rank (my_cnt==0): nothing to reconstruct locally — gate passes vacuously.
+        if n_cmp > 0:
+            max_abs = (combined[:n_cmp].float() - exp[:n_cmp].float()).abs().max().item()
+            max_rel = max_abs / (exp[:n_cmp].float().abs().max().item() + 1e-6)
+        else:
+            max_rel = 0.0
+        problems[T] = problem
+        gate[T] = {"rstats": rstats, "recv_local": recv_local,
+                   "max_rel": max_rel, "local_ok": 1 if max_rel < tol else 0}
+
+    # ---- Pass 2: N timed trials. Token order is randomized PER TRIAL (seeded ⇒ identical
+    # on every rank, so collectives stay lock-step) so warmup/clock drift can't correlate
+    # with T. Per-iteration cross-rank MAX samples are POOLED across trials, then
+    # percentiled (review #3: p99 from one 50-iter run is just the max). MoRI keeps
+    # ascending order — it wedges on a cold jump to a large T. ----
+    disp_pool = {T: [] for T in ladder}     # pooled per-iteration cross-rank MAX (dispatch)
+    comb_pool = {T: [] for T in ladder}     # ... combine
+    rt_pool = {T: [] for T in ladder}       # ... INDEPENDENTLY-MEASURED round trip (goal P1)
+    disp_local = {T: [] for T in ladder}    # THIS rank's own dispatch samples (per-rank diag)
+    order = list(ladder)
+    rng = _random.Random(args.seed)
+    shuffle_ok = not getattr(backend, "needs_gradual_ramp", False)
+    for trial in range(args.trials):
+        if shuffle_ok:
+            rng.shuffle(order)
+        for T in order:
+            problem = problems[T]
+            # One universal conditioning schedule: immediately before every timed point, every
+            # SKU/backend executes the same number of complete dispatch->stage->combine roundtrips.
+            # Operation-specific time_us warmups stay at zero below. This reaches Blackwell's
+            # measured >=30-iteration clock floor without making MoRI execute a >=200-call burst.
+            for _ in range(args.warmup):
+                wh = backend.dispatch(problem)
+                backend.stage(problem, wh)
+                backend.combine(problem, wh)
+                torch.cuda.synchronize()
+            # roundtrip_only backends (stateful paired dispatch/combine FSM, e.g. FlashInfer
+            # MoeAlltoAll): isolated/looped dispatch timing corrupts the symmetric workspace, so
+            # ONLY the paired roundtrip is measurable. Mirror rt into disp/comb (flagged) so the
+            # schema + plot have values; isolated_sum is meaningless for these (== 2x roundtrip).
+            roundtrip_only = getattr(backend, "roundtrip_only", False)
+
+            def rt_once(p=problem):
+                hh = backend.dispatch(p)
+                backend.stage(p, hh)
+                return backend.combine(p, hh)
+
+            if roundtrip_only:
+                rt_iters = time_us(torch, lambda p=problem: rt_once(p), 0, args.iters)
+                disp_iters = comb_iters = rt_iters
+            else:
+                disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p),
+                                     0, args.iters)
+
+                def prep(p=problem):
+                    hh = backend.dispatch(p)
+                    backend.stage(p, hh)
+                    return hh
+                if backend.combine_needs_redispatch:
+                    comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+                                         0, args.iters, pre=prep)
+                else:
+                    hh = prep()
+                    comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+                                         0, args.iters)
+                # MEASURED round trip (goal P1: not a sum of percentiles): one timed region over
+                # dispatch -> stage (no-op "expert" transform) -> combine -> output ready. Captures
+                # shared sync / launch amortization / overlap that the isolated_sum cannot.
+                rt_iters = time_us(torch, lambda p=problem: rt_once(p), 0, args.iters)
+            # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
+            disp_pool[T] += _reduce_vec(torch, dist, device, disp_iters, MAX)
+            comb_pool[T] += _reduce_vec(torch, dist, device, comb_iters, MAX)
+            rt_pool[T] += _reduce_vec(torch, dist, device, rt_iters, MAX)
+            disp_local[T] += disp_iters
+
+    # ---- Pass 3: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
+    def pcts(xs):
+        return {"p50": percentile(xs, 50), "p90": percentile(xs, 90),
+                "p95": percentile(xs, 95), "p99": percentile(xs, 99)}
+    rows = []
+    all_anomalies = []                                       # contract-level anomalies (goal P1)
+    thr_rt = float(getattr(args, "roundtrip_anomaly_threshold", 3.0))
+    for T in ladder:
+        if T in unscored_T:   # ran (ramp safety) but not scored — symmetric on every rank
+            continue
+        gt = gts[T]
+        g = gate[T]
+        rstats = g["rstats"]
+        d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
+        dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
+        # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
+        # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
+        # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
+        isum = {k: dp[k] + cp[k] for k in dp}
+        recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
+        recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
+        recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
+        global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
+        max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
+        point_ok = bool(global_ok) and recv_total > 0
+        # Per-rank diagnostics: gather each rank's own dispatch median -> spread + straggler.
+        per_rank_med = _allgather_floats(torch, dist, device, percentile(disp_local[T], 50))
+        slowest_rank = max(range(len(per_rank_med)), key=lambda i: per_rank_med[i])
+        rmean = sum(per_rank_med) / len(per_rank_med)
+        # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
+        # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
+        # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
+        token_rank_copies = rstats["routed_copies"]
+        token_expert_copies = gt * args.topk
+        H = args.hidden
+        # Bandwidth semantics (goal P1 "distinguish all bandwidth concepts"): the ONLY rates we can
+        # defensibly publish are logical-payload (canonical routed bytes / latency) and backend-
+        # buffer (recv-tensor bytes / latency). algorithm/bus/wire bandwidth are NULL — EP
+        # dispatch/combine have no standard busBW model and we have no transport counters, so we
+        # must NOT imply physical NVLink/XGMI/RDMA utilization.
+        def _rate(nbytes, us):  # bytes over microseconds -> bytes per ns (1e9 B/s), 3 decimals
+            return round(nbytes / (us * 1e3), 3) if (us and us > 0) else None  # None: no/zero latency
+        disp_bytes_l = token_rank_copies * H * elem_dispatch
+        comb_bytes_l = token_rank_copies * H * 2
+        buf_disp = recv_max * H * elem_dispatch
+        buf_comb = recv_max * H * 2
+        bandwidth = {
+            "logical_payload_rate_gbps": {
+                "dispatch": _rate(disp_bytes_l, dp["p50"]), "combine": _rate(comb_bytes_l, cp["p50"]),
+                "roundtrip": _rate(disp_bytes_l + comb_bytes_l, rtp["p50"])},
+            "backend_buffer_rate_gbps": {
+                "dispatch": _rate(buf_disp, dp["p50"]), "combine": _rate(buf_comb, cp["p50"])},
+            "algorithm_bandwidth_gbps": None, "bus_bandwidth_gbps": None, "wire_utilization": None,
+            "basis": ("logical = canonical routed-payload copies x hidden x dtype / latency; "
+                      "buffer = backend recv tensor / latency; alg/bus/wire = null (no defined "
+                      "EP busBW formula, no transport counters) — NOT physical link utilization"),
+        }
+        # Contract-level anomaly checks (goal P1) — attached to the ROW and rolled into validity.
+        #   roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99 — a
+        #     chained op shouldn't be far larger than its parts (the open LL-FP8 case).
+        #   roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained
+        #     op can't finish faster than its slowest required component (sync semantics violated).
+        row_anoms = []
+        if isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]:
+            row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T,
+                              "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2),
+                              "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt})
+        floor = max(dp["p50"], cp["p50"])
+        if rtp["p50"] > 0 and floor > 0 and rtp["p50"] < 0.95 * floor:
+            row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T,
+                              "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)})
+        all_anomalies.extend(row_anoms)
+        rows.append({
+            "tokens_per_rank": T, "global_tokens": gt,
+            "dispatch": dp, "combine": cp, "roundtrip": rtp, "isolated_sum": isum,
+            # flat aliases kept for back-compat with v3 readers
+            "dispatch_us_p50": dp["p50"], "dispatch_us_p90": dp["p90"], "dispatch_us_p99": dp["p99"],
+            "combine_us_p50": cp["p50"], "combine_us_p90": cp["p90"], "combine_us_p99": cp["p99"],
+            "roundtrip_us_p50": rtp["p50"], "roundtrip_us_p90": rtp["p90"],
+            "roundtrip_us_p95": rtp["p95"], "roundtrip_us_p99": rtp["p99"],
+            "isolated_sum_us_p50": isum["p50"], "isolated_sum_us_p99": isum["p99"],
+            "samples_pooled": len(d), "trials": args.trials,
+            "percentile_interpolation": "nearest-rank",
+            "recv_tokens_max": recv_max, "recv_tokens_min": recv_min,
+            "recv_tokens_mean": recv_total / world_size, "recv_tokens_total": recv_total,
+            "per_rank_dispatch_us": {"min": min(per_rank_med), "mean": rmean,
+                                     "max": max(per_rank_med), "spread": max(per_rank_med) - min(per_rank_med),
+                                     "slowest_rank": slowest_rank},
+            # dispatch carries its dtype's element size; combine input is bf16 (2B).
+            "dispatch_logical_bytes": token_rank_copies * H * elem_dispatch,
+            "combine_logical_bytes": token_rank_copies * H * 2,
+            "byte_contracts": {
+                "token_rank_payload_copies": token_rank_copies,
+                "token_expert_payload_copies": token_expert_copies,
+                "dispatch_bytes": token_rank_copies * H * elem_dispatch,
+                "combine_bytes": token_rank_copies * H * 2,
+                "fp8_scale_bytes": (token_rank_copies * (H // 128) * 4) if elem_dispatch == 1 else 0,
+                "routing_index_bytes": token_expert_copies * 4,   # int32 topk_idx
+                "gate_weight_bytes": token_expert_copies * 4,     # f32 topk_weights
+            },
+            "byte_contract": "logical-routed-payload-v1",
+            # throughput from the MEASURED round trip ONLY (not isolated_sum).
+            "roundtrip_tokens_per_second": (gt / (rtp["p50"] * 1e-6)) if rtp["p50"] > 0 else None,
+            "raw_samples": {"dispatch": _histogram(d), "combine": _histogram(c), "roundtrip": _histogram(rt)},
+            # distinguished bandwidth concepts (goal P1) — logical + buffer real, alg/bus/wire null.
+            "bandwidth": bandwidth,
+            # full load + fanout statistics in EVERY row (goal P2 "report full load and fanout"):
+            "fanout_mean": rstats["fanout_mean"], "fanout_max": rstats["fanout_max"],
+            "fanout_min": rstats["fanout_min"], "fanout_hist": rstats["fanout_hist"],
+            "routed_copies": rstats["routed_copies"],
+            "expert_load_min": rstats["expert_load_min"], "expert_load_max": rstats["expert_load_max"],
+            "expert_load_mean": rstats["expert_load_mean"], "expert_load_cv": rstats["expert_load_cv"],
+            "rank_load_cv": rstats["rank_load_cv"], "hotspot_ratio": rstats["hotspot_ratio"],
+            "dest_rank_load_max": rstats["dest_rank_load_max"],
+            "dest_rank_load_mean": rstats["dest_rank_load_mean"],
+            "empty_expert_count": rstats["empty_expert_count"],
+            "empty_rank_count": rstats["empty_rank_count"],
+            "rank_load_hist": rstats["rank_load_hist"],
+            "source_token_stats": rstats.get("source_token_stats"),
+            "routing_hash": rstats["routing_hash"], "locality": rstats.get("locality"),
+            "anomalies": row_anoms,
+            "correct": point_ok, "max_rel_error": max_rel,
+        })
+        if rank == 0:
+            print(f"  T={T:<5} disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} comb {cp['p50']:6.1f}/{cp['p99']:6.1f} "
+                  f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(d)} fanout={rstats['fanout_mean']:.2f} "
+                  f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
+                  f"straggler=r{slowest_rank} correct={point_ok}")
+
+    # Cross-rank workload-identity proof: every rank must have built the SAME global routing
+    # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and
+    # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing.
+    trace_sig = int(hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()[:15], 16)
+    sig_min = _reduce_int(torch, dist, device, trace_sig, MIN)
+    sig_max = _reduce_int(torch, dist, device, trace_sig, MAX)
+    routing_consistent = (sig_min == sig_max == trace_sig)
+
+    if rank != 0:
+        return 0
+
+    # status=valid requires correctness AND a proven-identical routing trace across ranks.
+    all_ok = bool(rows) and all(r["correct"] for r in rows) and routing_consistent
+
+    # ---- Multi-dimensional validity (goal P1) -> MACHINE-DERIVED publication_status. Adapters
+    # never self-label "official"; status is a pure function of these gates. ----
+    prov = backend.backend_provenance
+    provenance_complete = _provenance_complete(prov, args)
+    floored = bool(prov.get("block_num_floored"))
+    # fixed-kernel (DeepEP LL) is NOT a normalized resource-constrained run -> conformance N/A
+    # (immediate P0 "split LL fixed-kernel from normalized-resource"). Not a conformance failure.
+    fixed_kernel = bool(prov.get("low_latency_mode")) or ("fixed-kernel" in str(prov.get("tuned_source", "")))
+    resource_conformance = ("not-applicable" if fixed_kernel
+                            else "minimum-functional-nonconforming" if floored
+                            else ("resource-conforming" if args.resource_mode == "normalized"
+                                  else "backend-default" if args.resource_mode in ("tuned", "default")
+                                  else "unspecified"))
+    # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
+    if canonical and loaded_workload_ids:
+        args.workload_id = (loaded_workload_ids[0] if len(loaded_workload_ids) == 1
+                            else f"set:{len(loaded_workload_ids)}:{loaded_workload_ids[0]}")
+        args.workload_checksums = loaded_checksums
+    canonical_workload = bool(getattr(args, "workload_id", None))
+    # Activation-value identity (scaffold): today activations are seeded N(0,1) and NOT serialized,
+    # so identity is the deterministic descriptor (profile|seed|hidden|generator). When a value rig
+    # (lognormal / model-trace) lands, this becomes the byte-hash of the serialized activations.
+    activation_identity = hashlib.sha256(
+        f"{args.activation_profile}|seed={args.seed}|hidden={args.hidden}|gen=collectivex-activation-v1"
+        .encode()).hexdigest()[:16]
+    # EPLB mapping identity hash (goal P2) — over the replica placement, not just the counts.
+    eplb_mapping_hash = None
+    if eplb_plan is not None:
+        eplb_mapping_hash = hashlib.sha256(json.dumps(
+            {"phys2log": eplb_plan["phys2log"], "rank_of_phys": eplb_plan["rank_of_phys"],
+             "replicas": eplb_plan["replicas"]}, sort_keys=True).encode()).hexdigest()[:16]
+    # Anomaly roll-up (goal P1-e/f): any flagged row anomaly demotes publication_status to
+    # diagnostic, unless --waive-anomaly (set AFTER the cause is understood + documented).
+    waived = bool(getattr(args, "waive_anomaly", False))
+    anomaly_free = (len(all_anomalies) == 0) or waived
+    validity = {
+        "execution_status": "complete" if rows else "failed",
+        "semantic_correctness": "pass" if (rows and all(r["correct"] for r in rows)) else "fail",
+        "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
+        "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
+        "measurement_conformance": "conformant",   # run_ep gate rejects nonconformant pre-run
+        "sampling_conformance": "conformant",      # fixed-512-v1 gate rejects any other profile
+        "resource_conformance": resource_conformance,
+        "provenance_complete": provenance_complete,
+        # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above).
+        "anomaly_free": anomaly_free,
+    }
+    publication_status = _derive_publication_status(validity)
+
+    shape = {  # FIXED line identity (no T, no per-backend resource knobs)
+        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
+        "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
+        # DeepEP kernel generation (v1 = NVSHMEM, v2 = NCCL-Gin) — part of line identity so a V2 run
+        # is never conflated with V1 in comparison identity. Derived from deepep_version;
+        # "n-a" for non-DeepEP backends. (Existing V1 docs lack this field -> read as "v1".)
+        "kernel_gen": ("v2" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("2")
+                       else "v1" if str((backend.backend_provenance or {}).get("deepep_version", "")).startswith("1")
+                       else "n-a"),
+        # temporal snapshot + uneven allocation change the realized workload, so they are part of
+        # the line identity (fold into comparison_key). Default 0/none reproduce the prior key for
+        # non-temporal even runs in spirit (the value is recorded either way).
+        "routing_step": routing_step, "uneven_tokens": uneven,
+        # value distribution of expert inputs — part of the workload identity (review: quant
+        # combine can be value-sensitive). "normal" today; folds into comparison_key.
+        "activation_profile": args.activation_profile,
+        # Combine contract, SEPARATE from dispatch. Today bf16/none for every backend regardless
+        # of dispatch_dtype; a quant-combine backend (PR311) reports its actuals via attrs. In
+        # shape so it folds into comparison_key — a quant-combine run is never compared to a bf16 one.
+        "quant": {
+            "combine_input_dtype": getattr(backend, "combine_input_dtype", args.combine_dtype),
+            "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"),
+            "combine_output_dtype": getattr(backend, "combine_output_dtype", "bf16"),
+            "combine_quant_mode": getattr(backend, "combine_quant_mode", args.combine_quant_mode),
+            "scale_layout": getattr(backend, "scale_layout", None),
+        },
+    }
+    meta = {
+        "op": "ep-dispatch-combine", "backend": backend.name, "mode": args.mode,
+        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
+        "resource_mode": args.resource_mode,
+        "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+        # honest contract name (was the misleading "comm-only-v1": dispatch INCLUDES layout
+        # under layout-and-dispatch-v1). Adapters declare which they conform to.
+        "measurement_contract": args.measurement_contract, "shape": shape,
+        # structured placement metadata (goal P2 topology) — replaces the bare topology string.
+        "placement": {
+            "kind": args.placement, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+            "gpus_per_node": args.gpus_per_node or ep_size,
+            "scale_up_domain": args.scale_up_domain or ((args.gpus_per_node or ep_size) * 1),
+            "ranks": ep_size, "transport": args.transport,
+        },
+    }
+    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "attempt_id": os.environ.get("CX_ATTEMPT_ID", "1"),
+        "case_id": args.case_id or None, "suite": args.suite or None,
+        "workload_name": args.workload_name or None,
+        "required_publication": args.required_publication or None,
+        "runner": args.runner, "transport": args.transport,
+        # Multi-dimensional validity + machine-derived publication status (goal P1). `status`
+        # is a back-compat alias (legacy v3 readers) — publication_status is authoritative.
+        "validity": validity,
+        "publication_status": publication_status,
+        "status": "valid" if all_ok else "invalid",
+        "workload": {
+            "source": validity["workload_source"],
+            "workload_id": getattr(args, "workload_id", None),
+            "manifest_checksums": getattr(args, "workload_checksums", None),
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+            # within-run (cross-rank) identity is PROVEN here; cross-hardware identity holds
+            # only if another run records the SAME trace_signature / workload_id.
+            "cross_rank_consistent": routing_consistent,
+            # value-distribution identity of the expert inputs (scaffold; see activation_identity above).
+            "activation_profile": args.activation_profile,
+            "activation_identity": activation_identity,
+        },
+        "comparison_key": comparison_key(meta),
+        "x_axis": {"primary": "tokens_per_rank",
+                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
+        "backend_provenance": backend.backend_provenance,
+        # backend-independent resource vocabulary + conformance class (goal P3).
+        "resource_profile": _resource_profile(backend.backend_provenance, args),
+        "reproduction": {
+            "command": getattr(args, "reproduction_command", ""),
+            "distributed_launcher": getattr(args, "distributed_launcher", None),
+            "image": getattr(args, "image", "") or None,
+            "image_digest": getattr(args, "image_digest", "") or None,
+            "image_arch": getattr(args, "image_arch", None),
+            "squash_sha256": getattr(args, "squash_sha256", None),
+            "git_run": getattr(args, "git_run", None),   # repo/run/attempt/ref/sha/job/artifact
+            # redaction (goal P1): command + provenance carry NO hostnames/IPs/UUIDs/private paths;
+            # per-node env (hostnames, GPU UUIDs, NIC GUIDs) lives in a separate private env_json,
+            # excluded from public workflow artifacts and never inlined into this record.
+            "redaction": "no hostnames/IPs/UUIDs/private-paths in command or provenance",
+            "seed": args.seed, "warmup": args.warmup, "iters": args.iters,
+            "trials": args.trials, "samples_per_point": TIMED_SAMPLES_PER_POINT,
+            "sampling_contract": SAMPLING_CONTRACT,
+            "warmup_semantics": WARMUP_SEMANTICS,
+            "measurement_contract": args.measurement_contract,
+            "dispatch_dtype": args.dispatch_dtype, "mode": args.mode,
+            "combine_dtype": args.combine_dtype, "combine_quant_mode": args.combine_quant_mode,
+            "activation_profile": args.activation_profile,
+            "routing_step": routing_step, "uneven_tokens": uneven,
+            "waive_anomaly": waived,
+            "roundtrip_anomaly_threshold": thr_rt,
+            # whether (de)quantization is inside the timed window. fp8_quant_in_timing kept as a
+            # back-compat alias (dispatch-side fp8); combine_* are the quant-combine generalization
+            # (None today — no quant combine is wired). A backend sets these when it quantizes.
+            "fp8_quant_in_timing": getattr(backend, "fp8_in_timing", None),
+            "combine_quant_in_timing": getattr(backend, "combine_quant_in_timing", None),
+            "combine_dequant_in_timing": getattr(backend, "combine_dequant_in_timing", None),
+        },
+        **meta,
+        "correctness": {"passed": all_ok,
+                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
+                        "tolerance": getattr(backend, "tolerance", 5e-2), "points": len(rows),
+                        # honest scope: round-trip reconstruction + non-silent recv, NOT a full
+                        # per-token routing/ordering/weight/padding proof (review #3).
+                        "scope": "roundtrip-reconstruction-smoke-v1"},
+        "routing_identity": {   # cryptographic workload-identity proof (review #3)
+            "consistent_across_ranks": routing_consistent,
+            "trace_signature": f"{trace_sig:015x}",
+            "distinct_per_T_hashes": sorted(routing_hashes),
+        },
+        # EPLB plan + the per-rank load imbalance it removes (the headline of the zipf+EPLB
+        # comparison). enabled=False when the run did not apply EPLB.
+        # EPLB mapping IDENTITY (goal P2): logical/physical counts + a hash of the replica
+        # placement (phys2log/rank_of_phys/replicas). Two EPLB runs are only an official comparison
+        # if their mapping_hash matches; zipf vs zipf+eplb is a RECOVERY
+        # experiment, not the same raw workload.
+        "eplb": ({"enabled": True, "num_logical_experts": num_logical,
+                  "num_physical_experts": args.experts,
+                  "num_redundant": args.experts - num_logical,
+                  "imbalance_before": eplb_plan["imbalance_before"],
+                  "imbalance_after": eplb_plan["imbalance_after"],
+                  "replicated_experts": eplb_plan["replicated_experts"],
+                  "max_replicas": eplb_plan["max_replicas"],
+                  "mapping_hash": eplb_mapping_hash}
+                 if eplb_plan else {"enabled": False}),
+        "routing_profile": {
+            "routing": args.routing,
+            "fanout_mean": sum(r["fanout_mean"] for r in rows) / len(rows),
+            "fanout_max": max(r["fanout_max"] for r in rows),
+            "headline_hash": headline["routing_hash"],
+        },
+        "metrics": {   # p99 is the headline percentile (review #3); p50/p90/p95 also kept per row
+            "headline_tokens_per_rank": headline["tokens_per_rank"],
+            "headline_percentile": "p99",
+            "dispatch_us_p50": headline["dispatch_us_p50"], "dispatch_us_p99": headline["dispatch_us_p99"],
+            "combine_us_p50": headline["combine_us_p50"], "combine_us_p99": headline["combine_us_p99"],
+            "roundtrip_us_p50": headline["roundtrip_us_p50"], "roundtrip_us_p99": headline["roundtrip_us_p99"],
+            "isolated_sum_us_p50": headline["isolated_sum_us_p50"], "isolated_sum_us_p99": headline["isolated_sum_us_p99"],
+            "isolated_sum_label": "sum of isolated dispatch+combine percentiles — NOT a measured chained op",
+            "roundtrip_tokens_per_second": headline["roundtrip_tokens_per_second"],
+        },
+        # phase semantics (goal P2): decode/prefill are regimes with distinct serving meaning, not
+        # just ladder aliases — a point is one MoE layer / one step / one collective.
+        "phase_profile": PHASE_PROFILE.get(args.phase, {"regime": args.phase}),
+        # source-token allocation across ranks (goal P2 uneven allocation). 'none' = even.
+        "source_allocation": {
+            "mode": uneven, "routing_step": routing_step,
+            "note": ("even — every rank gets the ladder T (global = T*ep_size)" if uneven == "none"
+                     else "uneven — per-rank source-token counts vary; see rows[].source_token_stats "
+                          "(global may not divide ep_size; empty-source-rank possible)"),
+        },
+        # contract-level timing anomalies (goal P1) — aggregate of the per-row flags; demotes
+        # publication_status to diagnostic unless --waive-anomaly (validity.anomaly_free).
+        "anomalies": all_anomalies,
+        "anomaly_summary": {"count": len(all_anomalies), "waived": waived,
+                            "types": sorted({a["type"] for a in all_anomalies})},
+        "rows": rows,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"{backend.name} ep-dispatch-combine [{args.phase}/{args.mode}/{args.measurement_contract}]: "
+          f"status={doc['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
+          f"headline T={headline['tokens_per_rank']} disp_p99={headline['dispatch_us_p99']:.1f}us "
+          f"-> {args.out}")
+    return 0 if all_ok else 1
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 0000000000..8c40daaf32
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode.
+
+The harness owns the deterministic shared routing trace and the comm-only timing;
+this file owns MoRI's API and the ionic_rdma-fabric constraints found on MI355X:
+the whole symmetric heap is one RDMA MR
+capped at ~4 GiB (hold at 2 GiB; bound buffers via max_num_inp_token_per_rank ⇒
+buffer_cap); combine() resets recv_num (read it before combine; compare only the
+first T rows); and the post-shmem_finalize teardown asserts (finalize hard-exits).
+
+`make_problem` now materializes the harness-provided rank slice, so MoRI honors the
+requested routing (it is no longer always-uniform) and runs the identical workload to
+the NVIDIA SKUs. combine_needs_redispatch=True: combine consumes recv_num, so the
+harness re-dispatches (untimed) before each timed combine sample.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set BEFORE
+# `import mori`. 2 GiB registers on the MI355X ionic_rdma NICs; larger fails.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the AMD MoRI image
+    print("ERROR: mori import failed — needs the AMD MoRI image "
+          f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr)
+    raise
+
+# e4m3fnuz (the ROCm-native fp8) finite max. AMD's "fnuz" e4m3 is finite-only (no Inf) with an unsigned
+# zero (no -0; that encoding is NaN) and saturates at 240.0 — the dispatch fp8 cast scales each block's amax onto it.
+_FP8_FNUZ_MAX = 240.0
+_FP8_BLOCK = 128  # MoRI/DeepSeek blockwise fp8: one scale per 128-elem hidden block (7168%128==0)
+
+
+def _mori_quant_introspect():
+    """Describe MoRI's quant API (enum members + ctor/dispatch signatures + quant/scale helpers).
+
+    FNUZ fp8 dispatch on MoRI keys off EpDispatchCombineConfig.quant_type, which PR311 extended with
+    QuantType::Fp8BlockwiseQuant — but how that value is EXPOSED to Python (enum attr vs accepted
+    string vs int) differs by build. We print this to stderr at construction so a GHA run's log is
+    self-documenting: even if the run wedges or the quant_type guess is wrong, the next iteration has
+    MoRI's exact surface without needing interactive SSH (which stalls on the shared cluster).
+
+    Returns a dict (config_sig, dispatch_sig, combine_sig, quant_surface, kernel_type_surface,
+    ll_surface); a failed signature or member lookup is recorded as "<err ...>" instead of raising."""
+    import inspect
+
+    def _sig(owner, attr):
+        # str(signature) of mori.ops.<owner>.<attr>, or an "<err ...>" marker if it can't be resolved
+        try:
+            return str(inspect.signature(getattr(getattr(mori.ops, owner), attr)))
+        except Exception as e:
+            return f"<err {e!r}>"
+
+    def _members(obj):
+        # public attr name -> int value (enum members) or, when not int-convertible, its type name
+        out = {}
+        for m in (a for a in dir(obj) if not a.startswith("_")):
+            try:
+                val = getattr(obj, m)
+            except Exception as e:  # dir() can list attrs whose lookup raises; keep the probe alive
+                out[m] = f"<err {e!r}>"
+                continue
+            try:
+                out[m] = int(val)
+            except Exception:
+                out[m] = type(val).__name__
+        return out
+
+    ops = getattr(mori, "ops", None)
+    public = [nm for nm in (dir(ops) if ops else []) if not nm.startswith("_")]
+    info = {"config_sig": _sig("EpDispatchCombineConfig", "__init__"),
+            "dispatch_sig": _sig("EpDispatchCombineOp", "dispatch"),
+            "combine_sig": _sig("EpDispatchCombineOp", "combine")}
+    # Any enum / helper whose name mentions quant or scale (the QuantType enum + any quantize fn).
+    surface = {}
+    for nm in public:
+        if "quant" in nm.lower() or "scale" in nm.lower():
+            obj = getattr(ops, nm)
+            surface[nm] = _members(obj) or str(type(obj).__name__)
+    info["quant_surface"] = surface
+    # LL-kernel surface (upstream MoRI HAS low-latency kernels — test_dispatch_combine_async_ll.py
+    # + the documented HT/LL adaptive switch — so normal-only is an ADAPTER limit, not a vendor
+    # property, UNLESS this pinned build predates them). Print the kernel-type enum + any ll/async
+    # attrs so the next GHA log answers "does this build expose LL?" without interactive SSH.
+    kt = getattr(ops, "EpDispatchCombineKernelType", None) if ops else None
+    info["kernel_type_surface"] = (_members(kt) if kt is not None
+                                   else "<EpDispatchCombineKernelType absent>")
+    info["ll_surface"] = sorted(nm for nm in public
+                                if nm.lower().endswith("ll") or "latency" in nm.lower()
+                                or "async" in nm.lower())
+    return info
+
+
+def _mori_quant_type_validator():
+    """Return MoRI's own quant_type normalizer (mori.ops.dispatch_combine._normalize_quant_type) if it is
+    exposed, else None. It validates a candidate CHEAPLY (no 2 GiB heap alloc) by raising on an invalid
+    value. The config ctor stores any string; only the OP normalizes it, so a config-only probe can't tell
+    a valid mode from an invalid one (the wrong 'fp8_blockwise' guess cost us a 90-min MI355X run)."""
+    try:
+        from mori.ops.dispatch_combine import _normalize_quant_type  # type: ignore
+        return _normalize_quant_type
+    except Exception:  # any failure importing the private helper -> caller gets no validator
+        return None
+
+
+def _fp8_quant_type_candidates():
+    """Return the ordered (value, label) guesses for MoRI's fp8 quant_type. fp8_direct_cast is the
+    validated mode on the mori-0227-2 image (GHA self-introspection found the valid set is
+    ['none','fp8_direct_cast']; 'fp8_blockwise' is in the python map but THIS build's
+    _normalize_quant_type rejects it), so the direct-cast string leads, then the typed enum members,
+    then string fallbacks — __init__ keeps the first that MoRI's _normalize_quant_type accepts."""
+    # Leading guess: the direct-cast string validated on this image.
+    head = [("fp8_direct_cast", "str:fp8_direct_cast")]
+    # Typed enum members, only where this build exposes the enum and the member.
+    enum_cls = getattr(mori.ops, "EpDispatchCombineQuantType", None)
+    typed = [] if enum_cls is None else [
+        (getattr(enum_cls, member), f"EpDispatchCombineQuantType.{member}")
+        for member in ("Fp8DirectCast", "Fp8BlockwiseQuant") if hasattr(enum_cls, member)]
+    # Remaining string spellings go last.
+    fallbacks = [(text, f"str:{text}") for text in ("fp8", "Fp8", "fp8_blockwise")]
+    return head + typed + fallbacks
+
+
+def _quant_blockwise_fp8_fnuz(x, block=_FP8_BLOCK):
+    """Blockwise-quantize bf16 [T,H] to (e4m3fnuz [T,H], f32 dequant scales [T,H//block]): every
+    `block`-wide hidden slice is rescaled so its amax lands on the fnuz finite max. Quantization is
+    caller-side (MoRI transports the fp8 payload + scales; the combine reduces and the harness
+    dequantizes for the consistency-correctness gate)."""
+    T, H = x.shape
+    assert H % block == 0, f"hidden {H} not a multiple of fp8 block {block}"
+    blocks = x.float().view(T, H // block, block)
+    # per-block dequant scale [T, H//block]; amax is floored at 1e-8 before dividing by the fp8 max
+    scale = blocks.abs().amax(dim=2).clamp_min(1e-8) / _FP8_FNUZ_MAX
+    scaled = torch.clamp(blocks / scale.unsqueeze(2), -_FP8_FNUZ_MAX, _FP8_FNUZ_MAX)
+    return scaled.to(torch.float8_e4m3fnuz).view(T, H), scale
+
+
+def _dequant_blockwise_fp8_fnuz(xq, scale, block=_FP8_BLOCK):
+    """Inverse of _quant_blockwise_fp8_fnuz: e4m3fnuz [T,H] + f32 [T,H//block] -> bf16-range f32 [T,H]."""
+    T, H = xq.shape
+    nb = H // block
+    return (xq.float().view(T, nb, block) * scale.unsqueeze(2)).view(T, H)
+
+
+class MoRIBackend:
+    name = "mori"
+    combine_needs_redispatch = True
+    # MoRI wedges on a COLD dispatch jumping straight to a large T (validated on
+    # MI355X); the harness ramps this backend's ladder geometrically from 1.
+    needs_gradual_ramp = True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). DISPATCH precision and the SEPARATE combine path are distinct axes
+    # (review: dispatch_dtype=fp8 must NOT imply quantized combine). bf16 is the default; fp8 routes
+    # the AMD-native DIRECT-CAST path (quant_type=fp8_direct_cast — the only fp8 mode this MoRI build
+    # accepts; GHA introspection found the valid set is ['none','fp8_direct_cast']): the kernel casts
+    # bf16<->e4m3fnuz internally for transport (scale_dim=0, no caller scales) and returns the recv
+    # buffer as bf16 again. The combine OUTPUT stays bf16 so SUPPORTED_COMBINE_DTYPES is unchanged.
+    # Keep in sync with capability.py CAP["mori"].
+    SUPPORTED_DISPATCH_DTYPES = {"bf16", "fp8"}  # fp8 = e4m3fnuz direct-cast (FNUZ dispatch variant)
+    SUPPORTED_COMBINE_DTYPES = {"bf16"}         # + "fp8" once the PR311 quant combine OUTPUT lands
+    SUPPORTED_COMBINE_QUANT_MODES = {"none"}    # + the PR311 mode id once validated
+    SUPPORTED_PRECISIONS = SUPPORTED_DISPATCH_DTYPES  # back-compat alias (run_ep.py / older refs)
+    # UPSTREAM MoRI HAS LL kernels (test_dispatch_combine_async_ll.py + the documented HT/LL
+    # adaptive switch) — normal-only is this ADAPTER's current wiring, not a vendor property.
+    # The introspection probe now prints the pinned build's kernel-type/LL surface; wire mode=ll
+    # once a build exposing it is confirmed (goal.md AMD-parity item).
+    SUPPORTED_MODES = {"normal"}
+    # MoRI computes its routing layout INSIDE the dispatch kernel (block_num/warps launch);
+    # it cannot be hoisted, so MoRI honors only the layout-and-dispatch contract. Cross-
+    # vendor comparisons must therefore use layout-and-dispatch-v1 (the common contract).
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        assert (args.dispatch_dtype in self.SUPPORTED_DISPATCH_DTYPES
+                and args.mode in self.SUPPORTED_MODES
+                and getattr(args, "combine_dtype", "bf16") in self.SUPPORTED_COMBINE_DTYPES
+                and getattr(args, "combine_quant_mode", "none") in self.SUPPORTED_COMBINE_QUANT_MODES), \
+            "run_ep.py must reject unsupported dispatch/mode/combine before constructing the backend"
+        self.fp8_in_timing = None  # set when fp8 dispatch is used (whether the cast is timed)
+        # Combine-path quant timing (None today — no quant combine wired). PR311 sets these +
+        # the combine_* dtype attrs ep_harness reads via getattr; until then ep_harness records
+        # combine bf16 / none from the args defaults.
+        self.combine_quant_in_timing = None
+        self.combine_dequant_in_timing = None
+        self.ep_size = world_size
+        self.experts_per_rank = args.experts // self.ep_size
+        dev_cus = torch.cuda.get_device_properties(device).multi_processor_count
+        # Resource regime — map the comm budget onto CUs to mirror DeepEP's SM fraction.
+        #   normalized: block_num ≈ sm_fraction · CUs (≈ the same device fraction);
+        #   tuned: MoRI launch auto-tuning (API not present in this build — uses default,
+        #          labeled tuned_source); default: the 80-block bring-up budget.
+        # MoRI DEADLOCKS at T>=32 when block_num is reduced toward the normalized target
+        # (validated on MI355X g15: block_num=46 wedges, 80 completes T=32/64 with the
+        # realistic fan-out≈5.3 trace). So MoRI cannot be normalized down to DeepEP's
+        # device fraction; floor it at a known-functional minimum and record that the
+        # target fraction was NOT reached.
+        rm = args.resource_mode
+        floor = int(os.environ.get("CX_MORI_MIN_BLOCKS", "80"))  # functional minimum (deadlocks lower)
+        env_blocks = os.environ.get("CX_MORI_BLOCK_NUM")
+        self._block_floored = False
+        if env_blocks:
+            self.block_num = int(env_blocks)
+            self._block_target = self.block_num
+        elif rm == "normalized":
+            self._block_target = max(1, round(args.sm_fraction * dev_cus))
+            self.block_num = max(floor, self._block_target)
+            self._block_floored = self.block_num > self._block_target
+        else:  # tuned (no launch auto-tune API in mori-0227-2) / default
+            self.block_num = 80
+            self._block_target = 80
+        self._tuned_source = ("default-80" if rm == "tuned" else
+                              ("normalized-floored" if self._block_floored else "n/a"))
+        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
+        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))
+
+        # Kernel-type selection (CX_MORI_KERNEL_TYPE): the default IntraNode dispatch/combine
+        # kernels synchronize through a direct cross-device peer-atomic barrier in the IPC-mapped
+        # symmetric heap (intranode.hpp barrier; the single unconditional EpDispatchIntraNodeKernel).
+        # That barrier COMPLETES on gfx950 (mi355x) but DEADLOCKS at the first dispatch (T=1) on
+        # gfx942/CDNA3 (MI325X) — verified across heap types (uncached run 28617588816,
+        # cached/normal run 28618583084: identical T=1 hang, so heap coherence is NOT the cause).
+        # AsyncLL is the gfx942 EP path instead: upstream ships gfx942_mi308x_AsyncLL_ep* tuning
+        # configs, and its send/recv-copy kernels move data over SDMA/XGMI (crossDeviceBarrierFlag
+        # path) rather than the direct-peer barrier — SDMA is "only effective for AsyncLL"
+        # (dispatch_combine.cpp:138). AsyncLL splits into send + recv halves: op.dispatch() /
+        # op.combine() launch ONLY the send kernels, so dispatch_recv()/combine_recv() must follow
+        # to complete the transfer (see MoRI tests/.../test_dispatch_combine_async_ll.py). Requires
+        # MORI_ENABLE_SDMA=1 (set by the MI325X launcher).
+        kt_req = os.environ.get("CX_MORI_KERNEL_TYPE", "intranode").strip().lower()
+        self._kernel_type = None
+        self._kernel_type_label = "IntraNode"
+        self._async_ll = False
+        if kt_req in ("asyncll", "async_ll", "async-ll"):
+            kt_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
+            if kt_enum is None or not hasattr(kt_enum, "AsyncLL"):
+                raise RuntimeError(
+                    "CX_MORI_KERNEL_TYPE=asyncll requested but this MoRI build does not expose "
+                    "EpDispatchCombineKernelType.AsyncLL — see MORI_QUANT_API kernel_type_surface")
+            self._kernel_type = kt_enum.AsyncLL
+            self._kernel_type_label = "AsyncLL"
+            self._async_ll = True
+        elif kt_req not in ("intranode", "intra_node", "intra-node", ""):
+            raise RuntimeError(f"unknown CX_MORI_KERNEL_TYPE={kt_req!r} (expected intranode|asyncll)")
+
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        self._cap = self.buffer_cap(args)
+        # Dispatch precision: bf16 (quant_type="none") or fp8 (e4m3fnuz DIRECT-CAST — the FNUZ
+        # variant). MoRI's only fp8 mode on this image is `fp8_direct_cast` (GHA self-introspection
+        # found the valid set is ['none','fp8_direct_cast']): the dispatch kernel direct-casts the
+        # bf16 input to e4m3fnuz for transport and returns the recv buffer as input.dtype (bf16) again
+        # — so NO caller scales (scale_dim=0; scale_dim>0 is only for caller FP4 dispatch scales). We
+        # DUMP MoRI's quant API to stderr (self-documenting GHA log — SSH to the cluster stalls) and
+        # pick the first quant_type MoRI's own _normalize_quant_type accepts (cheap; no heap alloc).
+        self._fp8 = (args.dispatch_dtype == "fp8")
+        self._quant_label = "none"
+        scale_dim = 0
+        quant_type = "none"
+        if self._fp8:
+            import json as _json
+            print("MORI_QUANT_API " + _json.dumps(_mori_quant_introspect()), file=sys.stderr, flush=True)
+            validator = _mori_quant_type_validator()
+            cands = _fp8_quant_type_candidates()
+            print(
+                f"MORI_FP8_CANDIDATES {[label for _, label in cands]}",
+                file=sys.stderr,
+                flush=True,
+            )
+            for val, label in cands:
+                try:
+                    if validator is not None:
+                        validator(val)   # raises ValueError on an invalid value (no heap alloc)
+                    else:
+                        mori.ops.EpDispatchCombineConfig(   # fallback: config-construct probe
+                            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+                            hidden_dim=args.hidden, scale_dim=0,
+                            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+                            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+                            max_num_inp_token_per_rank=max(512, self._cap),
+                            num_experts_per_rank=self.experts_per_rank,
+                            num_experts_per_token=args.topk,
+                            use_external_inp_buf=False, quant_type=val)
+                    quant_type, self._quant_label = val, label
+                    break
+                except Exception as e:
+                    print(f"MORI_FP8_REJECT {label}: {e!r}", file=sys.stderr, flush=True)
+            if quant_type == "none":
+                raise RuntimeError("no MoRI quant_type candidate accepted for fp8 — see "
+                                   "MORI_QUANT_API above for this build's actual quant surface")
+            print(f"MORI_FP8_QUANT_TYPE {self._quant_label}", file=sys.stderr, flush=True)
+            self.fp8_in_timing = True  # the e4m3fnuz direct-cast is internal to dispatch (in timing)
+        # scale_dim==0 in both bf16 and fp8-direct-cast paths -> the 1-byte sentinel element size.
+        _scale_elt = torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size()
+        # zero-copy mode = NOT use_external_inp_buf. MoRI ASSERTS "Fp8DirectCast is not supported in
+        # zero-copy mode" (dispatch_combine.cpp:454, evidenced on MI355X run 28318485335), and the
+        # source also gates Fp8BlockwiseQuant on --zero-copy 0. So fp8 MUST use the external-input-buf
+        # (non-zero-copy) path; the dispatch copies the input to its staging buffer internally
+        # (EpDispatchCopyToStaging). bf16 keeps the validated zero-copy path (use_external_inp_buf=False).
+        _use_ext_inp_buf = bool(self._fp8)
+        _cfg_kwargs = dict(
+            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+            hidden_dim=args.hidden, scale_dim=scale_dim,
+            scale_type_size=_scale_elt,
+            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+            max_num_inp_token_per_rank=max(512, self._cap),
+            num_experts_per_rank=self.experts_per_rank,
+            num_experts_per_token=args.topk,
+            use_external_inp_buf=_use_ext_inp_buf, quant_type=quant_type,
+        )
+        if self._async_ll:
+            # AsyncLL pre-allocates the recv slot pool; 0 = MoRI's worst-case default
+            # (max_num_inp_token_per_rank * world_size). Override via CX_MORI_MAX_TOTAL_RECV.
+            _cfg_kwargs["kernel_type"] = self._kernel_type
+            _cfg_kwargs["max_total_recv_tokens"] = int(
+                os.environ.get("CX_MORI_MAX_TOTAL_RECV", "0"))
+        self.config = mori.ops.EpDispatchCombineConfig(**_cfg_kwargs)
+        print(f"MORI_KERNEL_TYPE {self._kernel_type_label} "
+              f"enable_sdma={os.environ.get('MORI_ENABLE_SDMA')} "
+              f"max_total_recv={_cfg_kwargs.get('max_total_recv_tokens', 'n/a')}",
+              file=sys.stderr, flush=True)
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        # fp8 direct-cast carries e4m3fnuz rounding error -> loosen the correctness gate to the fp8
+        # class (the harness reads backend.tolerance; bf16 default 5e-2). The combine reduces the
+        # bf16 recv payload per rank, compared against e4m3fnuz(x)*unique_ranks (see expected()).
+        if self._fp8:
+            self.tolerance = 1.5e-1
+        # Provenance: MoRI has no pip version; pin via MORI_COMMIT, else the image tag
+        # the launcher exported (COLLECTIVEX_IMAGE carries the mori build tag), so the
+        # provenance gate has something real rather than "unknown".
+        img = os.environ.get("COLLECTIVEX_IMAGE", "")
+        mori_commit = os.environ.get("MORI_COMMIT") or (f"image:{img}" if img else "unknown")
+        self.backend_provenance = {
+            "mori_commit": mori_commit,
+            "kernel_type": self._kernel_type_label,
+            "enable_sdma": os.environ.get("MORI_ENABLE_SDMA"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "resource_mode": args.resource_mode, "block_num": self.block_num,
+            "block_num_target": self._block_target, "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
+            "device_cus": dev_cus, "sm_fraction": (self.block_num / dev_cus),
+            "tuned_source": self._tuned_source,
+            "dispatch_dtype": args.dispatch_dtype,
+            "quant_type": self._quant_label,
+            "fp8_format": ("e4m3fnuz" if self._fp8 else None),
+            "fp8_mode": ("direct_cast" if self._fp8 else None),  # internal cast, scale_dim=0, no blocks
+        }
+
+    def buffer_cap(self, args):
+        # Largest tokens/rank the 2 GiB registerable heap holds at hidden=7168 (512,
+        # validated on-node). Override via CX_MORI_MAX_TOKENS.
+        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))
+
+    def make_problem(self, T, idx, weights, x):
+        # Shared-trace slice: idx[T,topk] -> int32 (MoRI expects int32 expert ids); weights[T,topk]
+        # f32; x[T,hidden] bf16. scale_dim==0 for BOTH bf16 and fp8-direct-cast (the kernel casts
+        # bf16<->e4m3fnuz internally for transport), so scales is the (T,0) fp8 sentinel either way
+        # (dispatch ignores it since scale_dim==0). caller scales are only for FP4 dispatch.
+        indices = idx.to(torch.int32)
+        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=self.device)
+        return types.SimpleNamespace(T=T, x=x, indices=indices,
+                                     weights=weights.to(torch.float32), scales=scales)
+
+    def dispatch(self, p):
+        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
+            p.x, p.weights, p.scales, p.indices,
+            block_num=self.block_num, warp_per_block=self.dispatch_warps)
+        if self._async_ll:
+            # op.dispatch() launched only the AsyncLL SEND kernels; the recv buffers (and recv_num)
+            # are not valid until the RECV kernels run. Both halves enqueue on the current stream,
+            # so the harness's event-timed region captures the full send+recv transfer.
+            self.op.dispatch_recv(warp_per_block=self.dispatch_warps)
+        total_recv = int(recv_num[0].item())  # read BEFORE combine (combine resets recv_num)
+        # MoRI returns the recv buffer as input.dtype (bf16) for BOTH "none" and "fp8_direct_cast"
+        # (the e4m3fnuz cast is internal to the transport, dequantized back to bf16 on recv) -> a
+        # plain .to(bf16) is the combine input. fp8's e4m3 rounding shows up in the correctness gate
+        # against the looser fp8 tolerance class set in __init__.
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices, total_recv=total_recv,
+            combine_input=dispatch_output.to(torch.bfloat16))
+
+    def stage(self, p, h):
+        # comm-only contract: stage the "expert outputs" into MoRI's registered
+        # combine-input buffer UNTIMED (in a real MoE the expert FFN writes here).
+        buf = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1))
+        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])
+
+    def combine(self, p, h):
+        # AsyncLL: upstream exercises the AsyncLL combine WITHOUT weight reconstruction
+        # (test_dispatch_combine_async_ll.py passes weights=None), which matches this backend's
+        # unweighted correctness model in expected() (sum of one copy per destination rank). Pass
+        # None so the reduction is the raw per-rank token sum on both kernel types.
+        _cw = None if self._async_ll else h.dispatch_weights
+        combined, _w = self.op.combine(
+            h.combine_input, _cw, h.dispatch_indices,
+            block_num=self.block_num, warp_per_block=self.combine_warps)
+        if self._async_ll:
+            # op.combine() launched only the AsyncLL SEND kernels; combine_recv() completes the
+            # reduction transfer into the returned buffer (same stream → inside the timed region).
+            self.op.combine_recv(warp_per_block=self.combine_warps)
+        return combined
+
+    def expected(self, p, h):
+        """Return (reference combine output, token count); `h` is unused.
+
+        MoRI's combine sums one copy per destination RANK, so combined[i] ≈ ref[i] * (#unique
+        destination ranks among token i's top-k experts). The count is computed on-device (sort
+        each row, count value changes) — no per-row .tolist() device→host sync.
+        """
+        pes = p.indices.long() // self.experts_per_rank
+        ranked = pes.sort(dim=1).values
+        distinct = (ranked[:, 1:] != ranked[:, :-1]).sum(dim=1) + min(1, ranked.size(1))
+        unique_pes = distinct.to(device=self.device, dtype=torch.float32).unsqueeze(1)
+        # fp8_direct_cast transports e4m3fnuz, so gate against the SAME direct-cast reference: the
+        # e4m3 rounding then cancels instead of inflating relErr (a bf16 reference fails the
+        # per-rank gate at T=1, where a near-zero token makes the denominator tiny).
+        ref = p.x.to(torch.float8_e4m3fnuz).float() if self._fp8 else p.x.float()
+        return ref * unique_pes, p.T
+
+    def recv_tokens(self, h):
+        return int(h.total_recv)
+
+    def finalize(self, rc):
+        # MoRI's shmem teardown asserts after shmem_finalize(); results are already
+        # written, so sync and hard-exit past it.
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)
diff --git a/experimental/CollectiveX/tests/ep_nccl.py b/experimental/CollectiveX/tests/ep_nccl.py
new file mode 100644
index 0000000000..cd8540488e
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_nccl.py
@@ -0,0 +1,140 @@
+"""CollectiveX — NCCL all-to-all expert-parallel backend (cross-node EP, goal 182).
+
+The canonical "token-shuffle" EP built on torch.distributed's NCCL ``all_to_all_single``: dispatch
+routes each token-copy to the rank that owns its expert via an uneven all-to-all; combine reverses it
+and weighted-sums the top-k copies back into each origin token. With no expert compute the round trip
+reconstructs ``x * sum(topk_weights)`` per token.
+
+Why this exists alongside DeepEP/UCCL/MoRI: those use custom one-sided RDMA (DeepEP/NVSHMEM, UCCL's own
+ibv verbs, MoRI ionic_rdma). Cross-node, UCCL's ``ibv_reg_mr`` failed with EINVAL -> heap corruption ->
+SIGSEGV (run 28326528672) because the cluster's IB HCAs / container lack the GPUDirect-RDMA peer-memory
+that custom verbs registration needs. NCCL's collective transport, by contrast, negotiates IB and
+*gracefully host-stages* when GPUDirect RDMA is unavailable — so an EP built purely on NCCL collectives
+runs cross-node on the same fabric. It is also the reference baseline the fused EP kernels improve upon,
+so a same-shape NCCL number is a meaningful comparison point, not just a fallback.
+
+Scope: BF16, normal mode, layout-and-dispatch-v1 (the timed window includes the layout/argsort + both
+all-to-alls). RCCL exposes the identical API, so this backend also covers AMD (rccl) cross-node EP.
+"""
+import types
+
+import torch
+import torch.distributed as dist
+
+
+def _format_collective_version(raw) -> str:
+    """Render an NCCL/RCCL version (packed int, tuple/list, or other) as a dotted string."""
+    if isinstance(raw, (tuple, list)):
+        return ".".join(map(str, raw))
+    if not isinstance(raw, int):
+        return "unknown" if raw in (None, "") else str(raw)
+    major_div, minor_mod = (1000, 10) if raw < 10000 else (10000, 100)
+    return f"{raw // major_div}.{raw // 100 % minor_mod}.{raw % 100}"
+
+
+class NCCLBackend:
+    name = "nccl-ep"
+    combine_needs_redispatch = False   # dispatch saves the permutation + splits; combine reuses them
+    # Pure-collective token shuffle: bf16 only (no fp8 dispatch path), normal mode, single contract.
+    SUPPORTED_PRECISIONS = {"bf16"}
+    SUPPORTED_MODES = {"normal"}
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.experts = args.experts
+        assert args.experts % world_size == 0, \
+            f"NCCL EP needs experts({args.experts}) divisible by world_size({world_size})"
+        self.experts_per_rank = args.experts // world_size
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            f"NCCL EP supports precisions={sorted(self.SUPPORTED_PRECISIONS)} modes={sorted(self.SUPPORTED_MODES)} only"
+        self.tolerance = 5e-2   # bf16 round-trip
+        try:
+            _version = _format_collective_version(torch.cuda.nccl.version())
+        except Exception:
+            _version = "unknown"
+        _library = "rccl" if torch.version.hip else "nccl"
+        self.backend_provenance = {
+            "backend": f"{_library}-all2all",
+            "collective_library": _library,
+            "nccl_version": _version,
+            "transport": f"{_library}-all_to_all_single",
+            "resource_mode": args.resource_mode,
+            "num_sms": None,
+            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
+            "tuned_source": "nccl-collective",
+        }
+
+    def buffer_cap(self, args):
+        return None   # no fixed pre-allocated buffer; all-to-all sizes itself per step
+
+    def make_problem(self, T, idx, weights, x):
+        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared routing-trace slice.
+        return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                     topk_weights=weights.to(torch.float32), layout=None)
+
+    def dispatch(self, p):
+        ws = self.world_size
+        x = p.x                                   # [T, H] bf16
+        idx = p.topk_idx                          # [T, topk]
+        T, H = int(x.shape[0]), int(x.shape[1])
+        topk = int(idx.shape[1])
+        dev = x.device
+        # Flatten the T*topk token-copies; each goes to the rank owning its expert.
+        flat_expert = idx.reshape(-1)                                       # [T*topk]
+        flat_dest = (flat_expert // self.experts_per_rank).to(torch.int64)  # dest rank per copy
+        flat_token = torch.arange(T, device=dev, dtype=torch.int64).repeat_interleave(topk)
+        # Group copies by destination rank (stable -> deterministic, invertible permutation).
+        order = torch.argsort(flat_dest, stable=True)
+        send_counts = torch.bincount(flat_dest, minlength=ws)               # [ws]
+        send_x = x.index_select(0, flat_token.index_select(0, order)).contiguous()  # [T*topk, H], send order
+        # Exchange per-rank counts so every rank can size its receive buffer.
+        recv_counts = torch.empty_like(send_counts)
+        dist.all_to_all_single(recv_counts, send_counts)
+        sc = send_counts.tolist()
+        rc = recv_counts.tolist()
+        total_recv = int(sum(rc))
+        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
+        # The dispatch all-to-all (uneven splits). NCCL routes internode over IB (host-staged if no
+        # GPUDirect RDMA) — this is the line that runs cross-node where UCCL's ibv_reg_mr fails.
+        dist.all_to_all_single(recv_x, send_x, rc, sc)
+        return types.SimpleNamespace(recv_x=recv_x, combine_input=None, order=order,
+                                     flat_token=flat_token, flat_w=p.topk_weights.reshape(-1),
+                                     send_counts=sc, recv_counts=rc, T=T, H=H, total_recv=total_recv)
+
+    def stage(self, p, h):
+        # No expert compute: the expert "output" is the received tokens as-is (the round-trip identity).
+        h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        # Reverse all-to-all: ship expert outputs back to their origin ranks (swap the split lists).
+        send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=h.combine_input.dtype,
+                                device=h.combine_input.device)
+        dist.all_to_all_single(send_back, h.combine_input.contiguous(), h.send_counts, h.recv_counts)
+        # send_back is in send (sorted) order; invert the argsort to token-copy order.
+        copies = torch.empty_like(send_back)
+        copies[h.order] = send_back
+        # Weighted reduce of each token's top-k copies into [T, H] (accumulate in fp32 for stability).
+        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
+        out.index_add_(0, h.flat_token, copies.float() * h.flat_w.unsqueeze(1))
+        return out.to(p.x.dtype)
+
+    def recv_tokens(self, h):
+        return int(h.total_recv)
+
+    def expected(self, p, h):
+        # Round trip with identity expert: out[t] = sum_k w[t,k] * x[t] = x[t] * sum_k w[t,k].
+        wsum = p.topk_weights.sum(dim=1, keepdim=True).float()
+        return p.x.float() * wsum, p.T
+
+    def finalize(self, rc):
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_uccl.py b/experimental/CollectiveX/tests/ep_uccl.py
new file mode 100644
index 0000000000..f9de6e89a1
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_uccl.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — UCCL EP (NVIDIA), normal + LL modes. PRODUCING RESULTS:
+cx_build_uccl vendors UCCL's deep_ep_wrapper as `uccl_deepep` (its Buffer takes a torch
+ProcessGroup), so this adapter runs GENUINE uccl.ep dispatch/combine (uccl_version 0.1.1,
+intranode NVLink) — validated on h100/h200/b300/b200.
+
+IMPORTANT (empirically established on H100 via GHA): the LOW-LEVEL `uccl.ep.Buffer` is
+NOT a drop-in DeepEP clone. Its constructor is
+  Buffer(rank, num_ranks, num_nvl_bytes=0, num_rdma_bytes=0, low_latency_mode=False, …)
+— it takes rank/num_ranks ints, NOT a torch ProcessGroup, so with that class the
+`Buffer(self.group, …)` calls below raise `TypeError: incompatible function arguments`. The
+DeepEP-identical `Buffer(group, …)` API is UCCL's separate ~1900-line `deep_ep_wrapper`
+package (packaged as `deep_ep`, colliding with the container's real DeepEP), whose __init__
+runs a proxy + IPC-handle-exchange + runtime.sync + connect_atomic_buffer bootstrap. The fix
+was to vendor `deep_ep_wrapper` under a non-colliding name (it uses relative imports + only
+needs `uccl.ep`) and import its Buffer here, making this file a true ep_deepep.py clone. That
+is DONE: cx_build_uccl vendors `deep_ep_wrapper` as `uccl_deepep` and the import below uses it;
+if that wrapper is ever absent the import falls back to the low-level `uccl.ep.Buffer`, which
+then fails loudly (preserved failed-case) — never faked. With the wrapper present, results are genuine.
+
+The harness contract (make_problem/dispatch/stage/combine/expected/buffer_cap/recv_tokens/
+finalize + backend_provenance + SUPPORTED_*) mirrors ep_deepep.py and is correct whenever
+the vendored wrapper Buffer is the one imported (see the import fallback below).
+
+Install (see runtime/run_in_container.sh cx_build_uccl): `pip install uccl` ships a
+prebuilt cp312 wheel; the UCCL EP kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH
+(pip install nvidia-cuda-runtime-cu12, prepend its lib dir) even on a cu13 image.
+
+Correctness (identical to DeepEP's intranode test): a pure dispatch->combine round trip
+with no expert compute reconstructs x only after dividing by the number of ranks each
+token was sent to, so the harness expects combined ~= x * is_token_in_rank.sum(dim=1).
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    import uccl  # for version/provenance
+    try:
+        # PREFERRED: vendored deep_ep_wrapper (cx_build_uccl -> uccl_deepep). Buffer(group, ...)
+        # takes a torch ProcessGroup (matches DeepEP + this adapter's calls) + runs UCCL's full
+        # proxy/IPC/runtime.sync bootstrap. Fallback: low-level uccl.ep.Buffer(rank,num_ranks,...).
+        from uccl_deepep import Buffer  # type: ignore
+    except Exception:
+        from uccl.ep import Buffer  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the installed uccl wheel + cu12 runtime
+    print("ERROR: uccl.ep import failed — `pip install uccl nvidia-cuda-runtime-cu12` and "
+          "prepend the cu12 lib dir to LD_LIBRARY_PATH at job setup (cx_build_uccl). "
+          f"{exc!r}", file=sys.stderr)
+    raise
+
+
+def _uccl_version() -> str:
+    try:  # prefer the installed distribution's metadata version
+        from importlib.metadata import version as _dist_version
+        return _dist_version("uccl")
+    except Exception:  # metadata unavailable: fall back to the module attribute, else "unknown"
+        return getattr(uccl, "__version__", "unknown")
+
+
+# UCCL's normal-mode fp8 dispatch takes x as a (fp8, scales) tuple with a per-token
+# block-128 scale — the SAME convention DeepEP's kernels expect (the vendored wrapper
+# Buffer mirrors DeepEP's API): scales [T, H//128] float32, e4m3, 448 = e4m3 max. Both
+# directions of the cast run OUTSIDE the timed window (cast in make_problem, dequant in
+# stage), so fp8 quantization is NOT included in dispatch time (except under runtime-visible-v1).
+_FP8_MAX = 448.0
+_FP8_BLOCK = 128
+
+
+def _per_token_cast_to_fp8(x):
+    """[T, H] (H % 128 == 0) -> (e4m3fn [T, H], f32 scales [T, H//128]); one scale per 128-block."""
+    rows, width = x.shape
+    blocks = x.float().view(rows, width // _FP8_BLOCK, _FP8_BLOCK)
+    block_amax = blocks.abs().amax(dim=2).clamp(min=1e-4)   # floor keeps the division finite
+    quantized = (blocks * (_FP8_MAX / block_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return quantized.view(rows, width), (block_amax / _FP8_MAX).contiguous()
+
+
+def _per_block_dequant(x_fp8, scales):
+    """Undo _per_token_cast_to_fp8: e4m3 [R, H] * f32 scales [R, H//128] -> bf16 [R, H]."""
+    rows, width = x_fp8.shape
+    blocks = x_fp8.float().view(rows, width // _FP8_BLOCK, _FP8_BLOCK) * scales.unsqueeze(2)
+    return blocks.view(rows, width).to(torch.bfloat16)
+
+
+def _per_block_dequant_3d(x_fp8, scales):
+    """LL recv layout: e4m3 [E, S, H] * f32 scales [E, S, H//128] -> bf16 [E, S, H]."""
+    experts, slots, width = x_fp8.shape
+    blocks = x_fp8.float().view(experts, slots, width // _FP8_BLOCK, _FP8_BLOCK)
+    return (blocks * scales.unsqueeze(-1)).view(experts, slots, width).to(torch.bfloat16)
+
+
+class UCCLBackend:
+    name = "uccl"
+    combine_needs_redispatch = False  # class default (normal: combine reuses the handle); _init_ll sets True
+    # Capabilities — run_ep.py REJECTS anything outside these BEFORE construction (no
+    # fallback/mislabel). Expanded as each path is implemented + hardware-validated.
+    #   normal mode: bf16 + fp8 (per-token block-128 cast) — validated intranode NVLink on H200 (EP2).
+    #   ll mode: low_latency_dispatch/combine via allow_nvlink_for_low_latency_mode — validated
+    #   RUNNING intranode over NVLink on H200 (EP2); same DeepEP-clone LL kernel family.
+    SUPPORTED_PRECISIONS = {"bf16", "fp8"}
+    SUPPORTED_MODES = {"normal", "ll"}
+    # Three contracts (mirror DeepEP — UCCL's Buffer is the same API):
+    #   layout-and-dispatch-v1     — times get_dispatch_layout INSIDE dispatch; fp8 cast/dequant
+    #                                OUTSIDE (preprocessing mirrors a producer handing quantized x).
+    #   cached-layout-comm-only-v1 — layout hoisted out (untimed); dispatch = pure comm. normal only.
+    #   runtime-visible-v1         — dispatch INCLUDES the fp8 quant (cast) + layout + comm + the
+    #                                recv-dequant that makes expert input consumable; combine starts
+    #                                from bf16 expert outputs. (normal mode; LL times all of it in-kernel.)
+    SUPPORTED_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1"}
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = args.mode
+        self.ll = (args.mode == "ll")
+        self.contract = args.measurement_contract
+        # hoist layout out of the timed dispatch only for the cached contract in normal mode.
+        self.cache_layout = (self.contract == "cached-layout-comm-only-v1") and not self.ll
+        # runtime-visible-v1: the fp8 cast + recv-dequant move INSIDE the timed dispatch (normal
+        # mode). LL already times cast+layout+comm in its single kernel, so it's runtime-visible
+        # by construction — the flag only changes normal mode's boundary.
+        self.runtime_visible = (self.contract == "runtime-visible-v1") and not self.ll
+        self.group = dist.group.WORLD
+        assert args.dispatch_dtype in self.SUPPORTED_PRECISIONS and args.mode in self.SUPPORTED_MODES, \
+            "run_ep.py must reject unsupported dtype/mode before constructing the backend"
+        # fp8 e4m3 per-token-block round-trip caps reconstruction error near the largest
+        # element at ~1/16 (3 mantissa bits), gated at 1.25e-1; bf16 round-trip is ~5e-3, gated
+        # at 5e-2. Tolerance is recorded in the artifact so the looser fp8 gate is explicit.
+        self.fp8 = (args.dispatch_dtype == "fp8")
+        self.tolerance = 1.25e-1 if self.fp8 else 5e-2
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _uccl_version()
+        if self.ll:
+            self._init_ll(args, dev_sms, ver)
+        else:
+            self._init_normal(args, rank, dev_sms, ver)
+
+    def _init_normal(self, args, rank, dev_sms, ver):
+        # fp8 cast: UNTIMED (make_problem) under layout-and-dispatch / cached-layout; TIMED (inside
+        # dispatch) under runtime-visible-v1. So fp8_in_timing tracks the contract honestly.
+        self.fp8_in_timing = (self.runtime_visible if self.fp8 else None)
+        self.combine_needs_redispatch = False  # normal combine reuses the handle
+        # Intranode normal mode: NVLink buffer only. ONE buffer size for ALL points (the shared
+        # T=128 point must match between the decode and prefill sweeps). 4 GiB holds T up to 4096.
+        num_nvl_bytes = int(os.environ.get("CX_UCCL_NVL_BYTES",
+                                           os.environ.get("CX_DEEPEP_NVL_BYTES",
+                                                          str(4 * 1024 * 1024 * 1024))))
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0)
+        rm = args.resource_mode
+        tuned_src = None
+        if rm == "normalized":
+            num_sms = max(1, round(args.sm_fraction * dev_sms))   # ~same device fraction as MoRI
+        elif rm == "tuned":
+            # Best-available for the installed UCCL: its OWN default SM count (Buffer.num_sms —
+            # the library's analytic choice). get_dispatch_config(num_ranks) returns the
+            # recommended Config but doesn't expose num_sms to Python; the default reflects it.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+            tuned_src = "uccl-default-num_sms"
+        else:  # default (any other resource_mode) — the bring-up budget args.num_sms
+            num_sms = args.num_sms
+        try:
+            Buffer.set_num_sms(num_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            raise RuntimeError(f"UCCL did not apply requested num_sms={num_sms}: {exc!r}") from exc
+        applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+        if applied_num_sms != num_sms:
+            raise RuntimeError(
+                f"UCCL num_sms mismatch: requested={num_sms} applied={applied_num_sms}")
+        self.backend_provenance = {
+            "uccl_version": ver,
+            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
+            "mode": "normal", "resource_mode": rm, "requested_num_sms": num_sms,
+            "num_sms": applied_num_sms, "device_sms": dev_sms,
+            "sm_fraction": (applied_num_sms / dev_sms), "tuned_source": tuned_src or "n/a",
+            "num_nvl_bytes": num_nvl_bytes,
+        }
+
+    def _init_ll(self, args, dev_sms, ver):
+        # Low-latency mode: a distinct kernel family (IBGDA, but runs intranode over NVLink via
+        # allow_nvlink_for_low_latency_mode). fp8 cast happens INSIDE low_latency_dispatch so for
+        # fp8 the quantization IS inside the timed window (recorded honestly). The buffer is sized
+        # for a FIXED num_max_dispatch_tokens_per_rank (all ranks identical), so LL is a
+        # decode-shaped path; buffer_cap caps the sweep at num_max (no silent drop). set_num_sms
+        # does NOT apply (the LL kernel picks its own occupancy) — recorded n/a.
+        self.fp8_in_timing = (True if self.fp8 else None)
+        self.combine_needs_redispatch = True   # re-dispatch (untimed) before each timed combine
+        self.num_max = int(os.environ.get("CX_LL_MAX_TOKENS", "128"))
+        self.experts = args.experts
+        rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+            self.num_max, args.hidden, self.world_size, args.experts)
+        # one QP per local expert is the DeepEP/UCCL convention for LL
+        self.num_qps = max(1, args.experts // self.world_size)
+        self.buffer = Buffer(self.group, 0, rdma_bytes, low_latency_mode=True,
+                             num_qps_per_rank=self.num_qps,
+                             allow_nvlink_for_low_latency_mode=True)
+        self.backend_provenance = {
+            "uccl_version": ver,
+            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{ver}",
+            "mode": "ll", "resource_mode": args.resource_mode,
+            "num_sms": None, "device_sms": dev_sms, "tuned_source": "ll-fixed-kernel",
+            "num_max_dispatch_tokens_per_rank": self.num_max,
+            "num_rdma_bytes": rdma_bytes, "num_qps_per_rank": self.num_qps,
+            "low_latency_mode": True, "use_fp8": self.fp8,
+        }
+
+    def buffer_cap(self, args):
+        # LL is sized for a fixed num_max; cap the sweep there (reported, not silent). None = no cap.
+        return self.num_max if self.ll else None
+
+    def make_problem(self, T, idx, weights, x):
+        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared trace slice.
+        p = types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                  topk_weights=weights.to(torch.float32), layout=None)
+        if self.fp8 and not self.ll and not self.runtime_visible:
+            # layout-and-dispatch / cached-layout: per-token block-128 cast, UNTIMED (preprocessing,
+            # mirrors the real producer that hands the dispatcher already-quantized activations).
+            # runtime-visible does NOT pre-cast (the cast is timed inside dispatch); LL casts in-kernel.
+            p.x_fp8, p.x_scales = _per_token_cast_to_fp8(x)
+        if self.cache_layout:
+            # cached-layout-comm-only-v1: compute the dispatch layout ONCE here (untimed) so the
+            # timed dispatch is pure comm. (layout-and-dispatch-v1 leaves it None and dispatch
+            # computes it inside the timed window.)
+            ntr, _, ntpe, itir, _ = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+            p.layout = (ntr, ntpe, itir)
+        return p
+
+    def dispatch(self, p):
+        if self.ll:
+            return self._dispatch_ll(p)
+        if p.layout is not None:                       # cached-layout-comm-only-v1
+            num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = p.layout
+        else:                                          # layout-and-dispatch / runtime-visible (timed layout)
+            (num_tokens_per_rank, _, num_tokens_per_expert,
+             is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        ref_fp8 = ref_scales = None
+        if self.fp8:
+            if self.runtime_visible:
+                # runtime-visible: the per-token block-128 cast is INSIDE the timed dispatch.
+                x_fp8, x_scales = _per_token_cast_to_fp8(p.x)
+                ref_fp8, ref_scales = x_fp8, x_scales      # for the correctness reference
+            else:
+                x_fp8, x_scales = p.x_fp8, p.x_scales      # pre-cast (untimed)
+            x_in = (x_fp8, x_scales)
+        else:
+            x_in = p.x
+        recv_x, _recv_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            x_in, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)
+        out = types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank, ref_fp8=ref_fp8, ref_scales=ref_scales)
+        if self.fp8 and self.runtime_visible:
+            # dispatch ENDS when expert input is consumable: dequant fp8 recv -> bf16 INSIDE the
+            # timed window (the contract's "expert input genuinely consumable" boundary). stage()
+            # then no-ops for this contract.
+            recv_fp8, recv_scales = recv_x
+            out.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+            out.rv_staged = True
+        return out
+
+    def _dispatch_ll(self, p):
+        # x is bf16; the kernel casts to fp8 internally when use_fp8=True (so for fp8 the cast IS
+        # inside this timed op — fp8_in_timing=True). recv is the expert-major 3D layout
+        # [num_local_experts, num_max*world, hidden] (+scales when fp8).
+        recv_x, recv_count, handle, _event, _hook = self.buffer.low_latency_dispatch(
+            p.x, p.topk_idx, self.num_max, self.experts,
+            use_fp8=self.fp8, return_recv_hook=False)
+        return types.SimpleNamespace(recv_x=recv_x, recv_count=recv_count, handle=handle)
+
+    def stage(self, p, h):
+        # comm-only contract: "expert outputs" already exist as recv_x. Dequantize fp8 recv to
+        # bf16 HERE (untimed) — the expert-compute boundary — so combine moves bf16 in both
+        # precisions. Bf16 recv is staged as-is. (LL recv is 3D; normal recv is 2D.)
+        if getattr(h, "rv_staged", False):
+            return None   # runtime-visible already produced bf16 combine_input inside dispatch (timed)
+        if self.ll:
+            if self.fp8:
+                recv_fp8, recv_scales = h.recv_x
+                h.combine_input = _per_block_dequant_3d(recv_fp8, recv_scales)
+            else:
+                h.combine_input = h.recv_x
+        elif self.fp8:
+            recv_fp8, recv_scales = h.recv_x
+            h.combine_input = _per_block_dequant(recv_fp8, recv_scales)
+        else:
+            h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        if self.ll:
+            # weighted per-expert reduce; topk_idx/weights are the ORIGINAL per-token ones.
+            combined_x, _event, _hook = self.buffer.low_latency_combine(
+                h.combine_input, p.topk_idx, p.topk_weights, h.handle)
+            return combined_x
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle,
+                                               topk_weights=h.recv_topk_weights)
+        return combined_x
+
+    def expected(self, p, h):
+        if self.ll:
+            # LL combine reduces each token's topk expert copies weighted by topk_weights; with no
+            # expert compute each copy is (the kernel's fp8 cast of) x, so combined ~= x *
+            # sum(topk_weights). fp8 quant error is covered by self.tolerance.
+            wsum = p.topk_weights.sum(dim=1, keepdim=True)
+            return p.x.float() * wsum, p.T
+        # normal: round trip with no expert compute reconstructs x*(#destination ranks); for fp8
+        # compare against the dequantized cast that was actually sent.
+        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        ref = p.x.float()
+        if self.fp8:
+            # runtime-visible cast lives on the handle (no pre-cast on p); else use the pre-cast.
+            x_fp8 = getattr(h, "ref_fp8", None)
+            x_scales = getattr(h, "ref_scales", None)
+            if x_fp8 is None:
+                x_fp8, x_scales = p.x_fp8, p.x_scales
+            ref = _per_block_dequant(x_fp8, x_scales).float()
+        return ref * ranks_per_token, p.T
+
+    def recv_tokens(self, h):
+        if self.ll:
+            return int(h.recv_count.sum().item())  # token-copies received across local experts
+        rx = h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x
+        return int(rx.shape[0])
+
+    def finalize(self, rc):
+        # UCCL's symmetric-memory / proxy teardown SIGSEGVs after the sweep completes — but the
+        # result JSON is already written by run_sweep, so (like ep_mori) hard-exit past the crashy
+        # dist/uccl cleanup with status 0 if rc == 0 else 1. A clean teardown isn't worth a false
+        # 'failed' on a valid result (the H100 smoke produced status=valid, correct=True before it).
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)
diff --git a/experimental/CollectiveX/tests/eplb.py b/experimental/CollectiveX/tests/eplb.py
new file mode 100644
index 0000000000..e2ed2b94a7
--- /dev/null
+++ b/experimental/CollectiveX/tests/eplb.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""CollectiveX — EPLB (Expert-Parallel Load Balancer), the DeepSeek-style remedy for
+skewed (zipf) expert load.
+
+Under skewed routing, the ranks hosting hot logical experts receive far more token-copies
+than the rest; dispatch/combine latency is gated by that busiest rank (the cross-rank MAX
+the harness measures), so the whole collective stalls on it. EPLB REPLICATES hot experts
+onto extra physical slots and PLACES the slots so every rank carries ~equal load.
+
+This module is backend-agnostic: it is purely a transform of the deterministic routing
+trace. The trick that keeps every adapter unchanged — DeepEP/MoRI both route expert i to
+rank `i // experts_per_rank` (contiguous block placement) — is to number the physical slots
+RANK-MAJOR (rank r owns physical ids [r*spp, (r+1)*spp)), so the standard contiguous mapping
+reproduces EPLB's balanced placement. The harness then runs with `experts = num_physical`
+and the remapped (physical) trace; nothing else changes.
+
+  num_physical = num_logical + redundant   (redundant rounded up to a multiple of ep_size)
+  build_plan(): greedy replicate-by-load + equal-cardinality balanced packing onto ep_size ranks
+  remap_idx():  each token's logical targets -> physical replicas, spread by global token id
+
+Pure-Python planner (no torch) so it unit-tests on a login node; remap_idx needs torch.
+"""
+from __future__ import annotations
+
+
+def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
+    """Total physical experts: num_logical plus the redundant request (negatives count as 0)
+    rounded UP to a multiple of ep_size, so slots split evenly across ranks (symmetric dispatch)."""
+    groups = (max(0, num_redundant) + ep_size - 1) // ep_size     # ceil(redundant / ep_size)
+    return num_logical + groups * ep_size
+
+
+def _contiguous_rank_load(logical_load, ep_size):
+    """Baseline per-rank load WITHOUT EPLB: contiguous placement, so rank r sums the r-th block
+    of len(logical_load) // ep_size experts (experts past the last full block are not counted)."""
+    block = len(logical_load) // ep_size
+    starts = (rank * block for rank in range(ep_size))
+    return [sum(logical_load[lo:lo + block]) for lo in starts]
+
+
+def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
+    """Build the EPLB replication + placement plan for one routing trace.
+
+    logical_load: list[float] length num_logical (token-copies per logical expert).
+    Returns a dict of pure-Python lists (replicas, phys2log, rank_of_phys, log2phys, ...) plus
+    before/after per-rank load and imbalance. Raises AssertionError on inconsistent sizes."""
+    num_logical = len(logical_load)
+    assert num_physical >= num_logical, "num_physical must be >= num_logical"
+    assert num_physical % ep_size == 0, "num_physical must be divisible by ep_size"
+    assert num_logical % ep_size == 0, "num_logical must be divisible by ep_size"
+    spp = num_physical // ep_size                      # physical slots per rank (fixed)
+
+    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
+    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
+    replicas = [1] * num_logical
+    for _ in range(num_physical - num_logical):
+        best, best_lps = 0, -1.0
+        for e in range(num_logical):
+            lps = logical_load[e] / replicas[e]
+            if lps > best_lps:
+                best, best_lps = e, lps
+        replicas[best] += 1
+
+    # 2) Slots = (per-replica load, logical expert), one per replica.
+    slots = []
+    for e in range(num_logical):
+        lps = logical_load[e] / replicas[e]
+        slots.extend((lps, e) for _ in range(replicas[e]))
+
+    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
+    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
+    slots.sort(reverse=True)
+    rank_slots = [[] for _ in range(ep_size)]
+    rank_load = [0.0] * ep_size
+    for lps, e in slots:
+        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
+                key=lambda r: rank_load[r])
+        rank_slots[r].append(e)
+        rank_load[r] += lps
+
+    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
+    phys2log = [e for members in rank_slots for e in members]
+    rank_of_phys = [r for r, members in enumerate(rank_slots) for _ in members]
+    log2phys = [[] for _ in range(num_logical)]
+    for pid, e in enumerate(phys2log):
+        log2phys[e].append(pid)
+
+    before = _contiguous_rank_load(logical_load, ep_size)
+    total = sum(logical_load) or 1.0
+    mean = total / ep_size
+    return {
+        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
+        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
+        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
+        "rank_load_after": rank_load, "rank_load_before": before,
+        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
+        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
+        "replicated_experts": sum(1 for r in replicas if r > 1),
+    }
+
+
+def remap_idx(idx_logical, plan):
+    """idx_logical: torch [gt, topk] integer logical-expert ids (global trace), on any device.
+    Returns int64 idx_physical [gt, topk] on the same device: each logical target -> one of
+    that expert's physical replicas, chosen as replica (row % replicas[e]) so a hot expert's
+    tokens fan out across its replicas (= across ranks). Replicas of distinct logical experts
+    are disjoint, so a token's top-k physical ids stay distinct (dispatch invariant preserved).
+    `plan` is a build_plan() result; only replicas, max_replicas and log2phys are read."""
+    import torch
+    dev = idx_logical.device        # build the lookup tables where the indices live (CPU or GPU)
+    num_logical = len(plan["replicas"])
+    max_rc = plan["max_replicas"]
+    rc = torch.tensor(plan["replicas"], dtype=torch.int64, device=dev)
+    # Padded [num_logical, max_rc] table of physical ids. Padding repeats replica 0 and is never
+    # selected: the replica index is taken mod rc[e], so it stays below len(phys).
+    # Built as nested lists and converted once (no per-element tensor writes).
+    table = [list(phys) + [phys[0]] * (max_rc - len(phys)) for phys in plan["log2phys"]]
+    padded = torch.tensor(table, dtype=torch.int64, device=dev).reshape(num_logical, max_rc)
+    e = idx_logical.to(torch.int64)                             # [gt,topk]
+    gt = e.shape[0]
+    rows = torch.arange(gt, dtype=torch.int64, device=dev).unsqueeze(1)   # [gt,1] global token id
+    ridx = rows % rc[e]                                         # [gt,topk] replica index
+    return padded[e, ridx]                                      # [gt,topk] physical ids
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    # Planner gates on a synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets.
+    import sys
+    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
+    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
+    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
+    plan = build_plan(load, nphys, EP)
+    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
+    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
+          f"(hottest expert 0 replicas={plan['replicas'][0]})")
+    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
+    print(f"per-rank load AFTER  (EPLB):       {[round(x,3) for x in plan['rank_load_after']]}")
+    print(f"imbalance (max/mean)  BEFORE={plan['imbalance_before']:.2f}x  AFTER={plan['imbalance_after']:.2f}x")
+    # Gates: every logical expert placed, replica/slot bookkeeping consistent, big imbalance cut.
+    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
+    assert sum(plan["replicas"]) == nphys
+    assert len(plan["phys2log"]) == nphys
+    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
+    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
+    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
+    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
+    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
+    # remap (only if torch imports): distinctness + balanced receive on a seeded zipf sample.
+    try:
+        import torch
+        g = torch.Generator().manual_seed(0)
+        p = torch.tensor(load)
+        p = (p / p.sum()).expand(4096, NUM_LOGICAL)
+        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
+        idx_p = remap_idx(idx_l, plan)
+        assert idx_p.shape == idx_l.shape
+        # top-k physical ids distinct per token (8 = the topk sampled above)
+        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
+        spp = plan["slots_per_rank"]
+        recv_before = [0] * EP
+        recv_after = [0] * EP
+        per_log = NUM_LOGICAL // EP
+        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
+            for e in row_l:
+                recv_before[e // per_log] += 1
+            for pid in row_p:
+                recv_after[pid // spp] += 1
+        ib = max(recv_before) / (sum(recv_before) / EP)
+        ia = max(recv_after) / (sum(recv_after) / EP)
+        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x  AFTER={ia:.2f}x")
+        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
+        print("remap self-test: OK")
+    except ImportError:
+        print("(torch absent — skipped remap self-test; planner gates passed)")
+    print("EPLB self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py
new file mode 100644
index 0000000000..5722c4b71c
--- /dev/null
+++ b/experimental/CollectiveX/tests/make_workloads.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""Generate canonical serialized workloads (goal Part 1). Runs build_workload (needs torch) for
+each (routing, global_tokens) in a ladder and writes <workload_id>.npz + .manifest.json into a
+dir that runs then consume via `run_ep.py --workload-dir`. One trace per global-token count
+because the generator is not prefix-consistent across sizes.
+
+  python3 tests/make_workloads.py --out-dir /path/to/cx_workloads \\
+      --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\
+      --tokens-ladder "1 2 4 8 16 32 64 128 256 512"
+
+Or by the named v1 workload in configs/workloads.yaml. Explicit dimension flags still override it:
+
+  python3 tests/make_workloads.py --out-dir /path/to/cx_workloads --workload deepseek-v3-v1 --routing uniform --ep 8
+
+--id-only prints the deterministic workload_id per ladder point WITHOUT torch/numpy (the id is a hash
+of the identity params, not the bytes) — runnable on a login node / in CI to prove cross-SKU identity:
+
+  python3 tests/make_workloads.py --workload deepseek-v3-v1 --ep 8 --id-only
+
+Generate every routing the suites need by running once per --routing. Idempotent (same id => same
+file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import workload as wl   # noqa: E402
+
+# Repo root holds configs/ (this file is in tests/). Used only for --workload name resolution.
+_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def resolve_manifest(name):
+    """Look a workload name up in configs/workloads.yaml and return (hidden, topk, experts) as ints.
+    Searches synthetic + model_derived; expert count = `experts` or (for model-derived) `routed_experts`.
+    Raises SystemExit if the name is unknown (listing known names) or lacks a dimension. PyYAML + stdlib."""
+    import yaml
+    path = os.path.join(_REPO, "configs", "workloads.yaml")
+    with open(path, encoding="utf-8") as fh:   # context manager: the handle is closed even if parsing fails
+        cfg = yaml.safe_load(fh) or {}         # an empty YAML document loads as None
+    known = []
+    for section in ("synthetic", "model_derived"):
+        sec = cfg.get(section) or {}
+        known += list(sec)
+        m = sec.get(name)
+        if m is not None:
+            experts = m.get("experts", m.get("routed_experts"))
+            if m.get("hidden") is None or m.get("topk") is None or experts is None:
+                raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
+            return int(m["hidden"]), int(m["topk"]), int(experts)
+    raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+
+
+def main() -> int:
+    """Parse CLI flags, then print workload ids (--id-only) or write one workload per ladder point."""
+    ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
+    ap.add_argument("--out-dir", help="required unless --id-only")
+    ap.add_argument("--workload", help="named manifest in configs/workloads.yaml (sets hidden/topk/experts)")
+    ap.add_argument("--routing", default="uniform")
+    ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
+    ap.add_argument("--hidden", type=int, help="override (default 7168, or the --workload's hidden)")
+    ap.add_argument("--topk", type=int, help="override (default 8, or the --workload's topk)")
+    ap.add_argument("--experts", type=int, help="override (default 256, or the --workload's experts)")
+    ap.add_argument("--seed", type=int, default=67)
+    ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
+    ap.add_argument("--id-only", action="store_true",
+                    help="print deterministic workload_id per point WITHOUT torch/numpy (no files written)")
+    a = ap.parse_args()
+
+    # Resolve dims: a named --workload supplies defaults; explicit --hidden/--topk/--experts override
+    # per field. With neither, fall back to the v1 DeepSeek dimensions (7168/8/256).
+    base_h, base_t, base_e = (7168, 8, 256)
+    if a.workload:
+        base_h, base_t, base_e = resolve_manifest(a.workload)
+    hidden = a.hidden if a.hidden is not None else base_h
+    topk = a.topk if a.topk is not None else base_t
+    experts = a.experts if a.experts is not None else base_e
+
+    if not a.id_only and not a.out_dir:
+        ap.error("--out-dir is required unless --id-only")
+    if a.ep < 1:
+        ap.error("--ep must be a positive integer")
+
+    ladder = sorted({int(t) for t in a.tokens_ladder.replace(",", " ").split() if int(t) > 0})
+    # Experts hosted per rank; --ep >= 1 is enforced above, so this cannot divide by zero.
+    epr = experts // a.ep
+    label = f"workload={a.workload} " if a.workload else ""
+
+    if a.id_only:
+        # Identity-only path: the workload_id is a hash of (generator|routing|hidden|topk|experts|gt|seed),
+        # so it is fully determined WITHOUT generating the trace. Proves cross-SKU identity in CI/login.
+        for T in ladder:
+            gt = T * a.ep
+            wid = wl.compute_workload_id(a.routing, hidden, topk, experts, gt, a.seed)
+            print(f"  T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}")
+        print(f"{label}id-only: {len(ladder)} workload_id(s) "
+              f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})")
+        return 0
+
+    os.makedirs(a.out_dir, exist_ok=True)
+    for T in ladder:
+        gt = T * a.ep
+        idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr)
+        wid = wl.save_workload(a.out_dir, idx, w, man)
+        print(f"  T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}  "
+              f"(trace sha {man['checksums']['trace'][:12]})")
+    print(f"{label}wrote {len(ladder)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
new file mode 100644
index 0000000000..ce01ba74cd
--- /dev/null
+++ b/experimental/CollectiveX/tests/routing.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""CollectiveX — deterministic, platform-independent MoE routing trace.
+
+Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
+ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
+is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k,
+experts_per_rank). Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
+are per-rank (same rank ⇒ same x on any platform), so a given global token id has
+identical activation everywhere without materializing a global activation tensor.
+
+Trace classes (the rank fan-out — #destination ranks a token's top-k experts touch —
+is the property that makes an EP workload representative; review caught the old
+default having fan-out 1):
+
+  * uniform   — top-k distinct experts drawn uniformly per token. The DEFAULT.
+                Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
+                8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
+  * balanced  — load-equalized AND maximally spread: token i, slot j →
+                (i + j·experts_per_rank) mod E, so the 8 experts sit one-per-rank
+                (fan-out = ep_size) and every expert is hit equally. The high-fan-out,
+                perfectly-balanced reference.
+  * balanced-rank-local — the OLD degenerate "balanced": (i·top_k + j) mod E, i.e.
+                top_k consecutive experts, which (top_k ≤ experts/rank, aligned) all
+                land on ONE rank ⇒ fan-out 1, minimum communication. Kept as an
+                explicit edge case, honestly named.
+  * zipf      — expert e is drawn with weight 1/(e + 1) (skewed load), uniform-ish fan-out.
+  * hotspot-single — expert 0 is present in every token's top-k (receive-concentration probe).
+
+Always publish the realized fan-out so the workload is never misread again
+(`routing_stats`).
+"""
+from __future__ import annotations
+
+import hashlib
+
+import torch
+
+_RANK_SUBSEED = 7919   # seed stride: rank_activations seeds its generator with seed * _RANK_SUBSEED + rank + 1
+
+
+def _cpu_gen(seed: int) -> "torch.Generator":
+    """Return a fresh CPU torch.Generator seeded with int(seed)."""
+    gen = torch.Generator(device="cpu")
+    return gen.manual_seed(int(seed))
+
+
+def build_global_routing(global_tokens: int, experts: int, topk: int,
+                         routing: str, seed: int, experts_per_rank: int, step: int = 0):
+    """Return CPU tensors (idx[gt, topk] int64, weights[gt, topk] float32) for the global batch.
+    Seeded from `seed` alone; `experts_per_rank` is consulted only by the "balanced" layout.
+    `step` survives for legacy callers and must be 0; topk > experts raises ValueError."""
+    if topk > experts:
+        raise ValueError(f"topk ({topk}) > experts ({experts})")
+    if int(step) != 0:
+        raise ValueError("nonzero routing step requires a stateful trace-replay benchmark")
+    n_tok = int(global_tokens)
+    gen = _cpu_gen(seed)
+    if routing in ("balanced", "balanced-rank-local"):
+        # Closed-form layouts: no random draws are taken for the expert ids.
+        tok = torch.arange(n_tok, dtype=torch.int64).unsqueeze(1)
+        slot = torch.arange(topk, dtype=torch.int64).unsqueeze(0)
+        if routing == "balanced":
+            # slot j is offset by j * experts_per_rank, i.e. one rank-width per slot.
+            idx = (tok + slot * int(experts_per_rank)) % experts
+        else:
+            # top_k consecutive expert ids (mod E) per token — the rank-local edge case.
+            idx = (tok * topk + slot) % experts
+    elif routing == "uniform":
+        # Sort a row of i.i.d. uniform keys; the first topk positions are distinct expert ids.
+        scores = torch.rand(n_tok, experts, generator=gen)
+        idx = torch.argsort(scores, dim=1)[:, :topk].contiguous().to(torch.int64)
+    elif routing == "zipf":
+        popularity = 1.0 / torch.arange(1, experts + 1, dtype=torch.float32)
+        popularity = (popularity / popularity.sum()).expand(n_tok, experts)
+        idx = torch.multinomial(popularity, topk, replacement=False, generator=gen).to(torch.int64)
+    elif routing == "hotspot-single":
+        # Expert 0 leads every row; the remaining topk-1 ids come from a fresh per-token shuffle.
+        cold = torch.arange(1, experts, dtype=torch.int64)
+        picks = [cold[torch.randperm(experts - 1, generator=gen)[:topk - 1]] for _ in range(n_tok)]
+        lead = torch.zeros((n_tok, 1), dtype=torch.int64)
+        idx = torch.cat([lead, torch.stack(picks).to(torch.int64)], dim=1)
+    else:
+        raise ValueError(
+            f"unknown routing '{routing}' "
+            "(uniform|balanced|balanced-rank-local|zipf|hotspot-single)")
+    # Gate weights are drawn after the ids from the same generator, then softmax-normalized per token.
+    gates = torch.softmax(torch.randn(n_tok, topk, generator=gen), dim=1).to(torch.float32)
+    return idx, gates
+
+
+# Activation VALUE distributions (goal Part 2 "activation-value sensitivity"). Under bf16 combine
+# these are latency-neutral (bf16 is value-independent — the ratio is ~1.0, the expected null
+# result); they become latency-relevant only under a quantized combine (PR311), where amax /
+# outliers / saturation drive scale computation. Kept here so the rig is ready and so the value
+# identity (activation_identity) honestly records which distribution was used.
+ACTIVATION_PROFILES = ("normal", "zeros", "small-amplitude", "wide-dynamic-range", "fp8-saturation")
+_FP8_E4M3_MAX = 448.0   # e4m3 max magnitude — the fp8-saturation profile spans ±1.5× this value
+
+
+def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
+    rows = slice(rank * tokens_per_rank, (rank + 1) * tokens_per_rank)   # this rank's global-token window
+    return idx[rows].contiguous(), weights[rows].contiguous()
+
+
+def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device,
+                     dtype=torch.bfloat16, profile: str = "normal"):
+    """Build this rank's [tokens, hidden] activations on CPU in float32, then move/cast to device/dtype.
+    The generator seed is seed * _RANK_SUBSEED + rank + 1, so every profile starts from the same
+    stream for a given (seed, rank). Profiles: normal N(0,1); zeros; small-amplitude (N(0,1) * 0.01);
+    wide-dynamic-range (N(0,1) plus ~1% spikes of N(0,1) * 250); fp8-saturation (uniform in [-1, 1)
+    scaled by 1.5 * the e4m3 max). An unrecognized profile raises ValueError."""
+    gen = _cpu_gen(int(seed) * _RANK_SUBSEED + int(rank) + 1)
+    shape = (tokens, hidden)
+    if profile == "normal":
+        vals = torch.randn(shape, generator=gen, dtype=torch.float32)
+    elif profile == "zeros":
+        vals = torch.zeros(shape, dtype=torch.float32)
+    elif profile == "small-amplitude":
+        vals = torch.randn(shape, generator=gen, dtype=torch.float32) * 0.01
+    elif profile == "wide-dynamic-range":
+        # Draw order matters for reproducibility: base values, then the spike mask, then spike sizes.
+        base = torch.randn(shape, generator=gen, dtype=torch.float32)
+        spike_mask = (torch.rand(shape, generator=gen) < 0.01).float()
+        vals = base + spike_mask * torch.randn(shape, generator=gen, dtype=torch.float32) * 250.0
+    elif profile == "fp8-saturation":
+        # Map uniform [0, 1) onto [-1, 1), then stretch past the e4m3 maximum.
+        unit = torch.rand(shape, generator=gen, dtype=torch.float32) * 2.0 - 1.0
+        vals = unit * (_FP8_E4M3_MAX * 1.5)
+    else:
+        raise ValueError(f"unknown activation profile '{profile}' (one of {ACTIVATION_PROFILES})")
+    return vals.to(device=device, dtype=dtype)
+
+
+def placement_perm(ep_size: int, gpus_per_node: int, placement: str) -> list:
+    """phys[logical_rank] -> physical slot, per placement kind (goal Part 2 placement matrix).
+    The physical slot's node = slot // gpus_per_node, domain = slot // scale_up_domain. Single
+    node (ep <= gpus_per_node) makes every placement identical (everything is same-node). The
+    result is always a permutation of range(ep_size), including when the last node is partly filled.
+
+      packed         identity — fill one node/domain before crossing (latency-oriented default).
+      runtime-native identity for now — reproduces the serving placement (link via recipe meta).
+      striped        round-robin logical ranks across nodes (exposes inter-node transport).
+      adversarial    the striped order reversed; any other name falls back to identity.
+    """
+    n = ep_size
+    if gpus_per_node <= 0 or gpus_per_node >= n or placement in ("packed", "runtime-native"):
+        return list(range(n))
+    nodes = (n + gpus_per_node - 1) // gpus_per_node
+    # Visit intra-node slot 0 of every node, then slot 1, ... so logical r lands on node r % nodes.
+    # Slots >= n are skipped (not clamped), so a ragged last node never maps two ranks to one slot.
+    striped = [node * gpus_per_node + lane for lane in range(gpus_per_node)
+               for node in range(nodes) if node * gpus_per_node + lane < n]
+    if placement == "striped":
+        return striped
+    return striped[::-1] if placement == "adversarial" else list(range(n))
+
+
+def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
+                     gpus_per_node: int, scale_up_domain: int = None,
+                     placement: str = "packed") -> dict:
+    """Classify every routed (token, expert-slot) copy by how far it travels under `placement`.
+    Source rank = global token id // tokens_per_rank; destination rank = idx // experts_per_rank
+    (both clamped to ep_size - 1). Ranks map to physical slots through placement_perm; node and
+    scale-up-domain ids are slot // gpus_per_node and slot // scale_up_domain respectively."""
+    top_rank = ep_size - 1
+    dst_rank = (idx // experts_per_rank).clamp(max=top_rank)            # [gt, topk] logical dest rank
+    token_ids = torch.arange(idx.shape[0])
+    src_rank = (token_ids // max(1, tokens_per_rank)).clamp(max=top_rank).unsqueeze(1).expand_as(dst_rank)
+    # A falsy scale_up_domain (None or 0) means one domain spanning every slot.
+    domain_width = scale_up_domain or (gpus_per_node * ep_size)
+    slot_of = torch.tensor(placement_perm(ep_size, gpus_per_node, placement), dtype=torch.int64)
+    dst_slot, src_slot = slot_of[dst_rank], slot_of[src_rank]
+
+    def frac(mask) -> float:
+        return float(mask.float().mean())   # share of copies where the predicate holds
+
+    node_match = (dst_slot // gpus_per_node) == (src_slot // gpus_per_node)
+    domain_match = (dst_slot // domain_width) == (src_slot // domain_width)
+    return {
+        "placement": placement,
+        "local_rank_fraction": frac(dst_rank == src_rank),
+        "same_node_fraction": frac(node_match),
+        "same_scaleup_domain_fraction": frac(domain_match),
+        "cross_node_fraction": frac(~node_match),
+        "cross_domain_fraction": frac(~domain_match),
+        "gpus_per_node": gpus_per_node, "scale_up_domain": domain_width, "copies": int(dst_rank.numel()),
+    }
+
+
+def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
+    """Summarize the realized GLOBAL routing trace so fan-out and load are published per point.
+    idx is the global [gt, topk] expert-id tensor; weights, when given, the matching gate weights
+    (hashed together with idx). Rank count = max(1, experts // max(1, experts_per_rank)).
+    """
+    n_ranks = max(1, experts // max(1, experts_per_rank))
+    dest = (idx // experts_per_rank).clamp(max=n_ranks - 1)   # [gt, topk] destination rank, clamped
+    # Fan-out: number of distinct destination ranks each token touches.
+    touched = torch.zeros(idx.shape[0], n_ranks, dtype=torch.bool)
+    touched.scatter_(1, dest, True)
+    fanout = touched.sum(dim=1)                                # [gt]
+    expert_load = torch.bincount(idx.reshape(-1), minlength=experts).float()
+    # Token-copies SENT to each destination rank (the "send histogram").
+    rank_load = torch.bincount(dest.reshape(-1), minlength=n_ranks).float()
+    mean_expert_load = float(expert_load.mean())
+
+    def _cv(t):
+        # Coefficient of variation with the population std; 0.0 when the mean is not positive.
+        mu = float(t.mean())
+        return float(t.std(unbiased=False) / mu) if mu > 0 else 0.0
+
+    def _sha16(blob):
+        # First 16 hex characters of the SHA-256 digest.
+        return hashlib.sha256(blob).hexdigest()[:16]
+
+    # Identity hashes: idx as int32 bytes, weights as float32 bytes, and both concatenated.
+    # Without weights the combined routing_hash is simply the idx hash.
+    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
+    idx_hash = _sha16(idx_bytes)
+    w_hash, routing_hash = None, idx_hash
+    if weights is not None:
+        w_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
+        w_hash, routing_hash = _sha16(w_bytes), _sha16(idx_bytes + w_bytes)
+    return {
+        # fan-out per token (fanout_hist index k-1 = #tokens with fan-out k)
+        "fanout_mean": float(fanout.float().mean()),
+        "fanout_min": int(fanout.min()),
+        "fanout_max": int(fanout.max()),
+        "fanout_hist": torch.bincount(fanout, minlength=n_ranks + 1)[1:n_ranks + 1].tolist(),
+        "rank_load_hist": [int(x) for x in rank_load.tolist()],
+        "routed_copies": int(fanout.sum()),
+        # load per expert / per destination rank, plus one-number imbalance summaries
+        "expert_load_min": int(expert_load.min()),
+        "expert_load_max": int(expert_load.max()),
+        "expert_load_mean": mean_expert_load,
+        "expert_load_cv": _cv(expert_load),
+        "rank_load_cv": _cv(rank_load),
+        "hotspot_ratio": float(expert_load.max() / expert_load.mean()) if mean_expert_load > 0 else 0.0,
+        "dest_rank_load_max": int(rank_load.max()),
+        "dest_rank_load_mean": float(rank_load.mean()),
+        "empty_expert_count": int((expert_load == 0).sum()),
+        "empty_rank_count": int((rank_load == 0).sum()),
+        # workload identity
+        "routing_hash": routing_hash,
+        "idx_hash": idx_hash,
+        "weights_hash": w_hash,
+    }
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":  # self-test (needs torch): routing statistics + activation value profiles
+    import sys
+    E, TOPK, EPR, GT = 256, 8, 32, 4096
+    # (1) hotspot trace: column 0 is always expert 0, and the first 16 tokens have distinct top-k.
+    hot_idx, _ = build_global_routing(GT, E, TOPK, "hotspot-single", 67, EPR)
+    assert (hot_idx[:, 0] == 0).all(), "hotspot-single must pin expert 0 on every step"
+    assert all(len(set(row.tolist())) == TOPK for row in hot_idx[:16]), "hotspot top-k must stay distinct"
+    # (2) hotspot_ratio separates the two traces; the extra stat keys are present.
+    uni_idx, _ = build_global_routing(GT, E, TOPK, "uniform", 67, EPR)
+    su, sh = routing_stats(uni_idx, E, EPR), routing_stats(hot_idx, E, EPR)
+    assert su["hotspot_ratio"] < 1.5 and sh["hotspot_ratio"] > 5, "hotspot_ratio must separate uniform/hotspot"
+    assert sh["empty_expert_count"] >= 0 and "empty_rank_count" in sh and "dest_rank_load_max" in sh
+    print(f"routing stats OK (uniform hotspot_ratio={su['hotspot_ratio']:.2f} "
+          f"hotspot empty_experts={sh['empty_expert_count']} dest_rank_max={sh['dest_rank_load_max']})")
+    # (3) value profiles on an 8x256 float32 CPU sample for seed 67 / rank 0: zeros are exactly
+    # zero, fp8-saturation exceeds the e4m3 max, small-amplitude stays below 1, all are finite.
+
+    def _sample(profile):
+        return rank_activations(8, 256, 67, 0, torch.device("cpu"), dtype=torch.float32, profile=profile)
+
+    assert float(_sample("zeros").abs().max()) == 0.0, "zeros profile must be all-zero"
+    assert float(_sample("fp8-saturation").abs().max()) > _FP8_E4M3_MAX, "fp8-saturation must exceed e4m3 max"
+    assert float(_sample("small-amplitude").abs().max()) < 1.0, "small-amplitude must be tiny"
+    for prof in ACTIVATION_PROFILES:
+        assert torch.isfinite(_sample(prof)).all(), f"{prof} produced non-finite values"
+    print(f"activation profiles OK ({', '.join(ACTIVATION_PROFILES)})")
+    print("routing self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 0000000000..95443322d2
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
+
+Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
+via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
+are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
+      --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
+      --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
+      --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
+      --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import ep_harness  # noqa: E402  (stdlib-only; safe before torch)
+
+
+def main() -> int:
+    """Run one EP sweep; returns 2 (sampling contract), 3 (no torch), 5 (rejected config) or backend.finalize(rc)."""
+    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    ap.add_argument("--backend", required=True,
+                    choices=["deepep", "deepep-hybrid", "mori", "uccl", "nccl-ep", "flashinfer"])
+    ep_harness.add_common_args(ap)
+    args = ap.parse_args()
+
+    sampling_error = ep_harness.sampling_contract_error(args.iters, args.trials, args.warmup)
+    if sampling_error:
+        print(f"ERROR: {sampling_error}", file=sys.stderr)
+        return 2
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+
+    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
+    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
+    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
+    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
+    if getattr(args, "eplb", False):
+        import eplb
+        args.num_logical_experts = args.experts
+        args.experts = eplb.physical_count(args.experts, args.num_redundant_experts, world_size)
+
+    # Reproduction provenance (recorded in the artifact). srun-launched racks and torchrun single
+    # nodes are labelled separately; argv is shell-quoted so multi-word values paste back verbatim.
+    import shlex
+    if os.environ.get("TORCHELASTIC_RUN_ID"):
+        args.distributed_launcher = "torchrun"
+        prefix = f"torchrun --nproc_per_node={world_size}"
+    else:
+        args.distributed_launcher = "rank-environment"
+        prefix = f"RANK={rank} WORLD_SIZE={world_size} LOCAL_RANK={local_rank} python3"
+    args.reproduction_command = f"{prefix} tests/run_ep.py " + shlex.join(sys.argv[1:])
+    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
+    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
+    # Container provenance (goal P1): arch (amd64/arm64) + local squash hash for Enroot/Pyxis.
+    import platform as _plat
+    _arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(_plat.machine(), _plat.machine())
+    args.image_arch = _arch
+    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
+    # Complete GitHub provenance (goal P1): repo, run id, attempt, ref/branch, source SHA, job,
+    # artifact. A result is only publication-'official' when these are present (validity gate).
+    _run = {"run_id": os.environ.get("GITHUB_RUN_ID"),
+            "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+            "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+            "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
+            "repo": os.environ.get("GITHUB_REPOSITORY"),
+            "job": os.environ.get("GITHUB_JOB"),
+            "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME")}
+    args.git_run = _run if any(_run.values()) else None
+
+    # Import the backend CLASS (module-top imports torch + the backend lib; no process group
+    # needed) and REJECT unsupported combos BEFORE init, never falling back; all ranks reject alike.
+    if args.backend == "mori":
+        from ep_mori import MoRIBackend as Backend
+    elif args.backend == "nccl-ep":
+        from ep_nccl import NCCLBackend as Backend
+    elif args.backend == "uccl":
+        from ep_uccl import UCCLBackend as Backend
+    elif args.backend == "flashinfer":
+        from ep_flashinfer import FlashInferBackend as Backend
+    elif args.backend == "deepep-hybrid":
+        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
+    else:
+        from ep_deepep import DeepEPBackend as Backend
+    if args.num_ep_groups != 1:
+        if rank == 0:
+            print(f"ERROR: num_ep_groups={args.num_ep_groups} REJECTED — real subgroup process "
+                  f"groups are unimplemented; not faking it.", file=sys.stderr)
+        return 5
+    sp = getattr(Backend, "SUPPORTED_PRECISIONS", {"bf16"})
+    sm = getattr(Backend, "SUPPORTED_MODES", {"normal"})
+    if args.dispatch_dtype not in sp or args.mode not in sm:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS dispatch-dtype={args.dispatch_dtype} / "
+                  f"mode={args.mode} — not supported on this build (no fallback). "
+                  f"supported precisions={sorted(sp)} modes={sorted(sm)}.", file=sys.stderr)
+        return 5
+    # Combine-path capability (review: dispatch_dtype=fp8 must NOT silently imply quantized
+    # combine). A backend that declares no SUPPORTED_COMBINE_* sets is held to bf16 / none.
+    scd = getattr(Backend, "SUPPORTED_COMBINE_DTYPES", {"bf16"})
+    sqm = getattr(Backend, "SUPPORTED_COMBINE_QUANT_MODES", {"none"})
+    cdt = getattr(args, "combine_dtype", "bf16")
+    cqm = getattr(args, "combine_quant_mode", "none")
+    if cdt not in scd or cqm not in sqm:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS combine-dtype={cdt} / combine-quant-mode={cqm} "
+                  f"— quant combine not wired (no fallback). supported combine_dtypes={sorted(scd)} "
+                  f"quant_modes={sorted(sqm)}.", file=sys.stderr)
+        return 5
+    # Measurement-contract capability (review #3): each adapter conforms to a declared
+    # contract; reject anything else rather than letting it pick its own timing boundary.
+    sc = getattr(Backend, "SUPPORTED_CONTRACTS", {"layout-and-dispatch-v1"})
+    if args.measurement_contract not in sc:
+        if rank == 0:
+            print(f"ERROR: {args.backend} REJECTS measurement-contract="
+                  f"{args.measurement_contract} — supported={sorted(sc)}.", file=sys.stderr)
+        return 5
+    if args.measurement_contract == "cached-layout-comm-only-v1" and args.mode == "ll":
+        if rank == 0:
+            print("ERROR: cached-layout-comm-only-v1 is meaningless for LL (low_latency_dispatch "
+                  "computes its layout internally; nothing to hoist).", file=sys.stderr)
+        return 5
+
+    # MoRI uses the gloo+NCCL group shape from its reference; other adapters use NCCL/RCCL.
+    if not dist.is_initialized():
+        if args.backend == "mori":
+            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world_size,
+                                    device_id=device)
+        else:
+            dist.init_process_group("nccl")
+
+    # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its
+    # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an
+    # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise.
+    try:
+        backend = Backend(args, rank, world_size, local_rank, device)
+        if rank == 0:
+            print(f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
+                  f"world={world_size} ep_size={world_size} hidden={args.hidden} "
+                  f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype} "
+                  f"routing={args.routing} seed={args.seed}")
+        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    except Exception:
+        import traceback
+        print(f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n" + traceback.format_exc(),
+              flush=True)
+        raise
+    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
+    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/test_sampling_contract.py b/experimental/CollectiveX/tests/test_sampling_contract.py
new file mode 100644
index 0000000000..ae530dc524
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_sampling_contract.py
@@ -0,0 +1,845 @@
+#!/usr/bin/env python3
+"""Focused tests for the CollectiveX fixed EP sampling contract."""
+from __future__ import annotations
+
+import argparse
+import ast
+import copy
+import json
+import os
+import subprocess
+import sys
+import tempfile
+import unittest
+from unittest import mock
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+ROOT = os.path.dirname(HERE)
+sys.path.insert(0, HERE)
+sys.path.insert(0, ROOT)
+
+import ep_harness  # noqa: E402
+import aggregate_results as ar  # noqa: E402
+import capability  # noqa: E402
+import generate_matrix  # noqa: E402
+import make_bundle  # noqa: E402
+import summarize  # noqa: E402
+import sweep_matrix  # noqa: E402
+import validate_results as vr  # noqa: E402
+
+
+def _hist(n: int) -> dict:
+    return dict(n=n, min=1.0, max=1.0, bins=40, counts=[n])  # all n samples in one bucket
+
+
+def _doc(iters: int = 8, trials: int = 64, warmup: int = 32, samples: int = 512) -> dict:
+    # Schema-v5 MoE result fixture; the arguments override only the sampling-related fields.
+    latency = {"p50": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0}
+    row = {
+        "tokens_per_rank": 8,
+        "global_tokens": 64,
+        "samples_pooled": samples,
+        "trials": trials,
+        "dispatch": dict(latency),
+        "combine": dict(latency),
+        "roundtrip": dict(latency),
+        "isolated_sum": {},
+        "byte_contracts": {
+            "token_rank_payload_copies": 64,
+            "token_expert_payload_copies": 512,
+            "dispatch_bytes": 1,
+            "combine_bytes": 1,
+        },
+        "correct": True,
+        "raw_samples": {op: _hist(samples) for op in ("dispatch", "combine", "roundtrip")},
+    }
+    reproduction = {
+        "command": "python3 tests/run_ep.py",
+        "seed": 67,
+        "measurement_contract": "layout-and-dispatch-v1",
+        "sampling_contract": "fixed-512-v1",
+        "samples_per_point": samples,
+        "iters": iters,
+        "trials": trials,
+        "warmup": warmup,
+        "warmup_semantics": "full-roundtrip-per-trial-point-v1",
+    }
+    return {
+        "schema_version": 5,
+        "family": "moe",
+        "runner": "test-runner",
+        "backend": "deepep",
+        "mode": "normal",
+        "phase": "decode",
+        "ep_size": 8,
+        "publication_status": "comparable-experimental",
+        "measurement_contract": "layout-and-dispatch-v1",
+        "shape": {
+            "hidden": 7168,
+            "topk": 8,
+            "experts": 256,
+            "experts_per_rank": 32,
+            "dispatch_dtype": "bf16",
+            "routing": "uniform",
+        },
+        "validity": {
+            "execution_status": "complete",
+            "semantic_correctness": "pass",
+            "workload_identity": "consistent-across-ranks",
+            "workload_source": "seeded-runtime",
+            "measurement_conformance": "conformant",
+            "sampling_conformance": "conformant",
+            "resource_conformance": "backend-default",
+            "provenance_complete": False,
+            "anomaly_free": True,
+        },
+        "workload": {
+            "source": "seeded-runtime",
+            "workload_id": None,
+            "trace_signature": "abc",
+            "cross_rank_consistent": True,
+        },
+        "reproduction": reproduction,
+        "placement": {
+            "kind": "packed", "nodes": 1,
+            "gpus_per_node": 8,
+            "scale_up_domain": 8,
+            "ranks": 8,
+        },
+        "backend_provenance": {},
+        "comparison_key": "fixture-comparison-key",
+        "anomalies": [],
+        "anomaly_summary": {"waived": False},
+        "rows": [row],
+    }
+
+
+def _failed(case: dict, generated_at="2026-07-03T00:00:00Z", **fields) -> dict:
+    # Failed-case record fixture for *case*; extra keyword fields are merged in last.
+    failure = {"failure_mode": "timeout", "return_code": 124, "case": case}
+    record = dict(schema_version=5, family="moe", record_type="failed-case",
+                  runner="h100-dgxc_01", topology_class="h100-nvlink-island",
+                  backend=case["backend"], phase=case["phase"], publication_status="failed",
+                  generated_at=generated_at, rows=[], failure=failure)
+    return {**record, **fields}
+
+
+class SamplingContractTest(unittest.TestCase):
+    def test_constants_and_default_profile_match_validator(self) -> None:
+        # The harness and the validator must agree on every sampling constant.
+        self.assertEqual(ep_harness.SCHEMA_VERSION, 5)
+        for name in ("SAMPLING_CONTRACT", "TIMED_SAMPLES_PER_POINT", "TIMED_ITERS_PER_TRIAL",
+                     "TRIALS_PER_POINT", "WARMUP_ITERS_PER_TRIAL", "WARMUP_SEMANTICS"):
+            self.assertEqual(getattr(ep_harness, name), getattr(vr, name))
+        self.assertIsNone(ep_harness.sampling_contract_error(8, 64, 32))
+
+        # The command-line defaults must resolve to the same 8:64:32 profile.
+        cli = argparse.ArgumentParser()
+        ep_harness.add_common_args(cli)
+        parsed = cli.parse_args(
+            ["--runner", "test", "--topology-class", "test-topology", "--out", "result.json"])
+        self.assertEqual((parsed.iters, parsed.trials, parsed.warmup), (8, 64, 32))
+
+        # Schema registry: versions 3 and 4 share one object; v5 pins the profile as constants.
+        registry = vr.load_schema_registry()
+        self.assertEqual(sorted(registry), [3, 4, 5])
+        self.assertIs(registry[3], registry[4])
+        v5_props = registry[5]["properties"]
+        self.assertEqual(v5_props["schema_version"]["const"], 5)
+        repro = v5_props["reproduction"]["properties"]
+        self.assertEqual(tuple(repro[key]["const"] for key in ("iters", "trials", "warmup")),
+                         (8, 64, 32))
+        self.assertEqual(repro["warmup_semantics"]["const"],
+                         "full-roundtrip-per-trial-point-v1")
+
+    def test_non_exact_profiles_are_rejected_even_when_the_product_is_512(self) -> None:
+        rejected = ((200, 3, 32), (8, 1, 4), (128, 4, 32), (8, 64, 4), (0, 64, 32))
+        for iters, trials, warmup in rejected:
+            expected = f"got {iters}:{trials}:{warmup}"
+            message = ep_harness.sampling_contract_error(iters, trials, warmup)
+            self.assertIn(expected, message)
+
+    def test_valid_comparison_grade_fixture_passes(self) -> None:
+        # The untouched fixture must validate with no errors and no warnings.
+        outcome = vr.validate_doc(_doc(), vr.load_schema_registry(), "fixture.json")
+        problems, notices, tier = outcome
+        self.assertEqual(tier, "comparable-experimental")
+        self.assertEqual((problems, notices), ([], []))
+
+    def test_tampered_sample_counts_cannot_remain_comparison_grade(self) -> None:
+        for index, mutate in enumerate((
+            lambda d: d["reproduction"].update(iters=200, trials=3, samples_per_point=600),
+            lambda d: d["reproduction"].update(iters=128, trials=4),
+            lambda d: d["reproduction"].update(warmup=4),
+            lambda d: d["reproduction"].update(warmup_semantics="operation-specific-v0"),
+            lambda d: d["rows"][0].update(samples_pooled=600),
+            lambda d: d["rows"][0]["raw_samples"]["roundtrip"].update(n=8, counts=[8]),
+            lambda d: d["rows"][0]["raw_samples"]["dispatch"].update(counts=[511]),
+        )):
+            with self.subTest(mutation=index):  # a lambda's repr would not identify the case
+                doc = copy.deepcopy(_doc())
+                mutate(doc)
+                errors, _warnings, _status = vr.validate_doc(doc, None, "tampered.json")
+                self.assertTrue(any("sampling" in error for error in errors), errors)
+
+    def test_all_sweep_cases_use_the_exact_profile(self) -> None:
+        script = os.path.join(ROOT, "sweep_matrix.py")
+        with tempfile.TemporaryDirectory() as workdir:
+            matrix_path = os.path.join(workdir, "matrix.json")
+            command = [sys.executable, script, "--suites", "all", "--backends", "all",
+                       "--out", matrix_path]
+            done = subprocess.run(command, cwd=ROOT, text=True, capture_output=True, check=False)
+            self.assertEqual(done.returncode, 0, done.stderr or done.stdout)
+            with open(matrix_path) as handle:
+                shards = json.load(handle)["include"]
+        cases = [case for shard in shards for case in shard["cases"]]
+        self.assertTrue(cases)
+        self.assertEqual(len(shards), 39)
+        self.assertEqual(len(cases), 232)
+        # An empty ladder is counted as 8 points for decode and 6 for any other phase.
+        explicit = [case["ladder"].split() for case in cases if case["ladder"]]
+        implicit = [8 if case["phase"] == "decode" else 6 for case in cases if not case["ladder"]]
+        self.assertEqual(sum(map(len, explicit)) + sum(implicit), 618)
+        # Every case must carry the one fixed sampling profile.
+        for key, expected in (("timing", {"8:64:32"}), ("samples_per_point", {512}),
+                              ("warmup_semantics", {"full-roundtrip-per-trial-point-v1"})):
+            self.assertEqual({case[key] for case in cases}, expected)
+        expected_skus = {"b200-dgxc", "b300", "gb200", "gb300", "h100-dgxc", "h200-dgxc",
+                         "mi325x", "mi355x"}
+        self.assertEqual({shard["sku"] for shard in shards}, expected_skus)
+        # Each shard must mirror its platform registry entry and name a launcher that exists.
+        for shard in shards:
+            spec = capability.PLATFORMS[shard["sku"]]
+            self.assertEqual(shard["launcher"], spec["launcher"])
+            topology = (spec["gpus_per_node"], spec["scale_up_domain"])
+            self.assertEqual(shard["gpus_per_node"], topology[0])
+            self.assertEqual(shard["scale_up_domain"], topology[1])
+            self.assertTrue(all((case["gpus_per_node"], case["scale_up_domain"]) == topology
+                                for case in shard["cases"]))
+            launcher = os.path.join(ROOT, "launchers", f"launch_{shard['launcher']}.sh")
+            self.assertTrue(os.path.isfile(launcher))
+        for key, expected in (("suite", {"ep-core-v1", "ep-routing-v1"}), ("mode", {"normal"}),
+                              ("dtype", {"bf16"}), ("contract", {"layout-and-dispatch-v1"}),
+                              ("workload", {"deepseek-v3-v1"})):
+            self.assertEqual({case[key] for case in cases}, expected)
+        # Case ids are unique and prefixed; every case is canonical; EPLB implies zipf routing.
+        identifiers = [case["case_id"] for case in cases]
+        self.assertEqual(len(set(identifiers)), len(identifiers))
+        self.assertFalse([ident for ident in identifiers if not ident.startswith("cxv1-")])
+        self.assertFalse([case for case in cases if not case["canonical"]])
+        self.assertFalse([case for case in cases if case["eplb"] and case["routing"] != "zipf"])
+
+    def test_matrix_uses_public_gha_platform_registry(self) -> None:
+        real_load = generate_matrix._load
+        excluded_files = {"platforms.yaml", "backends.yaml"}
+
+        def guarded_load(name: str):
+            # Fail the test if generation asks for either excluded file.
+            self.assertNotIn(name, excluded_files)
+            return real_load(name)
+
+        with mock.patch.object(generate_matrix, "_load", side_effect=guarded_load):
+            generated = generate_matrix.generate("ep-core-v1")
+        produced = generated["cases"]
+        self.assertTrue(produced)
+
+        # The suite, the generated cases and capability.PLATFORMS must name the same platforms.
+        public_platforms = {"h100-dgxc", "h200-dgxc", "b200-dgxc", "b300", "gb200", "gb300",
+                            "mi325x", "mi355x"}
+        suites = generate_matrix._load("suites.yaml")["suites"]
+        self.assertLessEqual(set(suites["ep-core-v1"]["platforms"]), set(capability.PLATFORMS))
+        self.assertEqual({case["platform"] for case in produced}, public_platforms)
+        self.assertEqual(set(capability.PLATFORMS), public_platforms)
+        # resolve() must report a falsy first element for these two requests.
+        b300_ll = capability.resolve("b300", "deepep", mode="ll")
+        self.assertFalse(b300_ll[0])
+        self.assertFalse(capability.resolve("h200", "deepep")[0])
+
+    def test_backend_ladder_limits_apply_after_backend_expansion(self) -> None:
+        resolve = sweep_matrix._resolved_ladder
+
+        # ((ladder, backend, routing, platform), expected result); the phase is always prefill.
+        expectations = (
+            (("128 256 512", "mori", "uniform", "mi355x"), "128 256 512"),
+            (("512 2048", "mori", "zipf", "mi355x"), None),
+            (("512 2048", "mori", "zipf", "mi325x"), "512"),
+            (("512 2048", "nccl-ep", "zipf", "mi355x"), "512 2048"),
+        )
+
+        for (ladder, backend, routing, sku), expected in expectations:
+            resolved = resolve(ladder, "prefill", backend, routing, sku)
+            if expected is None:
+                # No ladder at all is expected for this combination.
+                self.assertIsNone(resolved)
+            else:
+                self.assertEqual(resolved, expected)
+
+    def test_backend_filter_does_not_add_the_amd_native_backend(self) -> None:
+        def selected(option: str, backend: str) -> tuple[set[str], set[str]]:
+            # Run sweep_matrix.py with one filter; return the (backends, skus) of its shards.
+            with tempfile.TemporaryDirectory() as scratch:
+                target = os.path.join(scratch, "matrix.json")
+                argv = [sys.executable, os.path.join(ROOT, "sweep_matrix.py"), "--suites", "all",
+                        option, backend, "--out", target]
+                run = subprocess.run(argv, cwd=ROOT, text=True, capture_output=True, check=False)
+                self.assertEqual(run.returncode, 0, run.stderr or run.stdout)
+                with open(target) as handle:
+                    include = json.load(handle)["include"]
+            backends_seen = {shard["backend"] for shard in include}
+            return backends_seen, {shard["sku"] for shard in include}
+
+        deepep_backends, _deepep_skus = selected("--backend", "deepep")
+        self.assertEqual(deepep_backends, {"deepep"})
+        self.assertEqual(selected("--backend", "mori"), ({"mori"}, {"mi325x", "mi355x"}))
+        nccl_backends, nccl_skus = selected("--backend", "nccl-ep")
+        self.assertEqual((nccl_backends, nccl_skus), ({"nccl-ep"}, set(capability.PLATFORMS)))
+
+    def test_official_workloads_require_a_pinned_source(self) -> None:
+        pinned = {"verified_against": "pinned"}
+        suite = {"workloads": ["deepseek-v3-v1"], "required_publication": "official"}
+        catalog = {"model_derived": {"deepseek-v3-v1": pinned}}
+        generate_matrix.validate_workloads("core", suite, catalog)  # accepted while pinned
+        del pinned["verified_against"]
+        self.assertRaises(SystemExit, generate_matrix.validate_workloads, "core", suite, catalog)
+
+    def test_gradual_conditioning_does_not_expand_scored_ladder(self) -> None:
+        scored = [512]
+        ramp = ep_harness.conditioning_ladder(scored, True)
+        self.assertEqual(ramp, [2 ** exponent for exponent in range(10)])
+        self.assertEqual(scored, [512])  # the scored ladder itself must be left untouched
+
+    def test_bundle_coverage_requires_one_result_at_the_required_tier(self) -> None:
+        case_id = "cxv1-0123456789abcdefabcd"
+        expected_case = {"case_id": case_id, "required_publication": "official"}
+        matrix = {"include": [{"cases": [expected_case]}]}
+        official = {"family": "moe", "case_id": case_id, "publication_status": "official",
+                    "required_publication": "official"}
+        coverage = make_bundle.validate_expected_coverage([official], matrix)
+        self.assertEqual(coverage, {"expected": 1, "observed": 1, "complete": True})
+
+        below_tier = {**official, "publication_status": "comparable-experimental"}
+        rejected = (
+            [below_tier],          # present, but below the required tier
+            [],                    # missing entirely
+            [official, official],  # reported twice
+        )
+        for results in rejected:
+            with self.assertRaises(SystemExit):
+                make_bundle.validate_expected_coverage(results, matrix)
+
+    def test_ep_result_producer_never_inlines_environment_documents(self) -> None:
+        harness = os.path.join(ROOT, "tests", "ep_harness.py")
+        with open(harness) as handle:
+            module = ast.parse(handle.read(), harness)
+        # No constant equal to "environment" may appear anywhere in the harness source.
+        literals = (node for node in ast.walk(module) if isinstance(node, ast.Constant))
+        offenders = [node for node in literals if node.value == "environment"]
+        self.assertFalse(offenders)
+
+    def test_environment_capture_calls_are_redacted(self) -> None:
+        for relative in ("runtime/run_in_container.sh",):
+            with self.subTest(callsite=relative):
+                with open(os.path.join(ROOT, relative)) as script:
+                    invocations = [text for text in script if "env_capture.py" in text]
+                # At least one invocation exists and none of them omits --redact.
+                self.assertTrue(invocations)
+                self.assertFalse([text for text in invocations if "--redact" not in text])
+
+    def test_flashinfer_retries_preserve_attempt_evidence(self) -> None:
+        with open(os.path.join(ROOT, "runtime", "run_in_container.sh")) as handle:
+            container_script = handle.read()
+        self.assertIn('export CX_ATTEMPT_ID="$a"', container_script)
+        self.assertNotIn('rm -f results/failed_', container_script)
+        for name in ("launch_gb200-nv.sh", "launch_gb300-nv.sh"):
+            with open(os.path.join(ROOT, "launchers", name)) as handle:
+                launcher_script = handle.read()
+            for needle in ('CX_FLASHINFER_RETRIES:-3', 'export CX_ATTEMPT_ID="$attempt"'):
+                self.assertIn(needle, launcher_script)
+        with open(os.path.join(ROOT, "runtime", "common.sh")) as handle:
+            self.assertIn('"attempt_id": env("CX_ATTEMPT_ID", "1")', handle.read())
+
+    def test_rack_build_only_uses_shared_backend_preparation(self) -> None:
+        with open(os.path.join(ROOT, "runtime", "run_in_container.sh")) as handle:
+            script = handle.read()
+        for fragment in ("cx_prepare_backend()", 'cx_prepare_backend "${CX_BENCH:-}"',
+                         "cx_persist_backend_env"):
+            self.assertIn(fragment, script)
+
+    def test_uccl_build_is_idempotent_within_a_shard(self) -> None:
+        with open(os.path.join(ROOT, "runtime", "run_in_container.sh")) as handle:
+            runtime = handle.read()
+        runtime_markers = (
+            "[ -f /tmp/.cx_built_uccl ]", ": > /tmp/.cx_built_uccl", "DEEPEP_COMMIT",
+            "FLASHINFER_COMMIT", "CX_FLASHINFER_STACK",
+            'python3 -c "from deep_ep import Buffer"', '[ "${CX_FLASHINFER_UPGRADE:-}" = "1" ]',
+            "cx_prepare_backend deepep", "cx_prepare_backend deepep-hybrid",
+            "cx_prepare_backend flashinfer",
+        )
+        for marker in runtime_markers:
+            self.assertIn(marker, runtime)
+        for launcher in ("launch_gb200-nv.sh", "launch_gb300-nv.sh"):
+            with self.subTest(launcher=launcher):
+                with open(os.path.join(ROOT, "launchers", launcher)) as handle:
+                    rack = handle.read()
+                for marker in ("CX_BUILD_ONLY=1", 'cx_die "EP backend preparation failed"',
+                               "/tmp/.cx_backend_env"):
+                    self.assertIn(marker, rack)
+                self.assertNotIn("/tmp/.cx_hybrid_env", rack)
+
+    def test_rack_launchers_pass_public_topology_and_manual_gb300_defaults_one_node(self) -> None:
+        gpn_variable = {"launch_gb200-nv.sh": "GPUS_PER_NODE", "launch_gb300-nv.sh": "GPN"}
+        for launcher, variable in gpn_variable.items():
+            with self.subTest(launcher=launcher):
+                with open(os.path.join(ROOT, "launchers", launcher)) as handle:
+                    text = handle.read()
+                self.assertIn('--gpus-per-node "$' + variable + '"', text)
+                self.assertIn('--scale-up-domain "$SCALE_UP_DOMAIN"', text)
+        with open(os.path.join(ROOT, "launchers", "launch_gb300-nv.sh")) as handle:
+            manual = handle.read()
+        self.assertIn('NODES="${CX_NODES:-1}"', manual)
+        self.assertNotIn('NODES="${CX_NODES:-2}"', manual)
+
+    def test_flashinfer_rack_mapping_never_falls_back_to_world_as_node_size(self) -> None:
+        with open(os.path.join(ROOT, "tests", "ep_flashinfer.py")) as handle:
+            text = handle.read()
+        functions = (node for node in ast.parse(text).body if isinstance(node, ast.FunctionDef))
+        builder = next(fn for fn in functions if fn.name == "_build_mapping")
+        parameters = [parameter.arg for parameter in builder.args.args]
+        self.assertEqual(parameters, ["world_size", "rank", "gpus_per_node"])
+        # The source must compare gpus_per_node to world_size, never pass one as the other.
+        self.assertNotIn("gpus_per_node=world_size", text)
+        self.assertIn("if gpus_per_node == world_size", text)
+
+    def test_sm_budget_setters_fail_instead_of_recording_an_unapplied_request(self) -> None:
+        for adapter, library in {"ep_deepep.py": "DeepEP", "ep_uccl.py": "UCCL"}.items():
+            with self.subTest(adapter=adapter):
+                with open(os.path.join(ROOT, "tests", adapter)) as handle:
+                    text = handle.read()
+                self.assertIn('raise RuntimeError(f"%s did not apply requested num_sms=' % library, text)
+                self.assertIn('"requested_num_sms": num_sms', text)
+                self.assertIn('"num_sms": applied_num_sms', text)
+
+    def test_nccl_version_normalizes_integer_and_tuple_and_labels_rccl(self) -> None:
+        adapter = os.path.join(ROOT, "tests", "ep_nccl.py")
+        with open(adapter) as handle:
+            text = handle.read()
+        target = "_format_collective_version"
+        definition = next(node for node in ast.parse(text, adapter).body
+                          if isinstance(node, ast.FunctionDef) and node.name == target)
+        # Compile and run just that one definition rather than the whole adapter module.
+        scope = {}
+        exec(compile(ast.Module(body=[definition], type_ignores=[]), adapter, "exec"), scope)
+        for raw, rendered in ((21805, "2.18.5"), (2809, "2.8.9"), ((2, 21, 5), "2.21.5")):
+            self.assertEqual(scope[target](raw), rendered)
+        self.assertIn('"rccl" if torch.version.hip else "nccl"', text)
+
+    def test_result_doc_probe_distinguishes_terminal_invalid_results(self) -> None:
+        common = os.path.join(ROOT, "runtime", "common.sh")
+        env = {**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}
+        payloads = {
+            "invalid-result.json": json.dumps(
+                {"schema_version": 5, "family": "moe", "status": "invalid"}),
+            "incomplete.json": json.dumps({"schema_version": 5, "family": "moe"}),
+            "malformed.json": "{",
+        }
+
+        def probe(path: str) -> int:
+            # Exit status of cx_has_result_doc for one file.
+            script = 'source "$1"; cx_has_result_doc "$2"'
+            return subprocess.run(["bash", "-c", script, "_", common, path], env=env).returncode
+
+        with tempfile.TemporaryDirectory() as tmp:
+            for name, text in payloads.items():
+                with open(os.path.join(tmp, name), "w") as handle:
+                    handle.write(text)
+            # A document with status "invalid" is still a result; the other two are not.
+            self.assertEqual(probe(os.path.join(tmp, "invalid-result.json")), 0)
+            for name in ("incomplete.json", "malformed.json"):
+                self.assertNotEqual(probe(os.path.join(tmp, name)), 0)
+
+    def test_nonzero_command_demotes_an_emitted_result(self) -> None:
+        common = os.path.join(ROOT, "runtime", "common.sh")
+        env = {**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}
+        demote = ["bash", "-c", 'source "$1"; cx_demote_result_doc "$2" 17',
+                  "_", common]
+        with tempfile.TemporaryDirectory() as tmp:
+            target = os.path.join(tmp, "result.json")
+            with open(target, "w") as handle:
+                handle.write(json.dumps(_doc()))
+            # Demote the freshly written document with return code 17.
+            subprocess.run(demote + [target], check=True, env=env)
+            with open(target) as handle:
+                demoted = json.load(handle)
+        # The rewritten document must read as failed and invalid, and keep the return code.
+        self.assertEqual(demoted["publication_status"], "failed")
+        self.assertEqual(demoted["status"], "invalid")
+        self.assertEqual(demoted["validity"]["execution_status"], "failed")
+        self.assertEqual(demoted["post_emit_failure"]["return_code"], 17)
+
+    def test_failed_commands_cannot_leave_accepted_results(self) -> None:
+        with open(os.path.join(ROOT, "runtime", "run_in_container.sh")) as handle:
+            container = handle.read()
+        for helper in ("cx_has_result_doc", "cx_demote_result_doc"):
+            self.assertIn(f'{helper} "$out"', container)
+        for name in ("launch_gb200-nv.sh", "launch_gb300-nv.sh"):
+            with open(os.path.join(ROOT, "launchers", name)) as handle:
+                launcher = handle.read()
+            for helper in ("cx_has_result_doc", "cx_demote_result_doc"):
+                self.assertIn(f'{helper} "$expected_out"', launcher)
+            self.assertIn('failed_cases=$((failed_cases + 1))', launcher)
+
+    def test_non_rack_launchers_reject_multi_node_runs(self) -> None:
+        # Launcher file stems; each must call the single-node guard with its runner name.
+        single_node = ("h100-dgxc-slurm", "h200", "b200-dgxc",
+                       "b300", "mi355x-amds")
+        for launcher in (f"launch_{stem}.sh" for stem in single_node):
+            with self.subTest(launcher=launcher):
+                with open(os.path.join(ROOT, "launchers", launcher)) as handle:
+                    script = handle.read()
+                self.assertIn('cx_require_single_node "$RUNNER_NAME"', script)
+
+    def test_image_digest_matches_the_selected_image(self) -> None:
+        common = os.path.join(ROOT, "runtime", "common.sh")
+        script = '''
+          export HOME="$(mktemp -d)"
+          source "$1"
+          test -n "$(cx_default_image_digest "$CX_IMAGE_MULTIARCH")"
+          test -z "$(cx_default_image_digest "$CX_IMAGE_AMD_MORI")"
+        '''  # common.sh is passed as "$1": Python repr() is not shell quoting
+        proc = subprocess.run(["bash", "-c", script, "_", common], text=True, capture_output=True)
+        self.assertEqual(proc.returncode, 0, proc.stderr or proc.stdout)
+
+    def test_official_provenance_requires_every_declared_run_field(self) -> None:
+        required = ep_harness.REQUIRED_GIT_RUN_FIELDS
+        backend_identity = {"version": "1.0", "commit": "abc123"}
+        full_run = dict.fromkeys(required, "value")
+        complete = argparse.Namespace(image_digest="sha256:test", git_run=full_run)
+        self.assertTrue(ep_harness._provenance_complete(backend_identity, complete))
+        for field in required:
+            with self.subTest(field=field):  # blank one declared field at a time
+                partial = argparse.Namespace(
+                    image_digest="sha256:test", git_run={**full_run, field: None})
+                self.assertFalse(ep_harness._provenance_complete(backend_identity, partial))
+
+    def test_official_provenance_requires_resolved_backend_build_identity(self) -> None:
+        job = dict.fromkeys(ep_harness.REQUIRED_GIT_RUN_FIELDS, "value")
+        args = argparse.Namespace(backend="flashinfer", image_digest="sha256:test", git_run=job)
+        resolved = {
+            "flashinfer_version": "0.6.14", "flashinfer_commit": "pkg-0.6.14",
+            "flashinfer_stack": "flashinfer-python=0.6.14 torch=2.9.0",
+        }
+        self.assertTrue(ep_harness._provenance_complete(resolved, args))
+        unresolved = (("flashinfer_commit", "pkg-unknown"), ("flashinfer_stack", None),
+                      ("flashinfer_stack", "capture-failed"))
+        for field, value in unresolved:
+            with self.subTest(field=field, value=value):
+                degraded = {**resolved, field: value}
+                self.assertFalse(ep_harness._provenance_complete(degraded, args))
+
+        rec = _doc()  # next: an "official" document with a placeholder commit must be rejected
+        rec["publication_status"] = "official"
+        rec["validity"].update(provenance_complete=True, workload_source="canonical-serialized")
+        rec["workload"].update(source="canonical-serialized", workload_id="workload-1")
+        rec["backend_provenance"] = {"deepep_version": "1.2.1", "deepep_commit": "pkg-unknown"}
+        problems, _, _ = vr.validate_doc(rec, None, "bad-provenance.json")
+        self.assertTrue(any("unresolved backend identity" in item for item in problems), problems)
+
+    def test_validator_rejects_platform_topology_mismatch(self) -> None:
+        doc = {**_doc(), "runner": "gb200-8x"}
+        placement = doc["placement"]
+        placement.update(nodes=2, gpus_per_node=4, scale_up_domain=72)
+        problems, _, _ = vr.validate_doc(doc, None, "good-topology.json")
+        self.assertEqual(problems, [])
+        placement["scale_up_domain"] = 8  # no longer the 72 the validator expects for gb200
+        problems, _, _ = vr.validate_doc(doc, None, "bad-topology.json")
+        self.assertTrue(any("expected 72 for gb200" in item for item in problems), problems)
+
+    def test_aggregate_fails_closed_on_malformed_or_non_object_documents(self) -> None:
+        bad_inputs = {
+            "broken.json": "{",
+            "broken.ndjson": '{"family":"moe"}\nnot-json\n',
+            "scalar.json": '"not-an-object"',
+        }
+        for name, contents in bad_inputs.items():
+            with self.subTest(name=name):
+                with tempfile.TemporaryDirectory() as results_dir:
+                    with open(os.path.join(results_dir, name), "w") as handle:
+                        handle.write(contents)
+                    self.assertRaises(SystemExit, ar.aggregate, results_dir)
+
+    def test_bundle_recursively_rejects_sensitive_fields_and_value_shapes(self) -> None:
+        # A benign document must pass; each unsafe shape below must abort with SystemExit.
+        safe = {"family": "moe", "runner": "test-runner",
+                "provenance": {"source_sha": "abc123"}}
+        make_bundle.assert_publication_safe([safe])
+        unsafe = (
+            {"nested": {"environment": {}}},
+            {"nested": {"hostname": "private-host"}},
+            {"nested": {"detail": "/home/private/result.json"}},
+            {"nested": {"detail": "192.0.2.1"}},
+            {"nested": {"detail": "2001:db8::1"}},
+            {"nested": {"detail": "ssh://user@private-host"}},
+        )
+        for document in unsafe:
+            # Label by the nested payload: the top-level key is "nested" in every case.
+            with self.subTest(document=document["nested"]):
+                with self.assertRaises(SystemExit):
+                    make_bundle.assert_publication_safe([document])
+
+    def test_bundle_rejects_non_ep_families(self) -> None:
+        # A document from a family other than "moe" must be refused with this message.
+        foreign = dict(family="kv-cache", publication_status="official", rows=[])
+        self.assertRaisesRegex(
+            SystemExit, "unsupported family",
+            make_bundle.validate, [foreign], None,
+        )
+
+    def test_summary_keeps_only_ep_docs_and_reports_failed_attempts(self) -> None:
+        # One valid MoE result, one failed MoE attempt, and one document from another family.
+        inputs = {
+            "valid.json": {**_doc(), "status": "valid"},
+            "failed.json": _failed({"backend": "deepep", "phase": "decode"},
+                                   attempt_id="2", status="failed"),
+            "foreign.json": {"family": "kv-cache", "status": "valid"},
+        }
+        with tempfile.TemporaryDirectory() as results_dir:
+            for name, document in inputs.items():
+                with open(os.path.join(results_dir, name), "w") as handle:
+                    handle.write(json.dumps(document))
+            loaded = summarize.load_results(results_dir, None, None)
+
+        # Only the two MoE documents are expected back.
+        self.assertEqual(len(loaded), 2)
+        self.assertEqual({doc["family"] for doc in loaded}, {"moe"})
+        markdown = summarize.render_markdown(loaded)
+        for expected in ("Failed attempts", "attempt"):
+            self.assertIn(expected, markdown)
+        self.assertNotIn("kv-cache", markdown)
+
+    def test_bundle_rejects_cross_chip_canonical_workload_drift(self) -> None:
+        def canonical(runner: str, routing_hash: str) -> dict:
+            # Canonical-workload result whose runner and routing hash are chosen by the caller.
+            overrides = {
+                "runner": runner,
+                "case_id": f"case-{runner}",
+                "suite": "ep-core-v1",
+                "workload_name": "deepseek-v3-v1",
+                "required_publication": "comparable-experimental",
+                "phase": "decode",
+                "ep_size": 8,
+                "eplb": {"enabled": False},
+            }
+            doc = {**_doc(), **overrides}
+            doc["shape"]["activation_profile"] = "normal"
+            doc["workload"]["source"] = "canonical-serialized"
+            doc["workload"]["activation_identity"] = "activation-a"
+            doc["rows"][0]["routing_hash"] = routing_hash
+            return doc
+
+        # Two runners share every workload field set above but report different routing hashes.
+        drifted = [canonical("h100-dgxc", "route-a"), canonical("b300", "route-b")]
+        self.assertEqual(len(vr.cross_document_workload_issues(drifted)), 1)
+        self.assertRaisesRegex(SystemExit, "cross-document workload identity",
+                               make_bundle.validate, drifted, None)
+
+    def test_bundle_coverage_rejects_case_id_with_wrong_semantics_or_rows(self) -> None:
+        case_id = "cxv1-0123456789abcdefabcd"
+        case = {
+            "case_id": case_id, "suite": "ep-core-v1", "workload": "deepseek-v3-v1",
+            "required_publication": "comparable-experimental", "backend": "deepep",
+            "mode": "normal", "dtype": "bf16",
+            "contract": "layout-and-dispatch-v1", "routing": "uniform", "phase": "decode",
+            "ep": 8, "eplb": False, "combine_quant_mode": "none",
+            "resource_mode": "tuned", "activation_profile": "normal",
+            "placement": "packed", "routing_step": "0", "uneven_tokens": "none",
+            "hidden": "", "topk": "", "experts": "", "samples_per_point": 512,
+            "warmup_semantics": "full-roundtrip-per-trial-point-v1", "ladder": "8",
+            "timing": "8:64:32", "canonical": False, "nodes": "1",
+            "gpus_per_node": 8, "scale_up_domain": 8,
+        }
+        matrix = {"include": [{"sku": "h100-dgxc", "gpus_per_node": 8,
+                                "scale_up_domain": 8, "cases": [case]}]}
+        doc = _doc()
+        doc.update(case_id=case_id, suite=case["suite"], workload_name=case["workload"],
+                   required_publication=case["required_publication"], resource_mode="tuned",
+                   runner="h100-dgxc-slurm_19")
+        self.assertEqual(
+            make_bundle.validate_expected_coverage([doc], matrix),
+            {"expected": 1, "observed": 1, "complete": True},
+        )
+
+        mutations = (  # each entry changes exactly one value of an otherwise matching doc
+            lambda value: value.update(suite="wrong-suite"),
+            lambda value: value.update(phase="prefill"),
+            lambda value: value["shape"].update(routing="zipf"),
+            lambda value: value["rows"][0].update(tokens_per_rank=16),
+            lambda value: value.update(runner="b200-dgxc-slurm_19"),
+            lambda value: value["placement"].update(gpus_per_node=4),
+        )
+        for index, mutate in enumerate(mutations):
+            with self.subTest(mutation=index):  # stable label; a lambda repr is just an address
+                mismatched = copy.deepcopy(doc)
+                mutate(mismatched)
+                with self.assertRaisesRegex(SystemExit, "identity_mismatch"):
+                    make_bundle.validate_expected_coverage([mismatched], matrix)
+
+    def test_bundle_coverage_resolves_blank_ladder_to_v1_phase_default(self) -> None:
+        case_id = "cxv1-0123456789abcdefabcd"
+        case = {"case_id": case_id, "required_publication": "diagnostic",
+                "phase": "decode", "ladder": ""}  # blank ladder: no explicit token points
+        doc = {"family": "moe", "case_id": case_id,
+               "required_publication": "diagnostic", "publication_status": "diagnostic",
+               "phase": "decode", "rows": [
+            {"tokens_per_rank": token}
+            for token in (1, 2, 4, 8, 16, 32, 64, 128)  # rows the blank decode ladder must accept
+        ]}
+        matrix = {"include": [{"cases": [case]}]}
+        self.assertEqual(
+            make_bundle.validate_expected_coverage([doc], matrix),
+            {"expected": 1, "observed": 1, "complete": True},
+        )
+        doc["rows"].pop()  # drop the T=128 row; the shortened ladder must be rejected
+        with self.assertRaisesRegex(SystemExit, "identity_mismatch"):
+            make_bundle.validate_expected_coverage([doc], matrix)
+
+    def test_aggregate_preserves_distinct_failed_cases(self) -> None:
+        case = {
+            "suite": "ep-routing-v1", "workload": "deepseek-v3-v1",
+            "backend": "deepep", "phase": "decode", "ep": 8, "mode": "normal",
+            "dispatch_dtype": "bf16", "contract": "layout-and-dispatch-v1",
+            "routing": "zipf", "eplb": False, "combine_quant_mode": "none",
+            "resource_mode": "tuned", "tokens_ladder": "128",
+        }
+        docs = [
+            _failed(case),
+            _failed({**case, "eplb": True}, "2026-07-03T00:00:01Z"),  # differs only in eplb
+            _failed(case, "2026-07-03T00:00:02Z"),  # same case dict as docs[0]
+        ]
+        with tempfile.TemporaryDirectory() as tmp:
+            for index, doc in enumerate(docs):
+                with open(os.path.join(tmp, f"{index}.json"), "w") as fh:
+                    json.dump(doc, fh)
+            got = ar.aggregate(tmp)
+        self.assertEqual(len(got), 2)  # the two eplb=False failures collapse into one entry
+        by_eplb = {doc["failure"]["case"]["eplb"]: doc for doc in got}
+        self.assertEqual(by_eplb[False]["generated_at"], "2026-07-03T00:00:02Z")
+
+    def test_aggregate_projects_one_newest_usable_outcome_per_case(self) -> None:
+        older = _doc()
+        older.update(case_id="case-a", generated_at="2026-07-03T00:00:01Z")
+        newer = copy.deepcopy(older)
+        newer["generated_at"] = "2026-07-03T00:00:02Z"
+        failed = _failed({"backend": "deepep", "phase": "decode"},
+                         "2026-07-03T00:00:03Z", case_id="case-a")  # latest stamp, same case_id
+        with tempfile.TemporaryDirectory() as tmp:
+            for index, doc in enumerate((older, newer, failed)):
+                with open(os.path.join(tmp, f"{index}.json"), "w") as fh:
+                    json.dump(doc, fh)
+            got = ar.aggregate(tmp)
+        self.assertEqual(len(got), 1)
+        self.assertEqual(got[0]["generated_at"], newer["generated_at"])  # not the later failure
+
+    def test_aggregate_failed_identity_covers_scheduled_axes(self) -> None:
+        case = {
+            "suite": "ep-routing-v1", "workload": "deepseek-v3-v1",
+            "backend": "deepep", "phase": "decode", "ep": 8, "mode": "normal",
+            "dispatch_dtype": "bf16", "contract": "layout-and-dispatch-v1",
+            "routing": "zipf", "eplb": False, "combine_quant_mode": "none",
+            "resource_mode": "tuned", "tokens_ladder": "128",
+        }
+        replacements = {  # one alternative value for every axis present in `case`
+            "suite": "ep-core-v1", "workload": "other", "backend": "uccl",
+            "phase": "prefill", "ep": 4, "mode": "ll", "dispatch_dtype": "fp8",
+            "contract": "runtime-visible-v1", "routing": "uniform", "eplb": True,
+            "combine_quant_mode": "fp8", "resource_mode": "normalized",
+            "tokens_ladder": "512 2048",
+        }
+        baseline = ar._key(_failed(case))
+        for field, value in replacements.items():
+            with self.subTest(field=field):  # changing any single axis must change the key
+                self.assertNotEqual(baseline, ar._key(_failed({**case, field: value})))
+
+        self.assertEqual(ar._key(_failed(case, case_id="case-a")),  # same case_id, routing differs
+                         ar._key(_failed({**case, "routing": "uniform"}, case_id="case-a")))
+        self.assertNotEqual(ar._key(_failed(case, case_id="case-a")),  # same axes, other case_id
+                            ar._key(_failed(case, case_id="case-b")))
+
+    def test_sampling_nonconformance_is_diagnostic(self) -> None:
+        validity = _doc()["validity"]
+        validity["sampling_conformance"] = "nonconformant"  # the only field changed from _doc()
+        self.assertEqual(vr.derive_publication_status(validity), "diagnostic")
+        self.assertEqual(ep_harness._derive_publication_status(validity), "diagnostic")  # same answer
+
+    def test_historical_v4_keeps_variable_sample_semantics(self) -> None:
+        doc = _doc(iters=200, trials=3, samples=600)  # not the 8:64 / 512-sample basis
+        doc["schema_version"] = 4
+        doc["validity"].pop("sampling_conformance")  # strip the sampling-contract fields
+        doc["reproduction"].pop("sampling_contract")
+        doc["reproduction"].pop("samples_per_point")
+        errors, warnings, status = vr.validate_doc(doc, None, "historical-v4.json")
+        self.assertEqual(status, "comparable-experimental")
+        self.assertEqual(errors, [])
+        self.assertEqual(warnings, [])
+
+        doc["schema_version"] = 3  # same doc relabelled v3; expected to select the v4 schema object
+        registry = vr.load_schema_registry()
+        selected, schema_errors = vr._schema_for_doc(doc, registry)
+        self.assertIs(selected, registry[4])
+        self.assertEqual(schema_errors, [])
+        errors, warnings, status = vr.validate_doc(doc, None, "historical-v3.json")
+        self.assertEqual(status, "comparable-experimental")
+        self.assertEqual(errors, [])
+        self.assertEqual(warnings, [])
+
+    def test_v5_failed_case_is_schema_selected_but_sampling_exempt(self) -> None:
+        doc = {  # minimal failed-case skeleton: no rows, no validity/reproduction blocks
+            "schema_version": 5,
+            "family": "moe",
+            "record_type": "failed-case",
+            "runner": "test",
+            "backend": "deepep",
+            "publication_status": "failed",
+            "rows": [],
+            "failure": {"failure_mode": "timeout", "return_code": 124, "case": {}},
+        }
+        errors, warnings, status = vr.validate_doc(doc, vr.load_schema_registry(), "failed-v5.json")
+        self.assertEqual((errors, warnings, status), ([], [], "failed"))
+
+        doc["schema_version"] = 6  # a version the registry is expected not to know
+        errors, _warnings, _status = vr.validate_doc(doc, vr.load_schema_registry(), "failed-v6.json")
+        self.assertTrue(any("unsupported schema_version" in error for error in errors), errors)
+
+    def test_scheduled_failed_case_requires_attributable_identity(self) -> None:
+        case_id = "cxv1-0123456789abcdefabcd"
+        case = {
+            "case_id": case_id, "suite": "ep-core-v1", "workload": "deepseek-v3-v1",
+            "required_publication": "official", "backend": "deepep", "phase": "decode",
+            "ep": 8, "dispatch_dtype": "bf16", "mode": "normal",
+            "contract": "layout-and-dispatch-v1", "routing": "uniform", "eplb": False,
+            "combine_quant_mode": "none", "resource_mode": "tuned", "tokens_ladder": "",
+            "gpus_per_node": 8, "scale_up_domain": 8,
+            "sampling_contract": "fixed-512-v1", "samples_per_point": 512,
+            "iters": 8, "trials": 64, "warmup": 32,
+            "warmup_semantics": "full-roundtrip-per-trial-point-v1",
+        }
+        doc = _failed(case, case_id=case_id, suite="ep-core-v1",
+                      workload_name="deepseek-v3-v1", required_publication="official",
+                      mode="normal", ep_size=8,
+                      measurement_contract="layout-and-dispatch-v1")
+        errors, _warnings, status = vr.validate_doc(
+            doc, vr.load_schema_registry(), "scheduled-failure.json")
+        self.assertEqual((errors, status), ([], "failed"))
+        del case["routing"]  # NOTE(review): only reaches doc if _failed() keeps `case` by reference
+        errors, _warnings, _status = vr.validate_doc(
+            doc, vr.load_schema_registry(), "missing-routing.json")
+        self.assertTrue(any("failure.case.routing" in error for error in errors), errors)
+
+    def test_v5_missing_publication_status_is_not_legacy(self) -> None:
+        doc = _doc()
+        doc.pop("publication_status")  # missing status alone must not earn the legacy label
+        errors, _warnings, status = vr.validate_doc(doc, vr.load_schema_registry(), "malformed-v5.json")
+        self.assertNotEqual(status, "legacy-experimental")
+        self.assertTrue(errors)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
new file mode 100644
index 0000000000..db68afb4ca
--- /dev/null
+++ b/experimental/CollectiveX/tests/workload.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""CollectiveX — canonical, serialized MoE routing workloads (goal Part 1: workload identity).
+
+A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
+file, and referenced by an immutable `workload_id`. Every official benchmark point consumes the
+SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
+checksum match, not by trusting that two machines re-ran the same seeded generator.
+
+Layout on disk (one workload = two files, basename = workload_id):
+  <dir>/<workload_id>.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
+  <dir>/<workload_id>.manifest.json  dims, routing profile, generator version, seed, SHA-256s
+
+Split by dependency so it runs where each step lives:
+  * build_workload()  needs torch (via routing.py) — run on a node/container.
+  * load/verify/manifest  need only numpy + stdlib — run on a login node or in CI.
+
+Seeded runtime generation (routing.build_global_routing) stays for local dev; canonical files
+are how cross-hardware comparisons are gated.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+
+WORKLOAD_SCHEMA_VERSION = 1
+# Bump when routing.build_global_routing's numerics change so a stale file can't masquerade as
+# current. The workload_id folds this in: same id <=> same generator + params.
+GENERATOR_VERSION = "collectivex-routing-v1"
+GATE_WEIGHT_FORMAT = "softmax-of-randn-f32"   # how topk_weights are produced (see routing.py)
+ACTIVATION_GENERATOR = "collectivex-activation-v1"  # bump if the activation value-generator changes
+ACTIVATION_PROFILE_DEFAULT = "normal"               # seeded N(0,1) per token; the only wired profile
+
+
+def _sha256(b: bytes) -> str:  # hex-encoded SHA-256 digest of the given bytes
+    return hashlib.sha256(b).hexdigest()
+
+
+def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
+                        global_tokens: int, seed: int, generator: str = GENERATOR_VERSION,
+                        step: int = 0) -> str:
+    """Return the 16-hex-char id hashed from the identity-defining params and the generator.
+    `step` (temporal snapshot for moving/alternating routing) joins the hashed key ONLY when
+    non-zero, so every existing (step=0) canonical workload keeps its id."""
+    fields = [f"{generator}", f"routing={routing}", f"hidden={hidden}", f"topk={topk}",
+              f"experts={experts}", f"gt={global_tokens}", f"seed={seed}"]
+    if step:
+        fields.append(f"step={step}")
+    return _sha256("|".join(fields).encode())[:16]
+
+
+def compute_activation_identity(activation_profile, seed, hidden,
+                                generator=ACTIVATION_GENERATOR) -> str:
+    """Short descriptor hash standing in for the activation VALUE distribution (scaffold).
+    Activations are seeded N(0,1) today and are NOT serialized, so there are no bytes to hash.
+    The formula MUST stay identical to the inline one in ep_harness so manifests and result docs
+    agree; it is meant to become a byte-hash once a model-trace value rig serializes activations."""
+    parts = (f"{activation_profile}", f"seed={seed}", f"hidden={hidden}", f"gen={generator}")
+    return _sha256("|".join(parts).encode())[:16]
+
+
+def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+                   idx_np, weights_np, routing_stats=None,
+                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
+    """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib."""
+    idx_bytes = idx_np.astype("int32").tobytes()
+    w_bytes = weights_np.astype("float32").tobytes()
+    wid = compute_workload_id(routing, hidden, topk, experts, int(global_tokens), seed)
+    return {
+        "schema_version": WORKLOAD_SCHEMA_VERSION,
+        "workload_id": wid,  # hashed from int(global_tokens), the same value stored in dims
+        "generator_version": GENERATOR_VERSION,
+        "gate_weight_format": GATE_WEIGHT_FORMAT,
+        "dims": {"hidden": hidden, "topk": topk, "experts": experts,
+                 "global_tokens": int(global_tokens), "experts_per_rank": experts_per_rank},
+        "routing_profile": routing,
+        "seed": seed,
+        "checksums": {  # SHA-256 over the raw little-endian array bytes (int32 / float32)
+            "topk_idx": _sha256(idx_bytes),
+            "topk_weights": _sha256(w_bytes),   # gate-weight (value) distribution identity
+            "trace": _sha256(idx_bytes + w_bytes),   # full-workload identity
+        },
+        "routing_stats": routing_stats or {},
+        # Activation value distribution (scaffold): name + deterministic descriptor identity.
+        # NOT under checksums — activations are not byte-serialized today (see compute_activation_identity).
+        "activation_profile": activation_profile,
+        "activation_identity": compute_activation_identity(activation_profile, seed, hidden),
+    }
+
+
+def build_workload(hidden, topk, experts, routing, global_tokens, seed, experts_per_rank,
+                   activation_profile=ACTIVATION_PROFILE_DEFAULT):
+    """Generate a canonical trace. Needs torch (routing.py). Returns (idx_np, weights_np, manifest)."""
+    import numpy as np
+    import routing as _routing  # imported lazily so the numpy-only helpers load without it
+    idx_t, w_t = _routing.build_global_routing(global_tokens, experts, topk, routing, seed,
+                                               experts_per_rank)
+    rstats = _routing.routing_stats(idx_t, experts, experts_per_rank, weights=w_t)
+    idx_np = idx_t.detach().cpu().numpy().astype(np.int32)  # host copy in the serialized dtype
+    w_np = w_t.detach().cpu().numpy().astype(np.float32)
+    manifest = build_manifest(routing, hidden, topk, experts, global_tokens, seed,
+                              experts_per_rank, idx_np, w_np, rstats,
+                              activation_profile=activation_profile)
+    return idx_np, w_np, manifest
+
+
+def save_workload(out_dir, idx_np, weights_np, manifest) -> str:  # returns the workload_id
+    import numpy as np
+    os.makedirs(out_dir, exist_ok=True)
+    wid = manifest["workload_id"]  # both output files use this as their basename
+    np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
+                        topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
+    with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh:
+        json.dump(manifest, fh, indent=2, sort_keys=True)  # sorted keys: stable manifest text
+    return wid
+
+
+def load_workload(npz_path, verify=True):
+    """Load a canonical trace (numpy + stdlib only). Returns (idx_np, weights_np, manifest).
+    Raises ValueError if verify=True and the on-disk bytes don't match the manifest checksums."""
+    import numpy as np
+    base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path
+    with open(base + ".manifest.json") as fh:
+        manifest = json.load(fh)
+    with np.load(base + ".npz") as z:  # NpzFile keeps the archive open; close it once read
+        idx_np, w_np = z["topk_idx"], z["topk_weights"]
+    if verify:
+        ok, reason = verify_workload(manifest, idx_np, w_np)
+        if not ok:
+            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
+    return idx_np, w_np, manifest
+
+
+def verify_workload(manifest, idx_np, weights_np):
+    """Recompute checksums + id against the manifest; malformed manifests fail. Returns (ok, reason)."""
+    ib = idx_np.astype("int32").tobytes()
+    wb = weights_np.astype("float32").tobytes()
+    cs = manifest.get("checksums", {})
+    for name, digest in (("topk_idx", _sha256(ib)), ("topk_weights", _sha256(wb)),
+                         ("trace", _sha256(ib + wb))):
+        if digest != cs.get(name):
+            return False, f"{name} hash differs"
+    try:
+        wid = compute_workload_id(manifest["routing_profile"], manifest["dims"]["hidden"],
+                                  manifest["dims"]["topk"], manifest["dims"]["experts"],
+                                  manifest["dims"]["global_tokens"], manifest["seed"],
+                                  manifest.get("generator_version", GENERATOR_VERSION))
+        if wid != manifest["workload_id"]:
+            return False, f"workload_id mismatch (recomputed {wid} != {manifest['workload_id']})"
+    except (KeyError, TypeError) as exc:  # identity field absent or of the wrong shape
+        return False, f"manifest is missing an identity field: {exc!r}"
+    return True, "ok"
+
+
+# --------------------------------------------------------------------------- self-test
+if __name__ == "__main__":
+    import sys
+    import tempfile
+    # (1) workload_id determinism + sensitivity — pure stdlib, always runs.
+    a = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    b = compute_workload_id("zipf", 7168, 8, 256, 4096, 67)
+    c = compute_workload_id("uniform", 7168, 8, 256, 4096, 67)
+    assert a == b, "workload_id must be deterministic"
+    assert a != c, "workload_id must depend on routing"
+    print(f"workload_id determinism OK (zipf={a} uniform={c})")
+    # (2) build/save/load/verify roundtrip + cross-build identity — needs torch+numpy.
+    try:
+        import numpy as np  # noqa: F401
+        try:
+            idx, w, man = build_workload(7168, 8, 256, "zipf", 512, 67, 32)
+            built = True
+        except Exception as exc:   # torch missing on a login node
+            print(f"(torch unavailable — synthesizing arrays to test load/verify: {exc!r})")
+            idx = np.random.default_rng(0).integers(0, 256, size=(512, 8)).astype(np.int32)
+            w = np.random.default_rng(1).random((512, 8)).astype(np.float32)
+            man = build_manifest("zipf", 7168, 8, 256, 512, 67, 32, idx, w)
+            built = False
+        with tempfile.TemporaryDirectory() as d:
+            wid = save_workload(d, idx, w, man)
+            idx2, w2, man2 = load_workload(os.path.join(d, f"{wid}.npz"), verify=True)
+            assert (idx2 == idx).all() and (w2 == w).all(), "roundtrip array mismatch"
+            ok, reason = verify_workload(man2, idx2, w2)
+            assert ok, reason
+            # tamper -> must fail
+            idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256  # always differs from the stored value
+            bad, _ = verify_workload(man2, idx2, w2)  # `bad` receives the ok flag; False expected
+            assert not bad, "verify must catch tampering"
+        print(f"save/load/verify roundtrip OK (workload_id={wid}, built_via_torch={built})")
+    except ImportError:  # raised by the numpy import at the top of this try block
+        print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
+    print("workload self-test: PASS")
+    sys.exit(0)
diff --git a/experimental/CollectiveX/validate_results.py b/experimental/CollectiveX/validate_results.py
new file mode 100644
index 0000000000..49110191b6
--- /dev/null
+++ b/experimental/CollectiveX/validate_results.py
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+"""CollectiveX result validator (goal Part 1: schema + validation tooling).
+
+Validates EP result JSON docs against their versioned schema (v4 historical, v5 current) and the
+project's semantic gates:
+schema shape, provenance completeness, workload identity (incl. cross-run trace-signature
+agreement within a comparison_key), measurement-contract membership, byte-contract presence,
+the fixed-512-v1 sample contract, and — crucially — that `publication_status` is the
+MACHINE-DERIVED function of `validity` (no doc may hand-label itself official). Exits non-zero when any doc claims
+`official` but fails a gate (or, with --require-official, when any doc isn't official).
+
+Requires `jsonschema`; validation never falls back to a partial structural check.
+v3 docs (no publication_status) load as legacy/experimental and are reported, not failed.
+
+  python3 validate_results.py results/*.json
+  python3 validate_results.py --require-official results/
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+import jsonschema
+
+import capability
+
+SAMPLING_CONTRACT = "fixed-512-v1"
+TIMED_SAMPLES_PER_POINT = 512
+TIMED_ITERS_PER_TRIAL = 8
+TRIALS_PER_POINT = 64
+WARMUP_ITERS_PER_TRIAL = 32
+WARMUP_SEMANTICS = "full-roundtrip-per-trial-point-v1"
+HISTORICAL_V4_MIN_SAMPLES_OFFICIAL = 100
+CURRENT_SCHEMA_VERSION = 5
+HERE = os.path.dirname(os.path.abspath(__file__))
+SCHEMA_PATHS = {
+    3: os.path.join(HERE, "schemas", "ep-result-v4.schema.json"),
+    4: os.path.join(HERE, "schemas", "ep-result-v4.schema.json"),
+    5: os.path.join(HERE, "schemas", "ep-result-v5.schema.json"),
+}
+# Must stay in sync with the measurement_contract enum in the versioned result schemas.
+# (mori-quant-combine-v1 is reserved for the MoRI PR311 quant-combine axis; no emitter yet).
+KNOWN_CONTRACTS = {"layout-and-dispatch-v1", "cached-layout-comm-only-v1", "runtime-visible-v1",
+                   "mori-quant-combine-v1"}
+PUB_STATES = {"official", "comparable-experimental", "diagnostic", "invalid", "failed"}
+REQUIRED_BACKEND_PROVENANCE = {
+    "deepep": ("deepep_version", "deepep_commit"),
+    "deepep-hybrid": ("deepep_commit", "branch"),
+    "flashinfer": ("flashinfer_version", "flashinfer_commit", "flashinfer_stack"),
+    "uccl": ("uccl_version", "uccl_commit"),
+    "mori": ("mori_commit",),
+    "nccl-ep": ("nccl_version",),
+}
+
+
+def _resolved_provenance_value(field: str, value) -> bool:
+    """Return True when `value` reads as a real identity rather than a placeholder."""
+    if value is None:
+        return False
+    token = str(value).strip().lower()
+    placeholders = {"", "unknown", "none", "null", "n/a", "?"}
+    if token in placeholders or "capture-failed" in token:
+        return False
+    if not field.endswith("_commit"):
+        return True
+    # *_commit fields additionally reject these bare names and name-like suffixes.
+    if token in {"main", "hybrid-ep", "uccl", "pkg-uccl"}:
+        return False
+    return not token.endswith(("-unknown", "-none", "-main", "-hybrid-ep"))
+
+
+def backend_provenance_issues(doc: dict) -> list[str]:
+    """List the provenance fields required for doc's backend whose values are unresolved."""
+    recorded = doc.get("backend_provenance")
+    lookup = recorded.get if isinstance(recorded, dict) else {}.get
+    required = REQUIRED_BACKEND_PROVENANCE.get(doc.get("backend"), ())
+    return [name for name in required if not _resolved_provenance_value(name, lookup(name))]
+
+
+def _normalized_sku(value) -> str | None:
+    name = str(value or "").lower()
+    by_length = sorted(capability.PLATFORMS, key=len, reverse=True)  # try longest SKU names first
+    return next((sku for sku in by_length
+                 if name == sku or name.startswith((f"{sku}-", f"{sku}_"))), None)
+
+
+def topology_issues(doc: dict) -> list[str]:
+    """Report placement fields that disagree with the runner SKU's capability.PLATFORMS entry.
+    Only docs at or above CURRENT_SCHEMA_VERSION whose runner maps to a known SKU are checked."""
+    sku = _normalized_sku(doc.get("runner"))
+    try:
+        current = int(doc.get("schema_version") or 0) >= CURRENT_SCHEMA_VERSION
+    except (TypeError, ValueError):
+        current = False
+    if not (sku and current):
+        return []
+    placement = doc.get("placement")
+    # A missing or non-dict placement reads as "every field absent".
+    recorded = placement.get if isinstance(placement, dict) else {}.get
+    platform = capability.PLATFORMS[sku]
+    wanted = {field: int(platform[field]) for field in ("gpus_per_node", "scale_up_domain")}
+    return [f"placement.{field}={recorded(field)!r}, expected {expected} for {sku}"
+            for field, expected in wanted.items() if recorded(field) != expected]
+
+
+def derive_publication_status(v: dict, require_sampling: bool = True) -> str:
+    """MUST mirror ep_harness._derive_publication_status — the validator's job is to confirm the
+    recorded status equals this derivation."""
+    if v.get("execution_status") != "complete":
+        return "failed"  # anything other than a completed execution
+    if (v.get("semantic_correctness") != "pass" or v.get("measurement_conformance") != "conformant"
+            or v.get("workload_identity") == "inconsistent"):
+        return "invalid"  # hard gate; every branch below assumes these three held
+    sound = (v.get("semantic_correctness") == "pass"
+             and str(v.get("workload_identity", "")).startswith("consistent")
+             and v.get("measurement_conformance") == "conformant")  # past the gate, only identity varies
+    if str(v.get("resource_conformance", "")).endswith("nonconforming"):
+        return "diagnostic"
+    if require_sampling and v.get("sampling_conformance") != "conformant":
+        return "diagnostic"  # callers pass require_sampling=False to skip this check
+    # contract-level anomaly (goal P1-e/f): demotes to diagnostic unless waived (anomaly_free).
+    if not v.get("anomaly_free", True):
+        return "diagnostic"
+    if sound and v.get("provenance_complete") and v.get("workload_source") == "canonical-serialized":
+        return "official"
+    if sound:
+        return "comparable-experimental"
+    return "diagnostic"  # workload_identity neither "inconsistent" nor "consistent*"
+
+
+def load_schema_registry() -> dict[int, dict]:
+    """Load every supported EP schema keyed by the document's schema_version."""
+    by_path: dict[str, dict] = {}
+    for path in SCHEMA_PATHS.values():
+        if path in by_path:
+            continue  # versions that share a file also share one parsed dict object
+        with open(path) as fh:
+            by_path[path] = json.load(fh)
+    return {version: by_path[path] for version, path in SCHEMA_PATHS.items()}
+
+
+def _schema_for_doc(doc: dict, schema_or_registry) -> tuple[dict | None, list[str]]:
+    """Select doc's schema. A list/dict schema_version (malformed JSON) is reported, not raised."""
+    if schema_or_registry is None:
+        return None, []
+    # Backward-compatible programmatic/CLI override: a raw JSON schema applies to every input doc.
+    if "$schema" in schema_or_registry:
+        return schema_or_registry, []
+    version = doc.get("schema_version")
+    if isinstance(version, (int, float, str)) and version in schema_or_registry:
+        return schema_or_registry[version], []
+    return None, [f"unsupported schema_version {version!r}; supported={sorted(schema_or_registry)}"]
+
+
+def _schema_check(doc, schema):
+    """Run jsonschema.validate and return the outcome as a list of error strings (empty if OK)."""
+    try:
+        jsonschema.validate(doc, schema)
+    except jsonschema.SchemaError as exc:
+        return [f"invalid schema: {exc.message}"]
+    except jsonschema.ValidationError as exc:
+        return [f"schema: {exc.message}"]
+    return []
+
+
+def _sampling_contract_issues(doc: dict) -> list[str]:
+    """Verify the fixed sample basis from configuration through stored histograms."""
+    issues = []
+    repro = doc.get("reproduction") or {}
+    if repro.get("sampling_contract") != SAMPLING_CONTRACT:
+        issues.append(f"sampling_contract must be '{SAMPLING_CONTRACT}'")
+    iters, trials, warmup = repro.get("iters"), repro.get("trials"), repro.get("warmup")
+    expected = (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL)
+    if (iters, trials, warmup) != expected:
+        issues.append(f"iters:trials:warmup={iters}:{trials}:{warmup}, expected "
+                      f"{expected[0]}:{expected[1]}:{expected[2]}")
+    if repro.get("warmup_semantics") != WARMUP_SEMANTICS:
+        issues.append(f"warmup_semantics must be '{WARMUP_SEMANTICS}'")
+    if repro.get("samples_per_point") != TIMED_SAMPLES_PER_POINT:
+        issues.append(f"reproduction.samples_per_point must equal {TIMED_SAMPLES_PER_POINT}")
+    for row in doc.get("rows") or []:  # "rows": null must not crash; same guard as reproduction
+        t = row.get("tokens_per_rank")
+        if row.get("samples_pooled") != TIMED_SAMPLES_PER_POINT:
+            issues.append(f"T={t}: samples_pooled={row.get('samples_pooled')}, "
+                          f"expected {TIMED_SAMPLES_PER_POINT}")
+        if isinstance(trials, int) and row.get("trials") != trials:
+            issues.append(f"T={t}: row trials={row.get('trials')}, reproduction trials={trials}")
+        raw = row.get("raw_samples") or {}
+        for op in ("dispatch", "combine", "roundtrip"):
+            hist = raw.get(op) or {}
+            if hist.get("n") != TIMED_SAMPLES_PER_POINT:
+                issues.append(f"T={t}: raw_samples.{op}.n={hist.get('n')}, "
+                              f"expected {TIMED_SAMPLES_PER_POINT}")
+            counts = hist.get("counts")
+            if not isinstance(counts, list):
+                issues.append(f"T={t}: raw_samples.{op}.counts is missing")
+            elif sum(counts) != TIMED_SAMPLES_PER_POINT:
+                issues.append(f"T={t}: raw_samples.{op}.counts sum to {sum(counts)}, "
+                              f"expected {TIMED_SAMPLES_PER_POINT}")
+    return issues
+
+
+def validate_doc(doc, schema, path):
+    """Validate one result document and return ``(errs, warns, status)``.
+
+    ``status`` is "legacy-experimental", "failed", or the recorded publication_status; ``path`` is unused.
+    """
+    errs, warns = [], []
+    legacy = "publication_status" not in doc
+    try:
+        declared_version = int(doc.get("schema_version") or 0)
+    except (TypeError, ValueError):
+        declared_version = 0
+    if legacy and declared_version <= 3:
+        warns.append("legacy (v3, no publication_status) — loads as experimental, not comparable as official")
+        return errs, warns, "legacy-experimental"
+    selected_schema, schema_errors = _schema_for_doc(doc, schema)
+    errs += schema_errors
+    errs += _schema_check(doc, selected_schema) if selected_schema else []
+    scheduled = bool(doc.get("suite") or doc.get("required_publication"))
+    if scheduled:
+        for field in ("case_id", "suite", "workload_name", "required_publication", "phase",
+                      "ep_size", "mode", "measurement_contract"):
+            if doc.get(field) in (None, ""):
+                errs.append(f"scheduled result missing {field}")
+    if doc.get("record_type") == "failed-case":
+        # Intentionally preserved failure skeleton (judge-by-data doctrine): validate the
+        # skeleton contract only — the full-sweep gates below do not apply.
+        if doc.get("publication_status") != "failed":
+            errs.append(f"failed-case record with publication_status '{doc.get('publication_status')}' (must be 'failed')")
+        if doc.get("rows"):
+            errs.append("failed-case record must have empty rows")
+        fail = doc.get("failure") or {}
+        if not fail.get("failure_mode") or "return_code" not in fail:
+            errs.append("failed-case record missing failure evidence (failure_mode/return_code)")
+        if scheduled:
+            case = fail.get("case") or {}
+            for field in ("case_id", "suite", "workload", "required_publication", "backend", "phase",
+                          "ep", "dispatch_dtype", "mode", "contract", "routing", "eplb",
+                          "combine_quant_mode", "resource_mode", "tokens_ladder", "gpus_per_node",
+                          "scale_up_domain", "sampling_contract", "samples_per_point", "iters",
+                          "trials", "warmup", "warmup_semantics"):
+                if field not in case or (field != "tokens_ladder" and case[field] in (None, "")):
+                    errs.append(f"scheduled failed-case missing failure.case.{field}")
+        return errs, warns, "failed"
+    v = doc.get("validity") or {}  # an explicit JSON null must reach the gates below, not crash v.get
+    recorded = doc.get("publication_status")
+    require_sampling = declared_version >= CURRENT_SCHEMA_VERSION
+    sampling_issues = _sampling_contract_issues(doc) if require_sampling else []
+    if require_sampling:
+        observed_sampling = "conformant" if not sampling_issues else "nonconformant"
+        recorded_sampling = v.get("sampling_conformance")
+        if recorded_sampling != observed_sampling:
+            errs.append(f"validity.sampling_conformance={recorded_sampling!r}, but artifact is "
+                        f"{observed_sampling} under {SAMPLING_CONTRACT}")
+    provenance_issues = backend_provenance_issues(doc)
+    if v.get("provenance_complete") and provenance_issues:
+        errs.append("validity.provenance_complete=true with unresolved backend identity: "
+                    + ", ".join(provenance_issues))
+    errs.extend(topology_issues(doc))
+    derived = derive_publication_status(v, require_sampling=require_sampling)
+    if recorded != derived:
+        errs.append(f"publication_status '{recorded}' != machine-derived '{derived}' (validity tampered or stale)")
+    # byte + contract + sample gates
+    if doc.get("measurement_contract") not in KNOWN_CONTRACTS:
+        errs.append(f"unknown measurement_contract {doc.get('measurement_contract')}")
+    rows = doc.get("rows") or []  # tolerate "rows": null
+    for r in rows:
+        if "byte_contracts" not in r:
+            errs.append(f"T={r.get('tokens_per_rank')}: missing byte_contracts")
+            break
+        missing_op = next((op for op in ("dispatch", "combine", "roundtrip") if "p99" not in (r.get(op) or {})), None)
+        if missing_op:  # only the first op lacking percentiles is reported per row
+            errs.append(f"T={r.get('tokens_per_rank')}: missing {missing_op} percentiles")
+    # anomaly self-consistency (goal P1-e): validity.anomaly_free must equal (no anomalies or waived).
+    anoms = doc.get("anomalies") or []
+    waived = (doc.get("anomaly_summary") or {}).get("waived", False)
+    expect_anomaly_free = (len(anoms) == 0) or bool(waived)
+    if v.get("anomaly_free", True) != expect_anomaly_free:
+        errs.append(f"validity.anomaly_free={v.get('anomaly_free')} but {len(anoms)} anomalies "
+                    f"(waived={waived}) imply {expect_anomaly_free}")
+    if anoms and not waived and recorded not in ("diagnostic", "invalid", "failed"):
+        errs.append(f"{len(anoms)} unwaived timing anomaly(ies) but status={recorded} (must be diagnostic)")
+    if recorded in ("official", "comparable-experimental"):  # both branches no-op without sampling_issues
+        errs.extend(f"comparison-grade sampling violation: {issue}" for issue in sampling_issues)
+    else:
+        warns.extend(f"sampling diagnostic: {issue}" for issue in sampling_issues)
+    # official-grade gates
+    if recorded == "official":
+        if not v.get("provenance_complete"):
+            errs.append("official but provenance_complete=false")
+        if v.get("workload_source") != "canonical-serialized":
+            errs.append("official but workload not canonical-serialized")
+        # goal P1: official requires NON-NULL workload identity (id + signature).
+        wl = doc.get("workload") or {}
+        if not wl.get("workload_id"):
+            errs.append("official but workload_id is null (non-null workload identity required)")
+        if not wl.get("trace_signature"):
+            errs.append("official but trace_signature is null")
+        if anoms and not waived:
+            errs.append("official but has unwaived timing anomalies")
+        if require_sampling:
+            if rows and any(r.get("samples_pooled") != TIMED_SAMPLES_PER_POINT for r in rows):
+                errs.append(f"official but a point does not have exactly {TIMED_SAMPLES_PER_POINT} pooled samples")
+        elif rows and min((r.get("samples_pooled") or 0) for r in rows) < HISTORICAL_V4_MIN_SAMPLES_OFFICIAL:
+            errs.append(f"v4 official but a point has <{HISTORICAL_V4_MIN_SAMPLES_OFFICIAL} pooled samples")
+        if not all(r.get("correct") for r in rows):
+            errs.append("official but a point failed correctness")
+    return errs, warns, recorded
+
+
+def cross_document_workload_issues(docs: list[dict]) -> list[str]:
+    """Find canonical same-workload cells whose realized per-T identity differs.
+
+    Compares only non-failed MoE docs with a canonical-serialized workload; one message per conflict.
+    """
+    observed: dict[tuple, dict[int, set[tuple]]] = {}
+    for doc in docs:
+        if doc.get("family") != "moe" or doc.get("record_type") == "failed-case":
+            continue
+        workload = doc.get("workload") or {}
+        if workload.get("source") != "canonical-serialized":
+            continue
+        shape = doc.get("shape") or {}
+        reproduction = doc.get("reproduction") or {}
+        eplb = doc.get("eplb") or {}
+        key = (
+            doc.get("suite"), doc.get("workload_name"), doc.get("phase"), doc.get("ep_size"),
+            shape.get("hidden"), shape.get("topk"), shape.get("experts"),
+            shape.get("dispatch_dtype"), shape.get("routing"), bool(eplb.get("enabled")),
+            reproduction.get("routing_step", 0), reproduction.get("uneven_tokens", "none"),
+            shape.get("activation_profile", "normal"),
+        )
+        activation_identity = workload.get("activation_identity")
+        mapping_hash = eplb.get("mapping_hash") if eplb.get("enabled") else None
+        for row in doc.get("rows") or []:  # ``or []``: tolerate an explicit "rows": null
+            tokens, routing_hash = row.get("tokens_per_rank"), row.get("routing_hash")
+            if tokens is not None and routing_hash:  # rows without both fields carry no identity
+                identity = (str(routing_hash), activation_identity, mapping_hash)
+                observed.setdefault(key, {}).setdefault(int(tokens), set()).add(identity)
+
+    return [
+        f"canonical workload identity conflict for suite={cell[0]!r} "
+        f"workload={cell[1]!r} phase={cell[2]!r} ep={cell[3]!r} T={tokens}"
+        for cell, per_token in observed.items()
+        for tokens, identities in per_token.items()
+        if len(identities) > 1
+    ]
+
+
+def main() -> int:
+    """CLI entry point: validate EP result JSON files/dirs; return 1 if any problem was found, else 0."""
+    ap = argparse.ArgumentParser(description="CollectiveX EP result validator")
+    ap.add_argument("paths", nargs="+", help="result JSON files or dirs")
+    ap.add_argument("--schema", default="",
+                    help="override with one schema for all docs; blank selects v3-v5 by schema_version")
+    ap.add_argument("--require-official", action="store_true",
+                    help="fail if any non-legacy doc is not 'official'")
+    a = ap.parse_args()
+    if a.schema:
+        with open(a.schema, encoding="utf-8") as fh:  # close the handle deterministically
+            schema = json.load(fh)
+    else:
+        schema = load_schema_registry()
+    files = []
+    for p in a.paths:
+        if os.path.isdir(p):
+            files += glob.glob(os.path.join(p, "**", "*.json"), recursive=True)
+        else:
+            files.append(p)
+    files = sorted(f for f in files if not os.path.basename(f).startswith("env_"))
+
+    # Cross-run workload identity: within a comparison_key the realized routing must be the SAME
+    # workload. Compare routing_hash PER T (not the whole trace_signature) so runs at different ladders
+    # (e.g. capped 1..16 vs full 1..128) are not falsely flagged — only same T + different hash conflicts.
+    by_ck = {}   # ck -> {T: {routing_hash: [files]}}
+    validated_docs = []
+    bad = 0
+    for f in files:
+        try:
+            with open(f, encoding="utf-8") as fh:
+                doc = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            continue
+        if not isinstance(doc, dict) or doc.get("family") != "moe":  # e.g. a top-level JSON list
+            continue
+        validated_docs.append(doc)
+        errs, warns, status = validate_doc(doc, schema, f)
+        # A well-formed failed-case is preserved evidence, not a benchmark validation failure. Its
+        # versioned schema and failure fields are still validated before this reporting shortcut.
+        if doc.get("record_type") == "failed-case":
+            fm = (doc.get("failure") or {}).get("failure_mode", "?")
+            if errs:
+                bad += 1
+                print(f"[FAIL] {os.path.basename(f):70s} status=failed")
+                for e in errs:
+                    print(f"        ERROR: {e}")
+            else:
+                print(f"[FAILED-CASE] {os.path.basename(f):68s} mode={fm}  (preserved, schema-valid evidence)")
+            continue
+        ck = doc.get("comparison_key")
+        # routing_step (temporal) + uneven_tokens change the realized workload but live in reproduction,
+        # NOT in comparison_key — group on them too so such variants aren't flagged as same-config conflicts.
+        repro = doc.get("reproduction") or {}
+        gk = (ck, repro.get("routing_step", 0), repro.get("uneven_tokens", "none")) if ck else None
+        if gk:
+            for r in doc.get("rows") or []:
+                T, rh = r.get("tokens_per_rank"), r.get("routing_hash")
+                if T is not None and rh:
+                    by_ck.setdefault(gk, {}).setdefault(T, {}).setdefault(rh, []).append(os.path.basename(f))
+        tag = "OK" if not errs else "FAIL"
+        if errs:
+            bad += 1
+        # NOTE(review): this also fails legacy docs, although --help says "non-legacy" — confirm intent.
+        if a.require_official and status not in ("official",) and not errs:
+            tag = "FAIL"
+            bad += 1
+            errs = [f"not official (status={status})"]
+        print(f"[{tag}] {os.path.basename(f):70s} status={status}")
+        for e in errs:
+            print(f"        ERROR: {e}")
+        for w in warns:
+            print(f"        note: {w}")
+    # Report cross-run identity CONFLICTS: same comparison_key + same T but DIFFERENT routing hashes.
+    for (ck, step, uneven), perT in by_ck.items():
+        conflicts = {T: hs for T, hs in perT.items() if len(hs) > 1}
+        if conflicts:
+            bad += 1
+            print(f"[FAIL] comparison_key {str(ck)[:12]} (step={step},uneven={uneven}): per-T routing-hash CONFLICT — not the same workload:")
+            for T, hs in sorted(conflicts.items()):
+                print(f"        T={T}: " + "; ".join(f"{str(h)[:10]}=[{', '.join(fs)}]" for h, fs in hs.items()))
+    for issue in cross_document_workload_issues(validated_docs):
+        bad += 1
+        print(f"[FAIL] {issue}")
+    print(f"\n{'FAILED' if bad else 'PASS'}: {len(files)} files, {bad} problem(s)")
+    return 1 if bad else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main() returns 1 on any problem, else 0; use it as the process exit status