SemiAnalysisAI · Oseltamivir · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,236 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Manually dispatched one-off diagnostics only; promoted v1 coverage uses collectivex-sweep.yml.
+
+on:
+  workflow_dispatch:
+    inputs:
+      sku:
+        # Only SKUs with a matching launchers/launch_<prefix>.sh are offered — the launch step
+        # runs launch_<runner.name up to its first '_'>.sh, so an SKU without one fails.
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
+        type: choice
+        default: gb200
+        options: [gb200, b200-dgxc, b200-multinode, mi355x, mi300x, mi325x, h100-dgxc, h200, b300, gb300]
+      benchmark:
+        # mori runs only on mi355x; nccl/deepep/uccl/all + the collective benches on NVIDIA SKUs.
+        # offload/copy-engine/kv-cache are single-process memcpy-family collectives (family!=moe).
+        description: Which benchmark to run
+        type: choice
+        default: nccl
+        options: [nccl, deepep, deepep-hybrid, mori, uccl, nccl-ep, flashinfer, flashinfer-combine-fp8, flashinfer-combine-fp8-directcast, flashinfer-combine-nvfp4, nixl, mori-io, nccl-kv, mooncake, offload, copy-engine, kv-cache, rl-mesh, allreduce-fw, allreduce-fw-vllm, all]
+      ops:
+        description: NCCL ops (space-separated); blank = default set
+        type: string
+        default: ''
+      min_bytes:
+        description: nccl-tests min message size
+        type: string
+        default: '8'
+      max_bytes:
+        description: nccl-tests max message size
+        type: string
+        default: '8G'
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ''
+      phase:
+        # EP only: 'both' fans out to one job per phase (decode + prefill); benchmark=nccl ignores this and runs a single 'na' job.
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        default: both
+        options: [both, decode, prefill]
+      timing:
+        # Combined timing knobs "iters:trials:warmup" (GitHub caps workflow_dispatch at 25 inputs,
+        # so these share one). fixed-512-v1 requires this exact profile on every SKU/backend.
+        description: 'EP timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32)'
+        type: string
+        default: '8:64:32'
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ''
+      dispatch_dtype:
+        description: EP dispatch payload precision (fp8 scale-layout recipes + FlashInfer OCP-microscaling mxfp8/nvfp4)
+        type: choice
+        default: bf16
+        options: [bf16, fp8, fp8-pertoken, fp8-directcast, mxfp8, mxfp4, nvfp4]
+      mode:
+        # LL is retained for manual diagnostics only; it is not a promoted v1 dimension.
+        description: EP kernel path (LL is diagnostic only)
+        type: choice
+        default: normal
+        options: [normal, ll]
+      resource_mode:
+        # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples);
+        # tuned = each backend's own recommended/default launch config. NOTE(review): 'default' is also selectable but undocumented here — confirm how it differs from tuned.
+        description: Comm resource regime
+        type: choice
+        default: tuned
+        options: [normalized, tuned, default]
+      contract:
+        # [cl]/[rv] (presumably cached-layout-comm-only-v1 / runtime-visible-v1 — confirm) are retained for explicit diagnostics, never promoted v1 comparisons.
+        description: Measurement contract (non-default contracts are diagnostic only)
+        type: choice
+        default: layout-and-dispatch-v1
+        options: [layout-and-dispatch-v1, cached-layout-comm-only-v1, runtime-visible-v1]
+      routing:
+        # v1 schedules uniform and zipf only. The remaining choices are one-off diagnostics.
+        description: EP routing distribution
+        type: choice
+        default: uniform
+        options: [uniform, zipf, balanced, balanced-rank-local, hotspot-single]
+      eplb:
+        # EPLB = replicate hot experts + balanced-place (the remedy for skewed routing). A pure
+        # routing-trace transform; experts -> num_logical+redundant. Meaningful with zipf*.
+        description: Apply EPLB expert replication/placement
+        type: boolean
+        default: false
+      canonical:
+        # Consume a CANONICAL serialized workload (generated deterministically in-container) instead
+        # of seeded-runtime. A canonical-serialized run with full GHA provenance is publication
+        # 'official' — this is the switch that promotes a cohort past comparable-experimental.
+        description: Use canonical serialized workload (official-grade workload identity)
+        type: boolean
+        default: false
+      activation_profile:
+        # Activation VALUE distribution of expert inputs. normal = headline; the others stress a
+        # future quantized combine (latency-neutral under bf16 — the expected null result).
+        description: Activation value profile
+        type: choice
+        default: normal
+        options: [normal, zeros, small-amplitude, wide-dynamic-range, fp8-saturation]
+      sm_fraction:
+        # normalized comm-resource fraction (DeepEP sm_fraction*SMs / MoRI ~*CUs). Sweep this with
+        # resource_mode=normalized to build the resource-Pareto (latency vs comm fraction). Blank =
+        # harness default 0.18.
+        description: Normalized comm-resource fraction (resource_mode=normalized)
+        type: string
+        default: ''
+      hidden:
+        # Manual shape override. Blank = deepseek-v3-v1 default 7168.
+        description: MoE hidden dim (model-derived workloads); blank = 7168
+        type: string
+        default: ''
+      topk:
+        description: MoE top-k (model-derived workloads); blank = 8
+        type: string
+        default: ''
+      experts:
+        description: MoE total experts (model-derived workloads); blank = 256
+        type: string
+        default: ''
+      uneven_tokens:
+        # Manual diagnostic only; not a promoted v1 dimension.
+        description: Uneven source-token allocation
+        type: choice
+        default: none
+        options: [none, linear, empty-rank]
+
+concurrency:
+  # Group per (SKU + FULL config): GitHub keeps only one running + one pending per group and
+  # cancels the rest, so a coarse per-SKU group made a fan-out of many configs on one SKU
+  # self-cancel down to ~2. Including dtype/mode/contract/routing/eplb/phase gives each config
+  # its OWN group -> all configs survive; they queue only on the runner's own capacity, not on
+  # GitHub concurrency. cancel-in-progress FALSE so a re-dispatch of the SAME config queues.
+  # Resource/value axes plus canonical, timing, tokens_ladder and the NCCL ops/min/max sizes are in the group too, so dispatches that differ ONLY in one of those do not share a group and self-cancel.
+  group: collectivex-${{ github.ref }}-${{ inputs.sku }}-${{ inputs.benchmark }}-${{ inputs.dispatch_dtype }}-${{ inputs.mode }}-${{ inputs.contract }}-${{ inputs.routing }}-${{ inputs.eplb }}-${{ inputs.phase }}-${{ inputs.resource_mode }}-${{ inputs.sm_fraction }}-${{ inputs.activation_profile }}-${{ inputs.hidden }}-${{ inputs.topk }}-${{ inputs.experts }}-${{ inputs.uneven_tokens }}-${{ inputs.nodes }}-${{ inputs.canonical }}-${{ inputs.timing }}-${{ inputs.tokens_ladder }}-${{ inputs.ops }}-${{ inputs.min_bytes }}-${{ inputs.max_bytes }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner pool (h200 is remapped to h200-dgxc below).
+  dispatch:
+    # The bare `h200` label spans TWO clusters: 14 h200-dgxc runners (login-0; the EP
+    # path is validated there) and 2 h200-cw (CoreWeave) runners that have no
+    # launch_h200-cw.sh and die exit 127. Pin h200 to the h200-dgxc pool so every
+    # dispatch lands where the launcher + FS + partition are known-good. Other SKUs are
+    # single-pool, so pass the sku through unchanged.
+    runs-on: ${{ inputs.sku == 'h200' && 'h200-dgxc' || inputs.sku }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        # nccl/rccl are collective primitives — phase is meaningless, so run ONE job with phase 'na' (not
+        # the same work twice). Any other benchmark: 'both' -> decode + prefill; else a single job. NOTE(review): 'rccl' is not a benchmark choice, so only the nccl comparison can ever match.
+        phase: ${{ fromJSON((inputs.benchmark == 'nccl' || inputs.benchmark == 'rccl') && '["na"]' || (inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase))) }}
+    env:
+      # flashinfer-combine-{fp8,nvfp4} = the flashinfer EP backend with a QUANTIZED COMBINE OUTPUT
+      # (MXFP8 e4m3+e8m0, or NVFP4 e2m1, via the flashinfer-main moe_a2a_combine output_dtype). Map to
+      # CX_BENCH=flashinfer + CX_COMBINE_DTYPE (run_flashinfer_suite builds flashinfer-main when
+      # CX_COMBINE_DTYPE!=bf16). Input-cap-safe (a benchmark CHOICE, not a new input).
+      CX_BENCH: ${{ startsWith(inputs.benchmark, 'flashinfer-combine') && 'flashinfer' || (inputs.benchmark == 'allreduce-fw-vllm' && 'allreduce-fw' || inputs.benchmark) }}
+      # allreduce-fw-vllm = the framework all-reduce bench in a vLLM container (container switch for
+      # the vLLM custom-AR, goal 215) — set CX_IMAGE to a vLLM cuda image; the launcher uses CX_IMAGE
+      # when non-empty, else cx_default_image. Input-cap-safe (a benchmark CHOICE). NOTE(review): ':latest' is a floating tag — consider pinning a version/digest so runs stay reproducible.
+      CX_IMAGE: ${{ inputs.benchmark == 'allreduce-fw-vllm' && 'vllm/vllm-openai:latest' || '' }}
+      # startsWith catches both flashinfer-combine-fp8 and -fp8-directcast (both fp8 combine output;
+      # the -directcast variant differs only in CX_QC_SCALE=scalar below — a single output_scalar_scale,
+      # no per-block scales = the unscaled direct-cast fp8 combine).
+      CX_COMBINE_DTYPE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'bf16') }}
+      CX_COMBINE_QUANT_MODE: ${{ startsWith(inputs.benchmark, 'flashinfer-combine-fp8') && 'fp8' || (inputs.benchmark == 'flashinfer-combine-nvfp4' && 'nvfp4' || 'none') }}
+      CX_QC_SCALE: ${{ inputs.benchmark == 'flashinfer-combine-fp8-directcast' && 'scalar' || '' }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
+      CX_MODE: ${{ inputs.mode }}
+      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
+      CX_MEASUREMENT_CONTRACT: ${{ inputs.contract }}
+      CX_ROUTING: ${{ inputs.routing }}
+      CX_EPLB: ${{ inputs.eplb && '1' || '' }}
+      # Canonical serialized workload (official-grade identity) + value diagnostics.
+      CX_CANONICAL: ${{ inputs.canonical && '1' || '' }}
+      CX_ACTIVATION_PROFILE: ${{ inputs.activation_profile }}
+      CX_SM_FRACTION: ${{ inputs.sm_fraction }}
+      # Manual shape and uneven-allocation diagnostics.
+      CX_HIDDEN: ${{ inputs.hidden }}
+      CX_TOPK: ${{ inputs.topk }}
+      CX_EXPERTS: ${{ inputs.experts }}
+      CX_UNEVEN_TOKENS: ${{ inputs.uneven_tokens }}
+      CX_TIMING: ${{ inputs.timing }}
+      # GHA run provenance: run_ep records git_run (repo/run/attempt/ref/sha/job) -> a GHA result
+      # is provenance_complete (publication_status >= comparable-experimental, official w/ canonical).
+      COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+      COLLECTIVEX_ARTIFACT_NAME: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
+      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      # MI355X: pin to the warm-squash, writable nodes.
+      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      # Reject an unsupported backend/SKU/mode/dtype/contract BEFORE consuming the runner
+      # (review #3): fail fast on the login node, not after a salloc. 'all' fans out per
+      # vendor in-container, so skip the single-combo check for it. Arguments are read from env vars
+      # (the job-level CX_* values, single source of truth) instead of inline expressions in the script.
+      - name: Validate capability
+        if: inputs.benchmark != 'all'
+        env:
+          INPUT_SKU: ${{ inputs.sku }}
+        run: |
+          python3 experimental/CollectiveX/tests/capability.py \
+            --sku "$INPUT_SKU" --backend "$CX_BENCH" --mode "$CX_MODE" --dtype "$CX_DISPATCH_DTYPE" \
+            --contract "$CX_MEASUREMENT_CONTRACT" \
+            --combine-dtype "$CX_COMBINE_DTYPE" --combine-quant-mode "$CX_COMBINE_QUANT_MODE"
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
+        env:  # runner.name is passed via env so bash can drop everything from its first '_' and pick launch_<prefix>.sh
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()  # runs even when an earlier step failed, so any results already written are still summarized
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results
+        if: always()  # upload whatever JSON exists even when the launch step failed
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: ${{ env.COLLECTIVEX_ARTIFACT_NAME }}  # reuse the job-level value exported to the launcher so the two names cannot drift
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn