NVIDIA · ChenhanYu · Jun 14, 2026 · Jun 14, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -178,6 +178,10 @@ def run_simple(args):
                 f"or extend _MAX_SEQ_LEN_KEY in run.py."
             )
         engine_args[key] = args.max_seq_len
+    if args.max_num_tokens is not None:
+        if args.engine != "TRTLLM":
+            raise ValueError("--max_num_tokens is currently only wired for --engine TRTLLM.")
+        engine_args["max_num_tokens"] = args.max_num_tokens
     sampling_kwargs = args.runtime_params.get("sampling_kwargs", {"temperature": 0})
     if args.temperature is not None:
         sampling_kwargs["temperature"] = args.temperature
@@ -349,6 +353,16 @@ def run_simple(args):
             "throughput_32k split (32K input + 4K output + 4K headroom)."
         ),
     )
+    parser.add_argument(
+        "--max_num_tokens",
+        type=int,
+        required=False,
+        default=None,
+        help=(
+            "TRT-LLM max batched tokens. Overrides engine_args.max_num_tokens "
+            "from --runtime_params for --engine TRTLLM."
+        ),
+    )
     parser.add_argument(
         "--output_length", type=int, required=False, default=4096, help="Output length"
     )

@@ -0,0 +1,74 @@
+# SPEED-Bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 via TRT-LLM.
+#
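+# Nemotron-3-Super-120B-A12B is a 120B-parameter MoE model (12B active per
+# token). At BF16 the weights total 240 GB, hence tp_size=4 minimum on 80 GB
+# H100/A100. Size memory by total expert storage, not by active params.
+# NOTE(review): this checkpoint is NVFP4, so the BF16 figure is an upper bound
+# on weight size — confirm the tp_size=4 rationale for the quantized weights.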
+#
+# Slurm run — each sweep cell overrides its own knobs by appending to pipeline.task_N.args:
+#
+#   uv run slurm.py \
+#     --yaml modules/Model-Optimizer/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml \
+#     --yes detach=true \
+#     pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/qualitative","--draft_length 7"] \
+#     pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/throughput_32k","--num_requests 80","--draft_length 7"]
+
+job_name: NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4_specdec_bench_mtp_trtllm
+
+pipeline:
+  global_vars:
+    hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
+
+  # task_0: SPEED qualitative split
+  task_0:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine TRTLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+      - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # task_1: SPEED throughput_32k split
+  task_1:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine TRTLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 8
+      - --num_requests 80
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+      - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10