Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions examples/specdec_bench/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ def run_simple(args):
f"or extend _MAX_SEQ_LEN_KEY in run.py."
)
engine_args[key] = args.max_seq_len
if args.max_num_tokens is not None:
if args.engine != "TRTLLM":
raise ValueError("--max_num_tokens is currently only wired for --engine TRTLLM.")
engine_args["max_num_tokens"] = args.max_num_tokens
sampling_kwargs = args.runtime_params.get("sampling_kwargs", {"temperature": 0})
if args.temperature is not None:
sampling_kwargs["temperature"] = args.temperature
Expand Down Expand Up @@ -349,6 +353,16 @@ def run_simple(args):
"throughput_32k split (32K input + 4K output + 4K headroom)."
),
)
parser.add_argument(
"--max_num_tokens",
type=int,
required=False,
default=None,
help=(
"TRT-LLM max batched tokens. Overrides engine_args.max_num_tokens "
"from --runtime_params for --engine TRTLLM."
),
)
parser.add_argument(
"--output_length", type=int, required=False, default=4096, help="Output length"
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# SPEED-Bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 via TRT-LLM.
#
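# Nemotron-3-Super-120B-A12B is a MoE model with 120B total params (12B
# active per token). At BF16 (2 bytes/param) the weights alone are ~240 GB,
# so tp_size=4 is the minimum on 80 GB H100/A100. Size by total expert
# storage, not active params.
# NOTE(review): this checkpoint is NVFP4-quantized, so the BF16 figure is an
# upper bound on weight memory — confirm tp_size=4 is still the intended floor.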
#
# Slurm run — each sweep cell overrides its own knobs via pipeline.task_N.args+=[...]:
#
# uv run slurm.py \
# --yaml modules/Model-Optimizer/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml \
# --yes detach=true \
# pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/qualitative","--draft_length 7"] \
# pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/throughput_32k","--num_requests 80","--draft_length 7"]

job_name: NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4_specdec_bench_mtp_trtllm

pipeline:
global_vars:
hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4

# task_0: SPEED qualitative split
task_0:
script: common/specdec_bench/quick_check.sh
args:
- --dataset speed
- --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
- --engine TRTLLM
- --speculative_algorithm MTP
- --draft_length 3
- --tp_size 4
- --ep_size 1
- --concurrency 32
- --output_length 4096
- --aa_timing
- --show_progress
- --save_dir /scratchspace/{sweep_name_default}/qualitative
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
- HF_LOCAL: /hf-local
- TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 4
gpus_per_node: 4
container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

# task_1: SPEED throughput_32k split
task_1:
script: common/specdec_bench/quick_check.sh
args:
- --dataset speed
- --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
- --engine TRTLLM
- --speculative_algorithm MTP
- --draft_length 3
- --tp_size 4
- --ep_size 1
- --concurrency 8
- --num_requests 80
- --output_length 4096
- --aa_timing
- --show_progress
- --save_dir /scratchspace/{sweep_name_default}/throughput_32k
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
- HF_LOCAL: /hf-local
- TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 4
gpus_per_node: 4
container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
Loading