diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index bdb6a4602..486264648 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -12609,11 +12609,11 @@ minimaxm3-fp4-b300-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 2 } - - { tp: 4, conc-start: 1, conc-end: 2 } + - { tp: 4, conc-start: 1, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 64, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3944e67c5..13e964bf2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4433,3 +4433,9 @@ - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts." - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001 + +- config-keys: + - minimaxm3-fp4-b300-vllm + description: + - "Add high concurrency configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1994