diff --git a/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py b/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py index bff36397af..394435e16b 100644 --- a/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py +++ b/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py @@ -112,7 +112,10 @@ def generate_maxtext_config(vllm_config: VllmConfig) -> pyconfig.HyperParameters if hasattr(vllm_config.model_config.hf_config, "text_config") else vllm_config.model_config.hf_config ) - hidden_size = getattr(hf_config, "moe_intermediate_size", None) + hidden_size = ( + getattr(hf_config, "moe_intermediate_size", None) + or getattr(hf_config, "intermediate_size", None) + ) num_lanes = pltpu.get_tpu_info().num_lanes num_kv_heads = hf_config.num_key_value_heads diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py index 020956098c..c0f7c58a49 100644 --- a/src/maxtext/layers/moe.py +++ b/src/maxtext/layers/moe.py @@ -2448,6 +2448,8 @@ def fused_moe_matmul( # Map MaxText config fields to fused_moe_func args activation = self.config.mlp_activations[0] # e.g. "silu" + if activation == "sigmoid": + activation = "swigluoai" scoring_fn = self.config.routed_score_func if self.config.routed_score_func else "softmax" # Check if the model architecture intrinsically renormalizes weights