diff --git a/aphrodite/metal/compat.py b/aphrodite/metal/compat.py index 4dda3f30d3..5ae4589a24 100644 --- a/aphrodite/metal/compat.py +++ b/aphrodite/metal/compat.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compatibility patches for vLLM + transformers version mismatches. +"""Compatibility patches for Aphrodite + transformers version mismatches. Applied once at platform registration time. Optional missing dependencies are logged; unexpected runtime errors are allowed to surface so regressions remain @@ -146,8 +146,7 @@ def _stack_qwen36_moe_per_expert_weights( and ``...mlp.experts.down_proj``, both stacked along axis 0 over experts. Mirrors the (scan -> validate -> walk) structure of upstream - ml-explore/mlx-lm#1224. Removable once vllm-metal's mlx-lm pin bumps - past that merge. + ml-explore/mlx-lm#1224. No-op when no per-expert keys are present (dense Qwen3.5/3.6 or already- stacked MoE checkpoints). diff --git a/aphrodite/metal/config.py b/aphrodite/metal/config.py index ac84ad9088..2cc3396e30 100644 --- a/aphrodite/metal/config.py +++ b/aphrodite/metal/config.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Configuration for vLLM Metal plugin via environment variables.""" +"""Configuration for Aphrodite Metal plugin via environment variables.""" import os from dataclasses import dataclass @@ -37,7 +37,7 @@ @dataclass class MetalConfig: - """Configuration for vLLM Metal plugin.""" + """Configuration for Aphrodite Metal plugin.""" memory_fraction: float # -1.0 means "auto" (calculate minimal needed) use_mlx: bool diff --git a/aphrodite/metal/envs.py b/aphrodite/metal/envs.py index 2c3462834e..ca09050a70 100644 --- a/aphrodite/metal/envs.py +++ b/aphrodite/metal/envs.py @@ -1,16 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -"""Environment variable definitions for the vLLM Metal plugin. +"""Environment variable definitions for the Aphrodite Metal plugin. 
This module is the single source of truth for all ``APHRODITE_METAL_*`` (and ``APHRODITE_MLX_*``) environment variables. It mirrors the lazy-evaluation -pattern used by ``vllm/envs.py``: each variable is read from +pattern used by ``aphrodite/envs.py``: each variable is read from ``os.environ`` on access via ``__getattr__``, so values are never stale and ``monkeypatch.setenv`` works in tests without extra resets. During plugin registration (``aphrodite.metal._register``), the ``environment_variables`` dict is merged into ``aphrodite.envs.environment_variables`` so that ``validate_environ()`` -recognises our variables and does not emit spurious "Unknown vLLM +recognises our variables and does not emit spurious "Unknown Aphrodite environment variable" warnings. """ @@ -84,5 +84,5 @@ def __getattr__(name: str) -> Any: def __dir__() -> list[str]: - # Mirrors vllm/envs.py; enables tab-completion and introspection. + # Mirrors aphrodite/envs.py; enables tab-completion and introspection. return list(environment_variables.keys()) diff --git a/aphrodite/metal/metal_backend.py b/aphrodite/metal/metal_backend.py index 2098c26b89..090cd0d282 100644 --- a/aphrodite/metal/metal_backend.py +++ b/aphrodite/metal/metal_backend.py @@ -18,7 +18,7 @@ class MetalBackend(AttentionBackend): block_size, and the hybrid-block-size math via Platform._align_hybrid_block_size) can read Metal's MultipleOf(16) alignment constraint. The Metal paged-attention kernels are tuned for - block_size=16; advertising MultipleOf(16) makes vLLM's selector default + block_size=16; advertising MultipleOf(16) makes Aphrodite's selector default to 16 and lets hybrid models align to multiples of 16. It is never dispatched to as a real attention backend — the actual Metal paged attention lives in metal_kernel_backend/paged_attention.py. 
The diff --git a/aphrodite/metal/metal_kernel_backend/attention_sdpa.py b/aphrodite/metal/metal_kernel_backend/attention_sdpa.py index a6bf34effa..e63c6f7686 100644 --- a/aphrodite/metal/metal_kernel_backend/attention_sdpa.py +++ b/aphrodite/metal/metal_kernel_backend/attention_sdpa.py @@ -110,7 +110,7 @@ def _build_block_tables( """Build kernel-compatible block tables, translating if necessary. When ``cache_block_size`` exceeds the kernel's compiled block sizes, - each vLLM block ``b`` is expanded into ``ratio`` kernel blocks + each Aphrodite block ``b`` is expanded into ``ratio`` kernel blocks ``[b*ratio, b*ratio+ratio)``. The cache is reshaped later to match (zero-copy). @@ -136,7 +136,7 @@ def _build_block_tables( return result # Hybrid path — translate large block_size to a kernel-compatible one. - # Vectorized: each vLLM block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1]. + # Vectorized: each Aphrodite block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1]. kernel_bs = _pick_kernel_block_size(cache_block_size) ratio = cache_block_size // kernel_bs @@ -447,10 +447,10 @@ def sdpa_forward( max_seq_len = ctx.max_context_len or max(ctx.context_lens) # --- Block tables (with hybrid block-size translation) --- - # vLLM may inflate block_size (e.g. 544) to align attention pages with + # Aphrodite may inflate block_size (e.g. 544) to align attention pages with # mamba pages in hybrid models. The Metal kernel only supports small # block sizes (8, 16, 32). _build_block_tables handles the translation: - # it expands each vLLM block into multiple kernel blocks and returns the + # it expands each Aphrodite block into multiple kernel blocks and returns the # kernel-compatible block_size. The cache is reshaped to match (zero-copy). 
block_tables, kernel_block_size = _build_block_tables(ctx, kv_cache.block_size) diff --git a/aphrodite/metal/multimodal/__init__.py b/aphrodite/metal/multimodal/__init__.py index a9f7e1583a..482d716b17 100644 --- a/aphrodite/metal/multimodal/__init__.py +++ b/aphrodite/metal/multimodal/__init__.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Generic multimodal helpers for vLLM Metal.""" +"""Generic multimodal helpers for Aphrodite Metal.""" from __future__ import annotations diff --git a/aphrodite/metal/multimodal/embeddings.py b/aphrodite/metal/multimodal/embeddings.py index 40e9aafde4..acdd8b2ff8 100644 --- a/aphrodite/metal/multimodal/embeddings.py +++ b/aphrodite/metal/multimodal/embeddings.py @@ -16,7 +16,7 @@ def merge_multimodal_embeddings( ) -> mx.array: """Splice multimodal embeddings into placeholder positions. - Mirrors ``vllm/model_executor/models/utils.py`` + Mirrors ``aphrodite/model_executor/models/utils.py`` ``_merge_multimodal_embeddings`` for MLX arrays. Returns a new array; ``inputs_embeds`` is not mutated. """ diff --git a/aphrodite/metal/multimodal/qwen3_vl/adapter.py b/aphrodite/metal/multimodal/qwen3_vl/adapter.py index d16c5bcbd5..41577e1c2c 100644 --- a/aphrodite/metal/multimodal/qwen3_vl/adapter.py +++ b/aphrodite/metal/multimodal/qwen3_vl/adapter.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Qwen3-VL multimodal adapter for vLLM Metal.""" +"""Qwen3-VL multimodal adapter for Aphrodite Metal.""" from __future__ import annotations @@ -30,10 +30,10 @@ def get_mrope_input_positions( ) -> tuple[mx.array, int]: """Return ``((3, seq_len) int32 positions, mrope_position_delta)``. - Calls upstream vLLM's mm_features-driven Qwen3-VL M-RoPE helper with a + Calls upstream Aphrodite's mm_features-driven Qwen3-VL M-RoPE helper with a minimal image-only config shim, then converts the returned torch tensor to an MLX array. 
This keeps the position-building policy upstream-owned - while the vllm-metal runner can consume MLX arrays. + while the Aphrodite Metal runner can consume MLX arrays. """ if not input_tokens: return mx.zeros((3, 0), dtype=mx.int32), 0 diff --git a/aphrodite/metal/platform.py b/aphrodite/metal/platform.py index 7e1ab6cfc5..567a956aa8 100644 --- a/aphrodite/metal/platform.py +++ b/aphrodite/metal/platform.py @@ -220,10 +220,10 @@ def get_torch_device(cls, device_id: int = 0) -> torch.device: @classmethod def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None: - """Check and update vLLM configuration for Metal compatibility. + """Check and update Aphrodite configuration for Metal compatibility. Args: - aphrodite_config: vLLM configuration object + aphrodite_config: Aphrodite configuration object """ config = get_config() parallel_config = aphrodite_config.parallel_config @@ -231,13 +231,10 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None: compilation_config = aphrodite_config.compilation_config # Metal execution is MLX-backed. Torch Inductor/CUDAGraph settings do - # not apply to the actual model path, so normalize them here rather - # than requiring users to pass --enforce-eager. + # not apply to the actual model path, so disable those compilation + # surfaces without overriding the user's eager-mode flag here. from aphrodite.config.compilation import CompilationMode, CUDAGraphMode - if model_config is not None and not model_config.enforce_eager: - logger.info("Metal: forcing eager mode; torch.compile/CUDAGraphs are not used on MLX.") - model_config.enforce_eager = True compilation_config.mode = CompilationMode.NONE compilation_config.cudagraph_mode = CUDAGraphMode.NONE compilation_config.max_cudagraph_capture_size = 0 @@ -385,7 +382,7 @@ def _find_non_ssm_backend( def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> None: """Update block_size for Metal platform. 
- Delegates to vLLM's base implementation, which reads the Metal kernel + Delegates to Aphrodite's base implementation, which reads the Metal kernel alignment (MultipleOf(16)) from our :meth:`_find_non_ssm_backend` override. Adds a one-time warning when paged attention is enabled for a hybrid model, explaining the cache-block-size translation mechanism @@ -403,16 +400,16 @@ def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> N # block-size translation mechanism. # # Background: - # - vLLM requires block_size=160 (or larger) for hybrid models to satisfy + # - Aphrodite requires block_size=160 (or larger) for hybrid models to satisfy # page size divisibility validation between SDPA and Mamba layers. # - # Solution (PR #235): - # - vLLM sees a large block_size (e.g., 144 = 16 * 9) for its scheduler + # Solution: + # - Aphrodite sees a large block_size (e.g., 144 = 16 * 9) for its scheduler # validation. # - The Metal kernel uses a translated block_size (16, the kernel sweet # spot) that it supports. - # - Each vLLM block is split into ratio = cache_block_size / kernel_block_size - # kernel blocks. For example, one vLLM block of 144 tokens becomes 9 kernel + # - Each Aphrodite block is split into ratio = cache_block_size / kernel_block_size + # kernel blocks. For example, one Aphrodite block of 144 tokens becomes 9 kernel # blocks of 16 tokens each. # - The KV cache is reshaped (zero-copy) to match: [num_blocks, 144, ...] → # [num_blocks*9, 16, ...]. The physical memory layout is unchanged. @@ -423,17 +420,17 @@ def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> N if model_config.is_hybrid and metal_config.use_paged_attention: logger.warning( "Hybrid model (e.g., Qwen3.5) with paged attention enabled. 
" - "Using block-size translation (PR #235) to convert vLLM's large " + "Using block-size translation (PR #235) to convert Aphrodite's large " "block_size to a Metal kernel-compatible size.\n" - " Mechanism: Each vLLM block is split into multiple kernel blocks.\n" - " Example: vLLM block_size=144 → kernel block_size=16 (ratio=9).\n" + " Mechanism: Each Aphrodite block is split into multiple kernel blocks.\n" + " Example: Aphrodite block_size=144 → kernel block_size=16 (ratio=9).\n" " The KV cache is reshaped (zero-copy) and block tables are expanded.\n" " This is a logical transformation — physical memory is unchanged." ) # Delegate the rest to upstream. With our ``_find_non_ssm_backend`` # returning :class:`MetalBackend` (which advertises ``MultipleOf(16)``), - # vLLM's Phase 1 picks a kernel-aligned default of 16 for non-hybrid + # Aphrodite's Phase 1 picks a kernel-aligned default of 16 for non-hybrid # models (matching the kernel sweet spot), and Phase 2 # (``_align_hybrid_block_size``) handles hybrid alignment. The kernel # layer (``_pick_kernel_block_size``) validates the final @@ -453,10 +450,10 @@ def get_attn_backend_cls( if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: logger.info(f"Cannot use {selected_backend} backend on Metal/MLX.") if attn_selector_config.use_mla: - # MLA attention is handled by the vllm-metal model runner (MLAPagedAttentionWrapper), - # not by vLLM's attention backend selector. Continue to return CPU_ATTN below. + # MLA attention is handled by the aphrodite metal model runner (MLAPagedAttentionWrapper), + # not by Aphrodite's attention backend selector. Continue to return CPU_ATTN below. 
logger.info( - "MLA model detected; attention handled by vllm-metal model runner" + "MLA model detected; attention handled by aphrodite metal model runner" ) if attn_selector_config.use_sparse: raise NotImplementedError("Sparse Attention is not supported on Metal/MLX.") diff --git a/aphrodite/metal/profiler/__init__.py b/aphrodite/metal/profiler/__init__.py index addeb5b607..322d191c01 100644 --- a/aphrodite/metal/profiler/__init__.py +++ b/aphrodite/metal/profiler/__init__.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Metal frame-capture profiler for vLLM Metal.""" +"""Metal frame-capture profiler for Aphrodite Metal.""" from aphrodite.metal.profiler.wrapper import MetalProfilerWrapper diff --git a/aphrodite/metal/profiler/wrapper.py b/aphrodite/metal/profiler/wrapper.py index 0688da45cb..e5ce81ac26 100644 --- a/aphrodite/metal/profiler/wrapper.py +++ b/aphrodite/metal/profiler/wrapper.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Metal frame-capture wrapper for vLLM's WorkerProfiler abstraction. +"""Metal frame-capture wrapper for Aphrodite's WorkerProfiler abstraction. Subclasses ``aphrodite.profiler.wrapper.WorkerProfiler`` so that the manual start/stop surface — ``LLM.start_profile`` / ``LLM.stop_profile``, the @@ -35,7 +35,7 @@ class MetalProfilerWrapper(WorkerProfiler): - """Metal frame-capture flavor of vLLM's WorkerProfiler. + """Metal frame-capture flavor of Aphrodite's WorkerProfiler. Trace output: ``/.gputrace`` """ diff --git a/aphrodite/metal/pytorch_backend/tensor_bridge.py b/aphrodite/metal/pytorch_backend/tensor_bridge.py index fd0424b15d..e0a185f101 100644 --- a/aphrodite/metal/pytorch_backend/tensor_bridge.py +++ b/aphrodite/metal/pytorch_backend/tensor_bridge.py @@ -15,7 +15,6 @@ # MPS has a 4GB (2^32 bytes) limit for MPSTemporaryNDArray allocations. # Metal may allocate multiple temporary buffers internally, so we use a # conservative threshold of 1GB to avoid hitting the limit. 
-# See: https://github.com/anthropics/vllm-metal/issues/43 _MPS_SAFE_SIZE_BYTES = 1 << 30 # 1GB # MLX to PyTorch dtype mapping @@ -150,7 +149,6 @@ def mlx_to_torch( tensor = tensor.to(device) else: # Large tensor - keep on CPU to avoid MPS 4GB limit crash - # See: https://github.com/anthropics/vllm-metal/issues/43 logger.debug( "Tensor too large for MPS (%d bytes > %d limit), keeping on CPU", _get_tensor_size_bytes(array), diff --git a/aphrodite/metal/stt/__init__.py b/aphrodite/metal/stt/__init__.py index 638fb425c3..71fe4f5b1d 100644 --- a/aphrodite/metal/stt/__init__.py +++ b/aphrodite/metal/stt/__init__.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Speech-to-Text support for vLLM Metal.""" +"""Speech-to-Text support for Aphrodite Metal.""" from aphrodite.metal.stt.loader import load_model from aphrodite.metal.stt.protocol import TranscriptionResult, TranscriptionSegment diff --git a/aphrodite/metal/stt/policy.py b/aphrodite/metal/stt/policy.py index b192fce4ae..66c5879051 100644 --- a/aphrodite/metal/stt/policy.py +++ b/aphrodite/metal/stt/policy.py @@ -5,15 +5,15 @@ from typing import Protocol -# Nominal memory reported to vLLM scheduler for STT models. +# Nominal memory reported to Aphrodite scheduler for STT models. # No KV cache is actually allocated; this just passes minimum-memory checks. STT_SCHED_AVAILABLE_BYTES = 1 << 30 # 1 GiB -# Block size reported to vLLM for STT models (minimal, no real KV cache). +# Block size reported to Aphrodite for STT models (minimal, no real KV cache). STT_SCHED_BLOCK_BYTES = 1 # Nominal head size for the placeholder KV spec used only to satisfy -# vLLM scheduler initialization for STT models. +# Aphrodite scheduler initialization for STT models. 
STT_SCHED_NOMINAL_HEAD_SIZE = 64 diff --git a/aphrodite/metal/stt/qwen3_asr/adapter.py b/aphrodite/metal/stt/qwen3_asr/adapter.py index e77faa1fc5..b88ff087c4 100644 --- a/aphrodite/metal/stt/qwen3_asr/adapter.py +++ b/aphrodite/metal/stt/qwen3_asr/adapter.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Qwen3-ASR runtime adapter for vLLM STT execution.""" +"""Qwen3-ASR runtime adapter for Aphrodite STT execution.""" from __future__ import annotations diff --git a/aphrodite/metal/stt/qwen3_asr/config.py b/aphrodite/metal/stt/qwen3_asr/config.py index f3e790b447..cf72580751 100644 --- a/aphrodite/metal/stt/qwen3_asr/config.py +++ b/aphrodite/metal/stt/qwen3_asr/config.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Qwen3-ASR configuration (MLX-free). -Keep this module free of MLX imports so vLLM compat code can import config and +Keep this module free of MLX imports so Aphrodite compat code can import config and shape helpers during planning/registration without pulling in the model stack. """ @@ -10,7 +10,7 @@ from dataclasses import dataclass, field from aphrodite.transformers_utils.configs.qwen3_asr import ( - Qwen3ASRConfig as VllmQwen3ASRConfig, + Qwen3ASRConfig as AphroditeQwen3ASRConfig, ) # Maximum decode tokens for Qwen3-ASR decode loop. 
@@ -82,8 +82,8 @@ class Qwen3ASRConfig: n_audio_ctx: int = 1500 @classmethod - def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig: - """Adapt the upstream vLLM/HF config into the local MLX model config.""" + def _from_aphrodite_config(cls, config: AphroditeQwen3ASRConfig) -> Qwen3ASRConfig: + """Adapt the upstream Aphrodite/HF config into the local MLX model config.""" thinker = config.thinker_config audio = thinker.audio_config text = thinker.text_config @@ -131,4 +131,4 @@ def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig: @classmethod def from_dict(cls, d: dict) -> Qwen3ASRConfig: """Create config from config.json using the upstream schema owner.""" - return cls._from_aphrodite_config(VllmQwen3ASRConfig.from_dict(d)) + return cls._from_aphrodite_config(AphroditeQwen3ASRConfig.from_dict(d)) diff --git a/aphrodite/metal/stt/qwen3_asr/model.py b/aphrodite/metal/stt/qwen3_asr/model.py index 09b122505c..044ee04342 100644 --- a/aphrodite/metal/stt/qwen3_asr/model.py +++ b/aphrodite/metal/stt/qwen3_asr/model.py @@ -558,7 +558,7 @@ def decode_step( return self.language_model.forward_embeds(embeds, cache) def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter: - """Create the model-owned runtime adapter used by the vLLM runner.""" + """Create the model-owned runtime adapter used by the Aphrodite runner.""" # Local import: avoid import-time cycles (adapter imports transcriber). from .adapter import Qwen3ASRRuntimeAdapter diff --git a/aphrodite/metal/stt/runtime.py b/aphrodite/metal/stt/runtime.py index cd6c4f6edd..3408210d59 100644 --- a/aphrodite/metal/stt/runtime.py +++ b/aphrodite/metal/stt/runtime.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -"""STT runtime adapter contract used by the vLLM runner. +"""STT runtime adapter contract used by the Aphrodite runner. 
-The vLLM runner delegates STT execution to model-owned runtime adapters under +The Aphrodite runner delegates STT execution to model-owned runtime adapters under `stt/<model>/adapter.py` so shared code does not accumulate per-model branches. """ @@ -26,7 +26,7 @@ class STTRuntimeAdapter(ABC): - """Model-owned bridge between vLLM STT inputs and per-model STT execution. + """Model-owned bridge between Aphrodite STT inputs and per-model STT execution. Concrete implementations live under `stt/<model>/adapter.py` and own: - input_features normalization to the model's expected encoder input shape diff --git a/aphrodite/metal/stt/whisper/adapter.py b/aphrodite/metal/stt/whisper/adapter.py index e398041a25..79a7630d3f 100644 --- a/aphrodite/metal/stt/whisper/adapter.py +++ b/aphrodite/metal/stt/whisper/adapter.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Whisper runtime adapter for vLLM STT execution.""" +"""Whisper runtime adapter for Aphrodite STT execution.""" from __future__ import annotations diff --git a/aphrodite/metal/stt/whisper/model.py b/aphrodite/metal/stt/whisper/model.py index 95857b7c8e..3680eae70f 100644 --- a/aphrodite/metal/stt/whisper/model.py +++ b/aphrodite/metal/stt/whisper/model.py @@ -273,7 +273,7 @@ def __init__(self, config: WhisperConfig, dtype: mx.Dtype = mx.float16): self._alignment_heads = mx.array(np.asarray(all_heads.nonzero()).T) def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter: - """Create the model-owned runtime adapter used by the vLLM runner.""" + """Create the model-owned runtime adapter used by the Aphrodite runner.""" # Local import: avoid import-time cycles (adapter imports transcriber). 
from .adapter import WhisperRuntimeAdapter diff --git a/aphrodite/metal/utils.py b/aphrodite/metal/utils.py index 767cfabb3a..7f716a2c46 100644 --- a/aphrodite/metal/utils.py +++ b/aphrodite/metal/utils.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Metal utility functions for vLLM Metal plugin.""" +"""Metal utility functions for Aphrodite Metal plugin.""" import logging import os @@ -24,7 +24,7 @@ def get_model_download_path(model_repo_name: str) -> str: Example: ```bash - APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache vllm serve Qwen/Qwen2.5-0.5B + APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache aphrodite run Qwen/Qwen2.5-0.5B ``` """ if Path(model_repo_name).exists(): diff --git a/aphrodite/metal/v1/__init__.py b/aphrodite/metal/v1/__init__.py index 5e738d79e1..474186a806 100644 --- a/aphrodite/metal/v1/__init__.py +++ b/aphrodite/metal/v1/__init__.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""vLLM v1 compatibility module for Metal platform.""" +"""Aphrodite v1 compatibility module for Metal platform.""" __all__ = ["MetalWorker"] diff --git a/aphrodite/metal/v1/cache_policy.py b/aphrodite/metal/v1/cache_policy.py index f8df7b7fe1..3efce09b51 100644 --- a/aphrodite/metal/v1/cache_policy.py +++ b/aphrodite/metal/v1/cache_policy.py @@ -66,7 +66,7 @@ class TurboQuantAttentionSpec(FullAttentionSpec): """FullAttentionSpec for TurboQuant-compressed KV cache. Reports the true packed byte count per page via an override of - ``real_page_size_bytes`` so vLLM's scheduler can budget more blocks + ``real_page_size_bytes`` so Aphrodite's scheduler can budget more blocks than the FP16 formula would allow — without lying about ``head_size`` (the ``head_size_v`` reverse-engineering trick the previous version used produced negative values for aggressive 2-bit configs). 
@@ -146,9 +146,9 @@ def _build_turboquant_attention_spec( def _register_turboquant_spec_manager() -> None: - """Register ``TurboQuantAttentionSpec`` in vLLM's spec→manager map. + """Register ``TurboQuantAttentionSpec`` in Aphrodite's spec→manager map. - vLLM's ``get_manager_for_kv_cache_spec`` uses strict-type lookup + Aphrodite's ``get_manager_for_kv_cache_spec`` uses strict-type lookup (``spec_manager_map[type(spec)]``), not ``isinstance``, so the ``FullAttentionSpec`` entry does not cover subclasses. We reuse ``FullAttentionManager`` because a TurboQuant cache is accessed @@ -157,7 +157,7 @@ def _register_turboquant_spec_manager() -> None: inside the Metal kernel). Mirrors the upstream registration for ``MLAAttentionSpec`` (which - vLLM also maps to ``FullAttentionManager``). + Aphrodite also maps to ``FullAttentionManager``). """ try: from aphrodite.v1.core.single_type_kv_cache_manager import ( @@ -165,7 +165,7 @@ def _register_turboquant_spec_manager() -> None: spec_manager_map, ) except ImportError: - # vLLM shape changed; let the scheduler raise its own clearer error. + # Aphrodite shape changed; let the scheduler raise its own clearer error. return spec_manager_map.setdefault(TurboQuantAttentionSpec, FullAttentionManager) diff --git a/aphrodite/metal/v1/contiguous_cache.py b/aphrodite/metal/v1/contiguous_cache.py index 088606a7f2..5da9912091 100644 --- a/aphrodite/metal/v1/contiguous_cache.py +++ b/aphrodite/metal/v1/contiguous_cache.py @@ -320,7 +320,7 @@ def _merge_rotating_kv_caches( mlx-lm <= 0.29.1 uses ``c.offset`` which can exceed the underlying array size after the cache has rotated, causing a broadcast shape error. - This workaround can be removed once vllm-metal can depend on an mlx-lm version + This workaround can be removed once Aphrodite Metal can depend on an mlx-lm version that includes the upstream fix (ml-explore/mlx-lm#738) and has been verified to work with gpt-oss models end-to-end. 
""" diff --git a/aphrodite/metal/v1/mm/__init__.py b/aphrodite/metal/v1/mm/__init__.py index 740928870c..750a441114 100644 --- a/aphrodite/metal/v1/mm/__init__.py +++ b/aphrodite/metal/v1/mm/__init__.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""v1 multimodal runtime helpers for vLLM Metal.""" +"""v1 multimodal runtime helpers for Aphrodite Metal.""" from __future__ import annotations diff --git a/aphrodite/metal/v1/mm/encoder_cache.py b/aphrodite/metal/v1/mm/encoder_cache.py index c03e766815..27f32a0bf9 100644 --- a/aphrodite/metal/v1/mm/encoder_cache.py +++ b/aphrodite/metal/v1/mm/encoder_cache.py @@ -11,7 +11,7 @@ class EncoderCache: """Store multimodal features and MLX encoder outputs by request/hash. - Mirrors upstream vLLM's v1 GPU ``EncoderCache`` with the tensor type + Mirrors upstream Aphrodite's v1 GPU ``EncoderCache`` with the tensor type changed from ``torch.Tensor`` to ``mlx.core.array``. """ @@ -29,7 +29,7 @@ def remove_request(self, req_id: str) -> None: def reset_mm_cache(self) -> None: """Mirror upstream's profiling-cache reset hook.""" - # TODO: Implement when vllm-metal adds profiling-time MM cache state. + # TODO: Implement when aphrodite metal adds profiling-time MM cache state. pass def reset_encoder_cache(self) -> None: diff --git a/aphrodite/metal/v1/model_adapter.py b/aphrodite/metal/v1/model_adapter.py index 17dc24c403..aa8e978e8e 100644 --- a/aphrodite/metal/v1/model_adapter.py +++ b/aphrodite/metal/v1/model_adapter.py @@ -60,11 +60,11 @@ def build_sliding_window_per_layer( """Return per-layer sliding window sizes, or None for no enforcement.""" -# Models/configs that vLLM flags as multimodal but must be loaded via mlx_lm. +# Models/configs that Aphrodite flags as multimodal but must be loaded via mlx_lm. # gemma4: mlx_vlm forward path produces garbled output vs mlx_lm. 
_TEXT_BACKBONE_OVERRIDE_TYPES: frozenset[str] = frozenset({"gemma4"}) # Qwen3.5/Qwen3.6 conditional-generation wrappers expose a multimodal config, -# but vllm-metal only serves them in text-only mode. Route them through +# but Aphrodite Metal only serves them in text-only mode. Route them through # mlx_lm's qwen3_5 text loader; the mlx_vlm wrapper adds multimodal processing # overhead and some local text-only MLX checkpoints do not behave correctly # through the VLM forward path. @@ -135,7 +135,7 @@ def normalize_model_config(self, model_config: ModelConfig) -> None: When the active serve mode routes a multimodal checkpoint through the text-only compatibility path, leaving ``multimodal_config`` populated - causes vLLM to eagerly initialize multimodal processors that the + causes Aphrodite to eagerly initialize multimodal processors that the compatibility path intentionally bypasses. Clearing it here makes ``is_multimodal_model`` ``False`` so the input processor skips that setup. The ``should_force_text_backbone`` predicate is the single diff --git a/aphrodite/metal/v1/model_lifecycle.py b/aphrodite/metal/v1/model_lifecycle.py index c393863d53..f52fb5d5a1 100644 --- a/aphrodite/metal/v1/model_lifecycle.py +++ b/aphrodite/metal/v1/model_lifecycle.py @@ -104,7 +104,7 @@ def _mlx_lm_compatible_model_path(model_name: str): yield model_name return - with TemporaryDirectory(prefix="vllm-metal-mlx-lm-") as tmpdir: + with TemporaryDirectory(prefix="aphrodite-metal-mlx-lm-") as tmpdir: compat_path = Path(tmpdir) for src in model_path.iterdir(): @@ -147,7 +147,7 @@ def load(self) -> None: return model_config = runner.model_config - # vLLM model_config shape varies across backends. + # Aphrodite model_config shape varies across backends. 
hf_config = getattr(model_config, "hf_config", None) is_vlm = bool(getattr(model_config, "is_multimodal_model", False)) if self._model_adapter.should_force_text_backbone(hf_config): diff --git a/aphrodite/metal/v1/model_runner.py b/aphrodite/metal/v1/model_runner.py index 283eb7ccf4..ee7050b5f9 100644 --- a/aphrodite/metal/v1/model_runner.py +++ b/aphrodite/metal/v1/model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -Metal vLLM v1 model runner. +Metal Aphrodite v1 model runner. Orchestration only: coordinates scheduling, dispatch, and output assembly. Model-specific behavior belongs in adapters; backend-specific kernels live in @@ -87,8 +87,8 @@ def _create_request_generator( ) -> torch.Generator | None: """Create a per-request generator for seeded sampling. - vLLM uses a per-request generator only when an explicit seed is provided. - For unseeded sampling, vLLM relies on the global RNG state. + Aphrodite uses a per-request generator only when an explicit seed is provided. + For unseeded sampling, Aphrodite relies on the global RNG state. """ if sampling_params.seed is None: return None @@ -105,7 +105,7 @@ class RequestState: token_ids: list[int] # Length of the original prompt (prefix) within `token_ids`. - # vLLM applies repetition penalties to both prompt+output tokens, but applies + # Aphrodite applies repetition penalties to both prompt+output tokens, but applies # presence/frequency penalties only to generated (output) tokens. prompt_len: int cache: list[AnyCache] # Per-layer caches (KVCache, RotatingKVCache, or ArraysCache) @@ -315,7 +315,7 @@ def _slice_logprobs_row( class MetalModelRunner: """Model runner for MLX-based inference on Metal. - Implements the vLLM v1 model runner interface for Apple Silicon. + Implements the Aphrodite v1 model runner interface for Apple Silicon. Uses true batched decode with BatchKVCache for efficient parallel processing. """ @@ -327,7 +327,7 @@ def __init__( """Initialize model runner. 
Args: - aphrodite_config: vLLM configuration + aphrodite_config: Aphrodite configuration device: PyTorch device (CPU for Metal interop) """ self.aphrodite_config = aphrodite_config @@ -359,7 +359,7 @@ def __init__( self._gdn_free_slots: list[int] = [] self._gdn_needs_materialize = False - # vLLM Sampler for token sampling with temperature, top_k, top_p support + # Aphrodite Sampler for token sampling with temperature, top_k, top_p support self._sampler = Sampler() # Build logits processors (includes custom plugins from entry-points) @@ -375,7 +375,7 @@ def __init__( custom_logitsprocs, ) - # vLLM v1 async scheduling calls sample_tokens after execute_model. + # Aphrodite v1 async scheduling calls sample_tokens after execute_model. # Keep the latest execution output so sample_tokens can return it. self._pending_output: ModelRunnerOutput | None = None @@ -1705,7 +1705,7 @@ def execute_model( ) return None - # Defensive invariant: the vLLM scheduler sets has_structured_output_requests + # Defensive invariant: the Aphrodite scheduler sets has_structured_output_requests # only when at least one SO request is present in the *current* scheduled # batch (not the global queue). Any such request on the paged path must # contribute a paged decode or prefill entry, so has_paged_work() must be @@ -1736,7 +1736,7 @@ def sample_tokens( ) -> ModelRunnerOutput | None: """Wait for GPU forward, sample tokens, and postprocess. - Called by the vLLM v1 engine after ``execute_model`` returns ``None``. + Called by the Aphrodite v1 engine after ``execute_model`` returns ``None``. For the paged path, this is where the actual GPU synchronization, token sampling, and request state updates happen — allowing the scheduler to run while the GPU was computing the forward pass. @@ -1755,7 +1755,7 @@ def sample_tokens( return output # Async scheduling: execute_model may have failed; return None so - # vLLM can surface the original exception. + # Aphrodite can surface the original exception. 
logger.error( "sample_tokens called with no pending state — " "neither _execute_model_state nor _pending_output was set." diff --git a/aphrodite/metal/v1/sampling_batch.py b/aphrodite/metal/v1/sampling_batch.py index 0d02eeb806..d46f1c47fb 100644 --- a/aphrodite/metal/v1/sampling_batch.py +++ b/aphrodite/metal/v1/sampling_batch.py @@ -9,15 +9,15 @@ import mlx.core as mx import torch + +from aphrodite.metal.pytorch_backend.tensor_bridge import mlx_to_torch from aphrodite.sampling_params import SamplingParams from aphrodite.utils.torch_utils import make_tensor_with_pad +from aphrodite.v1.outputs import LogprobsTensors from aphrodite.v1.sample.logits_processor import LogitsProcessors from aphrodite.v1.sample.logits_processor.interface import BatchUpdate from aphrodite.v1.sample.metadata import SamplingMetadata from aphrodite.v1.sample.sampler import Sampler -from aphrodite.v1.outputs import LogprobsTensors - -from aphrodite.metal.pytorch_backend.tensor_bridge import mlx_to_torch GREEDY_TEMPERATURE_EPS = 1e-5 @@ -74,85 +74,46 @@ def __init__( self.logitsprocs = logitsprocs or LogitsProcessors() self.generators = {} if generators is None else generators self.all_greedy = all( - sampling_params.temperature < GREEDY_TEMPERATURE_EPS - for sampling_params in self.sampling_params_list + sampling_params.temperature < GREEDY_TEMPERATURE_EPS for sampling_params in self.sampling_params_list ) self.all_random = not self.all_greedy and all( - sampling_params.temperature >= GREEDY_TEMPERATURE_EPS - for sampling_params in self.sampling_params_list - ) - self.no_top_p = all( - sampling_params.top_p == 1.0 - for sampling_params in self.sampling_params_list + sampling_params.temperature >= GREEDY_TEMPERATURE_EPS for sampling_params in self.sampling_params_list ) + self.no_top_p = all(sampling_params.top_p == 1.0 for sampling_params in self.sampling_params_list) self.no_top_k = all( - not (0 < sampling_params.top_k < self.vocab_size) - for sampling_params in self.sampling_params_list + 
not (0 < sampling_params.top_k < self.vocab_size) for sampling_params in self.sampling_params_list ) self.no_dynatemp = all( - sampling_params.dynatemp_min <= 0.0 - and sampling_params.dynatemp_max <= 0.0 - for sampling_params in self.sampling_params_list - ) - self.no_top_a = all( - sampling_params.top_a <= 0.0 for sampling_params in self.sampling_params_list - ) - self.no_dry = all( - sampling_params.dry_multiplier <= 0.0 + sampling_params.dynatemp_min <= 0.0 and sampling_params.dynatemp_max <= 0.0 for sampling_params in self.sampling_params_list ) + self.no_top_a = all(sampling_params.top_a <= 0.0 for sampling_params in self.sampling_params_list) + self.no_dry = all(sampling_params.dry_multiplier <= 0.0 for sampling_params in self.sampling_params_list) self.no_no_repeat_ngram = all( - sampling_params.no_repeat_ngram_size <= 0 - for sampling_params in self.sampling_params_list - ) - self.no_tfs = all( - sampling_params.tfs >= 1.0 for sampling_params in self.sampling_params_list - ) - self.no_eta_cutoff = all( - sampling_params.eta_cutoff <= 0.0 - for sampling_params in self.sampling_params_list + sampling_params.no_repeat_ngram_size <= 0 for sampling_params in self.sampling_params_list ) + self.no_tfs = all(sampling_params.tfs >= 1.0 for sampling_params in self.sampling_params_list) + self.no_eta_cutoff = all(sampling_params.eta_cutoff <= 0.0 for sampling_params in self.sampling_params_list) self.no_epsilon_cutoff = all( - sampling_params.epsilon_cutoff <= 0.0 - for sampling_params in self.sampling_params_list - ) - self.no_typical_p = all( - sampling_params.typical_p >= 1.0 - for sampling_params in self.sampling_params_list + sampling_params.epsilon_cutoff <= 0.0 for sampling_params in self.sampling_params_list ) + self.no_typical_p = all(sampling_params.typical_p >= 1.0 for sampling_params in self.sampling_params_list) self.no_quadratic = all( - sampling_params.smoothing_factor <= 0.0 - for sampling_params in self.sampling_params_list - ) - self.no_xtc = all( - 
sampling_params.xtc_probability <= 0.0 - for sampling_params in self.sampling_params_list - ) - self.no_top_nsigma = all( - sampling_params.nsigma <= 0.0 for sampling_params in self.sampling_params_list - ) - self.no_mirostat = all( - sampling_params.mirostat_mode == 0 - for sampling_params in self.sampling_params_list - ) - self.no_skew = all( - sampling_params.skew == 0.0 for sampling_params in self.sampling_params_list + sampling_params.smoothing_factor <= 0.0 for sampling_params in self.sampling_params_list ) + self.no_xtc = all(sampling_params.xtc_probability <= 0.0 for sampling_params in self.sampling_params_list) + self.no_top_nsigma = all(sampling_params.nsigma <= 0.0 for sampling_params in self.sampling_params_list) + self.no_mirostat = all(sampling_params.mirostat_mode == 0 for sampling_params in self.sampling_params_list) + self.no_skew = all(sampling_params.skew == 0.0 for sampling_params in self.sampling_params_list) self.no_allowed_token_ids = all( - not sampling_params.allowed_token_ids - for sampling_params in self.sampling_params_list + not sampling_params.allowed_token_ids for sampling_params in self.sampling_params_list ) self.no_bad_words = all( - not sampling_params.bad_words_token_ids - for sampling_params in self.sampling_params_list - ) - self.no_logit_bias = all( - not sampling_params.logit_bias - for sampling_params in self.sampling_params_list + not sampling_params.bad_words_token_ids for sampling_params in self.sampling_params_list ) + self.no_logit_bias = all(not sampling_params.logit_bias for sampling_params in self.sampling_params_list) self.no_logprob_token_ids = all( - sampling_params.logprob_token_ids is None - for sampling_params in self.sampling_params_list + sampling_params.logprob_token_ids is None for sampling_params in self.sampling_params_list ) self.no_penalties = all( sampling_params.frequency_penalty == 0.0 @@ -170,10 +131,7 @@ def can_use_native_greedy( """Return whether MLX argmax matches the requested sampling 
behavior.""" return all( sampling_params.temperature < GREEDY_TEMPERATURE_EPS - and ( - sampling_params.top_k <= 0 - or (vocab_size is not None and sampling_params.top_k >= vocab_size) - ) + and (sampling_params.top_k <= 0 or (vocab_size is not None and sampling_params.top_k >= vocab_size)) and sampling_params.top_p == 1.0 and sampling_params.min_p == 0.0 and sampling_params.top_a == 0.0 @@ -207,10 +165,7 @@ def _make_temperature(self) -> torch.Tensor | None: return None return torch.tensor( - [ - sampling_params.temperature - for sampling_params in self.sampling_params_list - ], + [sampling_params.temperature for sampling_params in self.sampling_params_list], dtype=torch.float32, device=self.device, ) @@ -231,9 +186,7 @@ def _make_top_k(self) -> torch.Tensor | None: return torch.tensor( [ - sampling_params.top_k - if 0 < sampling_params.top_k < self.vocab_size - else self.vocab_size + sampling_params.top_k if 0 < sampling_params.top_k < self.vocab_size else self.vocab_size for sampling_params in self.sampling_params_list ], dtype=torch.int32, @@ -251,10 +204,7 @@ def _make_float_tensor( return None field_name = source_attr or attr return torch.tensor( - [ - float(getattr(sampling_params, field_name)) - for sampling_params in self.sampling_params_list - ], + [float(getattr(sampling_params, field_name)) for sampling_params in self.sampling_params_list], dtype=torch.float32, device=self.device, ) @@ -271,10 +221,7 @@ def _make_int_tensor( return None field_name = source_attr or attr return torch.tensor( - [ - int(getattr(sampling_params, field_name)) - for sampling_params in self.sampling_params_list - ], + [int(getattr(sampling_params, field_name)) for sampling_params in self.sampling_params_list], dtype=dtype, device=self.device, ) @@ -295,26 +242,17 @@ def _make_penalty_tensors( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: frequency_penalties = torch.tensor( - [ - sampling_params.frequency_penalty - for sampling_params in self.sampling_params_list - 
], + [sampling_params.frequency_penalty for sampling_params in self.sampling_params_list], dtype=torch.float32, device=self.device, ) presence_penalties = torch.tensor( - [ - sampling_params.presence_penalty - for sampling_params in self.sampling_params_list - ], + [sampling_params.presence_penalty for sampling_params in self.sampling_params_list], dtype=torch.float32, device=self.device, ) repetition_penalties = torch.tensor( - [ - sampling_params.repetition_penalty - for sampling_params in self.sampling_params_list - ], + [sampling_params.repetition_penalty for sampling_params in self.sampling_params_list], dtype=torch.float32, device=self.device, ) @@ -460,7 +398,7 @@ def can_use_native_random_for_batch(self) -> bool: ) def make_sampling_metadata(self) -> SamplingMetadata: - """Create vLLM ``SamplingMetadata`` for this batch.""" + """Create Aphrodite ``SamplingMetadata`` for this batch.""" self._refresh_logits_processors() ( frequency_penalties, @@ -482,41 +420,23 @@ def make_sampling_metadata(self) -> SamplingMetadata: top_p=self._make_top_p(), top_k=self._make_top_k(), top_a=self._make_float_tensor("top_a", disabled=self.no_top_a), - dry_multiplier=self._make_float_tensor( - "dry_multiplier", disabled=self.no_dry - ), + dry_multiplier=self._make_float_tensor("dry_multiplier", disabled=self.no_dry), dry_base=self._make_float_tensor("dry_base", disabled=self.no_dry), - dry_allowed_length=self._make_int_tensor( - "dry_allowed_length", disabled=self.no_dry - ), + dry_allowed_length=self._make_int_tensor("dry_allowed_length", disabled=self.no_dry), dry_sequence_breaker_ids=self._make_dry_sequence_breaker_ids(), dry_ranges=self._make_int_tensor( "dry_ranges", disabled=self.no_dry, source_attr="dry_range", ), - dry_max_ngram=self._make_int_tensor( - "dry_max_ngram", disabled=self.no_dry - ), - dry_max_occurrences=self._make_int_tensor( - "dry_max_occurrences", disabled=self.no_dry - ), - dry_early_exit_match_len=self._make_int_tensor( - "dry_early_exit_match_len", 
disabled=self.no_dry - ), - no_repeat_ngram_size=self._make_int_tensor( - "no_repeat_ngram_size", disabled=self.no_no_repeat_ngram - ), + dry_max_ngram=self._make_int_tensor("dry_max_ngram", disabled=self.no_dry), + dry_max_occurrences=self._make_int_tensor("dry_max_occurrences", disabled=self.no_dry), + dry_early_exit_match_len=self._make_int_tensor("dry_early_exit_match_len", disabled=self.no_dry), + no_repeat_ngram_size=self._make_int_tensor("no_repeat_ngram_size", disabled=self.no_no_repeat_ngram), tfs=self._make_float_tensor("tfs", disabled=self.no_tfs), - eta_cutoff=self._make_float_tensor( - "eta_cutoff", disabled=self.no_eta_cutoff - ), - epsilon_cutoff=self._make_float_tensor( - "epsilon_cutoff", disabled=self.no_epsilon_cutoff - ), - typical_p=self._make_float_tensor( - "typical_p", disabled=self.no_typical_p - ), + eta_cutoff=self._make_float_tensor("eta_cutoff", disabled=self.no_eta_cutoff), + epsilon_cutoff=self._make_float_tensor("epsilon_cutoff", disabled=self.no_epsilon_cutoff), + typical_p=self._make_float_tensor("typical_p", disabled=self.no_typical_p), quadratic_smoothing_factor=self._make_float_tensor( "quadratic_smoothing_factor", disabled=self.no_quadratic, @@ -527,26 +447,16 @@ def make_sampling_metadata(self) -> SamplingMetadata: disabled=self.no_quadratic, source_attr="smoothing_curve", ), - xtc_threshold=self._make_float_tensor( - "xtc_threshold", disabled=self.no_xtc - ), - xtc_probability=self._make_float_tensor( - "xtc_probability", disabled=self.no_xtc - ), + xtc_threshold=self._make_float_tensor("xtc_threshold", disabled=self.no_xtc), + xtc_probability=self._make_float_tensor("xtc_probability", disabled=self.no_xtc), top_nsigma=self._make_float_tensor( "top_nsigma", disabled=self.no_top_nsigma, source_attr="nsigma", ), - mirostat_mode=self._make_int_tensor( - "mirostat_mode", disabled=self.no_mirostat - ), - mirostat_tau=self._make_float_tensor( - "mirostat_tau", disabled=self.no_mirostat - ), - mirostat_eta=self._make_float_tensor( - 
"mirostat_eta", disabled=self.no_mirostat - ), + mirostat_mode=self._make_int_tensor("mirostat_mode", disabled=self.no_mirostat), + mirostat_tau=self._make_float_tensor("mirostat_tau", disabled=self.no_mirostat), + mirostat_eta=self._make_float_tensor("mirostat_eta", disabled=self.no_mirostat), skew=self._make_float_tensor("skew", disabled=self.no_skew), generators=self.generators, max_num_logprobs=self._make_max_num_logprobs(), @@ -561,13 +471,8 @@ def make_sampling_metadata(self) -> SamplingMetadata: logit_bias=self._make_logit_bias(), logitsprocs=self.logitsprocs, logprob_token_ids=self._make_logprob_token_ids(), - temperature_last=[ - sampling_params.temperature_last - for sampling_params in self.sampling_params_list - ], - persistent_data={ - index: {} for index in range(len(self.sampling_params_list)) - }, + temperature_last=[sampling_params.temperature_last for sampling_params in self.sampling_params_list], + persistent_data={index: {} for index in range(len(self.sampling_params_list))}, spec_token_ids=None, ) @@ -593,16 +498,12 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array: if not batch.no_top_k: top_ks = [ - sampling_params.top_k - if 0 < sampling_params.top_k < batch.vocab_size - else batch.vocab_size + sampling_params.top_k if 0 < sampling_params.top_k < batch.vocab_size else batch.vocab_size for sampling_params in batch.sampling_params_list ] max_top_k = max(top_ks) if max_top_k < batch.vocab_size: - topk_indices = mx.argpartition(-logits, max_top_k - 1, axis=-1)[ - :, :max_top_k - ] + topk_indices = mx.argpartition(-logits, max_top_k - 1, axis=-1)[:, :max_top_k] logits = mx.take_along_axis(logits, topk_indices, axis=-1) if len(set(top_ks)) != 1: positions = mx.arange(max_top_k)[None, :] @@ -612,15 +513,10 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array: if not batch.no_top_p: sorted_positions = mx.argsort(-logits, axis=-1) sorted_logits = mx.take_along_axis(logits, sorted_positions, axis=-1) 
- sorted_indices = mx.take_along_axis( - topk_indices, sorted_positions, axis=-1 - ) + sorted_indices = mx.take_along_axis(topk_indices, sorted_positions, axis=-1) sorted_probs = mx.softmax(sorted_logits, axis=-1) top_ps = mx.array( - [ - sampling_params.top_p - for sampling_params in batch.sampling_params_list - ], + [sampling_params.top_p for sampling_params in batch.sampling_params_list], dtype=mx.float32, )[:, None] # Keep the first token that crosses top-p, matching nucleus @@ -629,14 +525,10 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array: remove = (mx.cumsum(sorted_probs, axis=-1) - sorted_probs) > top_ps sorted_logits = mx.where(remove, -float("inf"), sorted_logits) sampled_positions = mx.random.categorical(sorted_logits, axis=-1) - return mx.take_along_axis( - sorted_indices, sampled_positions[:, None], axis=-1 - )[:, 0] + return mx.take_along_axis(sorted_indices, sampled_positions[:, None], axis=-1)[:, 0] sampled_positions = mx.random.categorical(logits, axis=-1) - return mx.take_along_axis( - topk_indices, sampled_positions[:, None], axis=-1 - )[:, 0] + return mx.take_along_axis(topk_indices, sampled_positions[:, None], axis=-1)[:, 0] topk_values = mx.topk(logits, max_top_k, axis=-1) topk_thresholds = mx.min(topk_values, axis=-1, keepdims=True) @@ -655,9 +547,7 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array: remove = (mx.cumsum(sorted_probs, axis=-1) - sorted_probs) > top_ps sorted_logits = mx.where(remove, -float("inf"), sorted_logits) sampled_positions = mx.random.categorical(sorted_logits, axis=-1) - return mx.take_along_axis( - sorted_indices, sampled_positions[:, None], axis=-1 - )[:, 0] + return mx.take_along_axis(sorted_indices, sampled_positions[:, None], axis=-1)[:, 0] return mx.random.categorical(logits, axis=-1) @@ -671,7 +561,7 @@ def sample_from_logits( """Sample tokens from pre-sliced 2D logits ``(batch_size, vocab)``. Single entry point for all sampling paths. 
Chooses native MLX greedy - when possible, otherwise bridges to the vLLM torch sampler. + when possible, otherwise bridges to the Aphrodite torch sampler. """ if batch.can_use_native_greedy_for_batch(): tokens = _mlx_greedy_sample(logits_2d) @@ -713,7 +603,7 @@ def sample_decode_tokens( logits: Full logits array, shape ``(1, total_tokens, vocab)``. decode_reqs: ``(req_id, RequestState)`` pairs for decode requests. num_decode: Number of decode requests (prefix of the token dimension). - sampler: vLLM Sampler instance. + sampler: Aphrodite Sampler instance. device: PyTorch device for the torch bridge path. vocab_size: Model vocabulary size. logitsprocs: Optional logits processors. @@ -727,17 +617,9 @@ def sample_decode_tokens( decode_logits = logits[0, :num_decode, :] # (num_decode, vocab) sampling_params_list = [state.sampling_params for _, state in decode_reqs] - prompt_token_ids_list = [ - state.token_ids[: state.prompt_len] for _, state in decode_reqs - ] - output_tokens_list = [ - state.token_ids[state.prompt_len :] for _, state in decode_reqs - ] - generators = { - i: state.generator - for i, (_, state) in enumerate(decode_reqs) - if state.generator is not None - } + prompt_token_ids_list = [state.token_ids[: state.prompt_len] for _, state in decode_reqs] + output_tokens_list = [state.token_ids[state.prompt_len :] for _, state in decode_reqs] + generators = {i: state.generator for i, (_, state) in enumerate(decode_reqs) if state.generator is not None} batch = SamplingBatch( sampling_params_list, @@ -769,7 +651,7 @@ def sample_prefill_tokens( prefill_reqs: List of ``PrefillRequest`` objects. cu_seqlens: Cumulative sequence lengths for logit position lookup. num_decode: Number of decode requests (offset into cu_seqlens). - sampler: vLLM Sampler instance. + sampler: Aphrodite Sampler instance. device: PyTorch device for the torch bridge path. vocab_size: Model vocabulary size. logitsprocs: Optional logits processors. 
@@ -790,11 +672,7 @@ def sample_prefill_tokens( else: prompt_len = len(pr.token_ids) - prompt_for_meta = ( - pr.full_prompt_token_ids - if pr.full_prompt_token_ids is not None - else pr.token_ids - ) + prompt_for_meta = pr.full_prompt_token_ids if pr.full_prompt_token_ids is not None else pr.token_ids generators = {} if pr.generator is None else {0: pr.generator} batch = SamplingBatch( @@ -839,9 +717,7 @@ def _merge_single_row_logprobs( continue pad = width - row.logprobs.shape[1] if pad: - token_rows.append( - torch.nn.functional.pad(row.logprob_token_ids[:1], (0, pad)) - ) + token_rows.append(torch.nn.functional.pad(row.logprob_token_ids[:1], (0, pad))) logprob_rows.append(torch.nn.functional.pad(row.logprobs[:1], (0, pad))) else: token_rows.append(row.logprob_token_ids[:1]) diff --git a/aphrodite/platforms/cpu.py b/aphrodite/platforms/cpu.py index 0ae45499a0..74b3af119a 100644 --- a/aphrodite/platforms/cpu.py +++ b/aphrodite/platforms/cpu.py @@ -18,7 +18,12 @@ from aphrodite.utils.mem_constants import GiB_bytes from aphrodite.v1.attention.backends.registry import AttentionBackendEnum -from .interface import CpuArchEnum, Platform, PlatformEnum +from .interface import ( + CpuArchEnum, + Platform, + PlatformEnum, + log_extension_import_failure, +) logger = init_logger(__name__) @@ -371,24 +376,32 @@ def import_kernels(cls) -> None: try: import aphrodite._C # noqa: F401 except ImportError as e: - logger.warning("Failed to import from aphrodite._C: %r", e) + log_extension_import_failure( + "aphrodite._C", e, target_logger=logger + ) else: try: import aphrodite._C_AVX512 # noqa: F401 except ImportError as e: if ignored_msg not in e.msg: - logger.warning("Failed to import from aphrodite._C_AVX512: %r", e) + log_extension_import_failure( + "aphrodite._C_AVX512", e, target_logger=logger + ) else: try: import aphrodite._C_AVX2 # noqa: F401 except ImportError as e: if ignored_msg not in e.msg: - logger.warning("Failed to import from aphrodite._C_AVX2: %r", e) + 
log_extension_import_failure( + "aphrodite._C_AVX2", e, target_logger=logger + ) else: try: import aphrodite._C # noqa: F401 except ImportError as e: - logger.warning("Failed to import from aphrodite._C: %r", e) + log_extension_import_failure( + "aphrodite._C", e, target_logger=logger + ) @classmethod def pack_kv_cache( diff --git a/aphrodite/platforms/interface.py b/aphrodite/platforms/interface.py index b6f673cb2b..8af685390c 100644 --- a/aphrodite/platforms/interface.py +++ b/aphrodite/platforms/interface.py @@ -29,6 +29,28 @@ logger = init_logger(__name__) +_EXTENSION_IMPORT_FAILURES_WARNED: set[str] = set() + + +def log_extension_import_failure( + module_name: str, + exc: ImportError, + *, + target_logger: Any = logger, +) -> None: + """Log extension import failures once at warning level per process. + + Several platform probes can legitimately try the same optional extension + during startup. Keep the first failure visible, but demote repeat failures + to debug so missing optional extensions do not flood server logs. 
+ """ + message = "Failed to import from %s: %r" + if module_name in _EXTENSION_IMPORT_FAILURES_WARNED: + target_logger.debug(message, module_name, exc) + return + _EXTENSION_IMPORT_FAILURES_WARNED.add(module_name) + target_logger.warning(message, module_name, exc) + def in_wsl() -> bool: # Reference: https://github.com/microsoft/WSL/issues/4071 @@ -241,7 +263,7 @@ def import_kernels(cls) -> None: try: import aphrodite._C # noqa: F401 except ImportError as e: - logger.warning("Failed to import from aphrodite._C: %r", e) + log_extension_import_failure("aphrodite._C", e) with contextlib.suppress(ImportError): import aphrodite._moe_C # noqa: F401 diff --git a/aphrodite/platforms/rocm.py b/aphrodite/platforms/rocm.py index 1b6eb43dcb..abd07f16d2 100644 --- a/aphrodite/platforms/rocm.py +++ b/aphrodite/platforms/rocm.py @@ -15,7 +15,12 @@ from aphrodite.logger import init_logger from aphrodite.v1.attention.backends.registry import AttentionBackendEnum -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import ( + DeviceCapability, + Platform, + PlatformEnum, + log_extension_import_failure, +) if TYPE_CHECKING: from aphrodite.config import AphroditeConfig @@ -36,18 +41,18 @@ amdsmi_topo_get_numa_node_number, ) except ImportError as e: - logger.warning("Failed to import from amdsmi with %r", e) + logger.warning_once("Failed to import from amdsmi with %r", e) try: import aphrodite._C # noqa: F401 except ImportError as e: - logger.warning("Failed to import from aphrodite._C with %r", e) + log_extension_import_failure("aphrodite._C", e, target_logger=logger) # import custom ops, trigger op registration try: import aphrodite._rocm_C # noqa: F401 except ImportError as e: - logger.warning("Failed to import from aphrodite._rocm_C with %r", e) + log_extension_import_failure("aphrodite._rocm_C", e, target_logger=logger) # Models not supported by ROCm. _ROCM_UNSUPPORTED_MODELS: list[str] = []