dphnAI · AlpinDale · May 6, 2026 · May 6, 2026
diff --git a/aphrodite/metal/compat.py b/aphrodite/metal/compat.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compatibility patches for vLLM + transformers version mismatches.
+"""Compatibility patches for Aphrodite + transformers version mismatches.
 
 Applied once at platform registration time. Optional missing dependencies are
 logged; unexpected runtime errors are allowed to surface so regressions remain
@@ -146,8 +146,7 @@ def _stack_qwen36_moe_per_expert_weights(
     and ``...mlp.experts.down_proj``, both stacked along axis 0 over experts.
 
     Mirrors the (scan -> validate -> walk) structure of upstream
-    ml-explore/mlx-lm#1224. Removable once vllm-metal's mlx-lm pin bumps
-    past that merge.
+    ml-explore/mlx-lm#1224.
 
     No-op when no per-expert keys are present (dense Qwen3.5/3.6 or already-
     stacked MoE checkpoints).

diff --git a/aphrodite/metal/config.py b/aphrodite/metal/config.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Configuration for vLLM Metal plugin via environment variables."""
+"""Configuration for the Aphrodite Metal plugin via environment variables."""
 
 import os
 from dataclasses import dataclass
@@ -37,7 +37,7 @@
 
 @dataclass
 class MetalConfig:
-    """Configuration for vLLM Metal plugin."""
+    """Configuration for the Aphrodite Metal plugin."""
 
     memory_fraction: float  # -1.0 means "auto" (calculate minimal needed)
     use_mlx: bool
@@ -66,8 +66,8 @@
                "APHRODITE_METAL_KV_SHARING_FAST_PREFILL=0."
            )

        if self.use_paged_attention and not self.is_auto_memory:
            if not (0 < self.memory_fraction <= 1):
                raise ValueError(
                    f"Invalid APHRODITE_METAL_MEMORY_FRACTION={self.memory_fraction}. "
                    "Must be a finite value in (0, 1] when paged attention is enabled."

diff --git a/aphrodite/metal/envs.py b/aphrodite/metal/envs.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Environment variable definitions for the vLLM Metal plugin.
+"""Environment variable definitions for the Aphrodite Metal plugin.
 
 This module is the single source of truth for all ``APHRODITE_METAL_*`` (and
 ``APHRODITE_MLX_*``) environment variables.  It mirrors the lazy-evaluation
-pattern used by ``vllm/envs.py``: each variable is read from
+pattern used by ``aphrodite/envs.py``: each variable is read from
 ``os.environ`` on access via ``__getattr__``, so values are never stale
 and ``monkeypatch.setenv`` works in tests without extra resets.
 
 During plugin registration (``aphrodite.metal._register``), the
 ``environment_variables`` dict is merged into
 ``aphrodite.envs.environment_variables`` so that ``validate_environ()``
-recognises our variables and does not emit spurious "Unknown vLLM
+recognises our variables and does not emit spurious "Unknown Aphrodite
 environment variable" warnings.
 """
 
@@ -84,5 +84,5 @@ def __getattr__(name: str) -> Any:
 
 
 def __dir__() -> list[str]:
-    # Mirrors vllm/envs.py; enables tab-completion and introspection.
+    # Mirrors aphrodite/envs.py; enables tab-completion and introspection.
     return list(environment_variables.keys())
diff --git a/aphrodite/metal/metal_backend.py b/aphrodite/metal/metal_backend.py
@@ -18,7 +18,7 @@ class MetalBackend(AttentionBackend):
     block_size, and the hybrid-block-size math via
     Platform._align_hybrid_block_size) can read Metal's MultipleOf(16)
     alignment constraint. The Metal paged-attention kernels are tuned for
-    block_size=16; advertising MultipleOf(16) makes vLLM's selector default
+    block_size=16; advertising MultipleOf(16) makes Aphrodite's selector default
     to 16 and lets hybrid models align to multiples of 16. It is never
     dispatched to as a real attention backend — the actual Metal paged
     attention lives in metal_kernel_backend/paged_attention.py. The

diff --git a/aphrodite/metal/metal_kernel_backend/attention_sdpa.py b/aphrodite/metal/metal_kernel_backend/attention_sdpa.py
@@ -110,7 +110,7 @@ def _build_block_tables(
     """Build kernel-compatible block tables, translating if necessary.
 
     When ``cache_block_size`` exceeds the kernel's compiled block sizes,
-    each vLLM block ``b`` is expanded into ``ratio`` kernel blocks
+    each Aphrodite block ``b`` is expanded into ``ratio`` kernel blocks
     ``[b*ratio, b*ratio+ratio)``.  The cache is reshaped later to
     match (zero-copy).
 
@@ -136,7 +136,7 @@ def _build_block_tables(
         return result
 
     # Hybrid path — translate large block_size to a kernel-compatible one.
-    # Vectorized: each vLLM block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
+    # Vectorized: each Aphrodite block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
     kernel_bs = _pick_kernel_block_size(cache_block_size)
     ratio = cache_block_size // kernel_bs
 
@@ -447,10 +447,10 @@ def sdpa_forward(
     max_seq_len = ctx.max_context_len or max(ctx.context_lens)
 
     # --- Block tables (with hybrid block-size translation) ---
-    # vLLM may inflate block_size (e.g. 544) to align attention pages with
+    # Aphrodite may inflate block_size (e.g. 544) to align attention pages with
     # mamba pages in hybrid models.  The Metal kernel only supports small
     # block sizes (8, 16, 32).  _build_block_tables handles the translation:
-    # it expands each vLLM block into multiple kernel blocks and returns the
+    # it expands each Aphrodite block into multiple kernel blocks and returns the
     # kernel-compatible block_size.  The cache is reshaped to match (zero-copy).
     block_tables, kernel_block_size = _build_block_tables(ctx, kv_cache.block_size)
 

diff --git a/aphrodite/metal/multimodal/__init__.py b/aphrodite/metal/multimodal/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Generic multimodal helpers for vLLM Metal."""
+"""Generic multimodal helpers for Aphrodite Metal."""
 
 from __future__ import annotations
 

diff --git a/aphrodite/metal/multimodal/embeddings.py b/aphrodite/metal/multimodal/embeddings.py
@@ -16,7 +16,7 @@ def merge_multimodal_embeddings(
 ) -> mx.array:
     """Splice multimodal embeddings into placeholder positions.
 
-    Mirrors ``vllm/model_executor/models/utils.py``
+    Mirrors ``aphrodite/model_executor/models/utils.py``
     ``_merge_multimodal_embeddings`` for MLX arrays.  Returns a new array;
     ``inputs_embeds`` is not mutated.
     """

diff --git a/aphrodite/metal/multimodal/qwen3_vl/adapter.py b/aphrodite/metal/multimodal/qwen3_vl/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Qwen3-VL multimodal adapter for vLLM Metal."""
+"""Qwen3-VL multimodal adapter for Aphrodite Metal."""
 
 from __future__ import annotations
 
@@ -30,10 +30,10 @@ def get_mrope_input_positions(
     ) -> tuple[mx.array, int]:
         """Return ``((3, seq_len) int32 positions, mrope_position_delta)``.
 
-        Calls upstream vLLM's mm_features-driven Qwen3-VL M-RoPE helper with a
+        Calls upstream Aphrodite's mm_features-driven Qwen3-VL M-RoPE helper with a
         minimal image-only config shim, then converts the returned torch tensor
         to an MLX array.  This keeps the position-building policy upstream-owned
-        while the vllm-metal runner can consume MLX arrays.
+        while the Aphrodite Metal runner can consume MLX arrays.
         """
         if not input_tokens:
             return mx.zeros((3, 0), dtype=mx.int32), 0

diff --git a/aphrodite/metal/platform.py b/aphrodite/metal/platform.py
@@ -220,24 +220,21 @@
 
     @classmethod
     def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
-        """Check and update vLLM configuration for Metal compatibility.
+        """Check and update Aphrodite configuration for Metal compatibility.
 
         Args:
-            aphrodite_config: vLLM configuration object
+            aphrodite_config: Aphrodite configuration object
         """
         config = get_config()
         parallel_config = aphrodite_config.parallel_config
         model_config = aphrodite_config.model_config
         compilation_config = aphrodite_config.compilation_config
 
         # Metal execution is MLX-backed. Torch Inductor/CUDAGraph settings do
-        # not apply to the actual model path, so normalize them here rather
-        # than requiring users to pass --enforce-eager.
+        # not apply to the actual model path, so disable those compilation
+        # surfaces without overriding the user's eager-mode flag here.
         from aphrodite.config.compilation import CompilationMode, CUDAGraphMode
 
-        if model_config is not None and not model_config.enforce_eager:
-            logger.info("Metal: forcing eager mode; torch.compile/CUDAGraphs are not used on MLX.")
-            model_config.enforce_eager = True
         compilation_config.mode = CompilationMode.NONE
         compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         compilation_config.max_cudagraph_capture_size = 0
@@ -253,8 +250,8 @@
            config.v_quant = add.get("v_quant", "q3_0")
            config._validate_turboquant()
            logger.info(
                f"TurboQuant enabled via --additional-config: "
                f"k_quant={config.k_quant}, v_quant={config.v_quant}"
            )

        scheduler_config = aphrodite_config.scheduler_config
@@ -272,7 +269,7 @@
            )

        if config.debug:
            logger.info(f"Metal config: {config}")

        # Set worker class for Metal
        if parallel_config.worker_cls == "auto":
@@ -356,8 +353,8 @@
        total_mem = cls.get_device_total_memory()
        available_mem = cls.get_device_available_memory()
        logger.info(
            f"Metal memory: {total_mem / 1e9:.1f}GB total, "
            f"{available_mem / 1e9:.1f}GB available"
        )

    @classmethod
@@ -385,7 +382,7 @@
     def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> None:
         """Update block_size for Metal platform.
 
-        Delegates to vLLM's base implementation, which reads the Metal kernel
+        Delegates to Aphrodite's base implementation, which reads the Metal kernel
         alignment (MultipleOf(16)) from our :meth:`_find_non_ssm_backend`
         override. Adds a one-time warning when paged attention is enabled for
         a hybrid model, explaining the cache-block-size translation mechanism
@@ -403,16 +400,16 @@
         # block-size translation mechanism.
         #
         # Background:
-        # - vLLM requires block_size=160 (or larger) for hybrid models to satisfy
+        # - Aphrodite requires block_size=160 (or larger) for hybrid models to satisfy
         #   page size divisibility validation between SDPA and Mamba layers.
         #
-        # Solution (PR #235):
-        # - vLLM sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
+        # Solution:
+        # - Aphrodite sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
         #   validation.
         # - The Metal kernel uses a translated block_size (16, the kernel sweet
         #   spot) that it supports.
-        # - Each vLLM block is split into ratio = cache_block_size / kernel_block_size
-        #   kernel blocks. For example, one vLLM block of 144 tokens becomes 9 kernel
+        # - Each Aphrodite block is split into ratio = cache_block_size / kernel_block_size
+        #   kernel blocks. For example, one Aphrodite block of 144 tokens becomes 9 kernel
         #   blocks of 16 tokens each.
         # - The KV cache is reshaped (zero-copy) to match: [num_blocks, 144, ...] →
         #   [num_blocks*9, 16, ...]. The physical memory layout is unchanged.
@@ -423,17 +420,17 @@
         if model_config.is_hybrid and metal_config.use_paged_attention:
             logger.warning(
                 "Hybrid model (e.g., Qwen3.5) with paged attention enabled. "
-                "Using block-size translation (PR #235) to convert vLLM's large "
+                "Using block-size translation (PR #235) to convert Aphrodite's large "
                 "block_size to a Metal kernel-compatible size.\n"
-                "  Mechanism: Each vLLM block is split into multiple kernel blocks.\n"
-                "  Example: vLLM block_size=144 → kernel block_size=16 (ratio=9).\n"
+                "  Mechanism: Each Aphrodite block is split into multiple kernel blocks.\n"
+                "  Example: Aphrodite block_size=144 → kernel block_size=16 (ratio=9).\n"
                 "  The KV cache is reshaped (zero-copy) and block tables are expanded.\n"
                 "  This is a logical transformation — physical memory is unchanged."
             )
 
         # Delegate the rest to upstream. With our ``_find_non_ssm_backend``
         # returning :class:`MetalBackend` (which advertises ``MultipleOf(16)``),
-        # vLLM's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
+        # Aphrodite's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
         # models (matching the kernel sweet spot), and Phase 2
         # (``_align_hybrid_block_size``) handles hybrid alignment. The kernel
         # layer (``_pick_kernel_block_size``) validates the final
@@ -451,12 +448,12 @@
        from aphrodite.v1.attention.backends.registry import AttentionBackendEnum

         if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
             logger.info(f"Cannot use {selected_backend} backend on Metal/MLX.")
         if attn_selector_config.use_mla:
-            # MLA attention is handled by the vllm-metal model runner (MLAPagedAttentionWrapper),
-            # not by vLLM's attention backend selector. Continue to return CPU_ATTN below.
+            # MLA attention is handled by the Aphrodite Metal model runner (MLAPagedAttentionWrapper),
+            # not by Aphrodite's attention backend selector. Continue to return CPU_ATTN below.
             logger.info(
-                "MLA model detected; attention handled by vllm-metal model runner"
+                "MLA model detected; attention handled by aphrodite metal model runner"
             )
         if attn_selector_config.use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on Metal/MLX.")

diff --git a/aphrodite/metal/profiler/__init__.py b/aphrodite/metal/profiler/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal frame-capture profiler for vLLM Metal."""
+"""Metal frame-capture profiler for Aphrodite Metal."""
 
 from aphrodite.metal.profiler.wrapper import MetalProfilerWrapper
 

diff --git a/aphrodite/metal/profiler/wrapper.py b/aphrodite/metal/profiler/wrapper.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal frame-capture wrapper for vLLM's WorkerProfiler abstraction.
+"""Metal frame-capture wrapper for Aphrodite's WorkerProfiler abstraction.
 
 Subclasses ``aphrodite.profiler.wrapper.WorkerProfiler`` so that the manual
 start/stop surface — ``LLM.start_profile`` / ``LLM.stop_profile``, the
@@ -35,7 +35,7 @@
 
 
 class MetalProfilerWrapper(WorkerProfiler):
-    """Metal frame-capture flavor of vLLM's WorkerProfiler.
+    """Metal frame-capture flavor of Aphrodite's WorkerProfiler.
 
     Trace output: ``<profiler_config.torch_profiler_dir>/<trace_name>.gputrace``
     """

diff --git a/aphrodite/metal/pytorch_backend/tensor_bridge.py b/aphrodite/metal/pytorch_backend/tensor_bridge.py
@@ -15,7 +15,6 @@
 # MPS has a 4GB (2^32 bytes) limit for MPSTemporaryNDArray allocations.
 # Metal may allocate multiple temporary buffers internally, so we use a
 # conservative threshold of 1GB to avoid hitting the limit.
-# See: https://github.com/anthropics/vllm-metal/issues/43
 _MPS_SAFE_SIZE_BYTES = 1 << 30  # 1GB
 
 # MLX to PyTorch dtype mapping
@@ -150,7 +149,6 @@ def mlx_to_torch(
             tensor = tensor.to(device)
         else:
             # Large tensor - keep on CPU to avoid MPS 4GB limit crash
-            # See: https://github.com/anthropics/vllm-metal/issues/43
             logger.debug(
                 "Tensor too large for MPS (%d bytes > %d limit), keeping on CPU",
                 _get_tensor_size_bytes(array),

diff --git a/aphrodite/metal/stt/__init__.py b/aphrodite/metal/stt/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Speech-to-Text support for vLLM Metal."""
+"""Speech-to-Text support for Aphrodite Metal."""
 
 from aphrodite.metal.stt.loader import load_model
 from aphrodite.metal.stt.protocol import TranscriptionResult, TranscriptionSegment

diff --git a/aphrodite/metal/stt/policy.py b/aphrodite/metal/stt/policy.py
@@ -5,15 +5,15 @@
 
 from typing import Protocol
 
-# Nominal memory reported to vLLM scheduler for STT models.
+# Nominal memory reported to the Aphrodite scheduler for STT models.
 # No KV cache is actually allocated; this just passes minimum-memory checks.
 STT_SCHED_AVAILABLE_BYTES = 1 << 30  # 1 GiB
 
-# Block size reported to vLLM for STT models (minimal, no real KV cache).
+# Block size reported to Aphrodite for STT models (minimal, no real KV cache).
 STT_SCHED_BLOCK_BYTES = 1
 
 # Nominal head size for the placeholder KV spec used only to satisfy
-# vLLM scheduler initialization for STT models.
+# Aphrodite scheduler initialization for STT models.
 STT_SCHED_NOMINAL_HEAD_SIZE = 64
 
 

diff --git a/aphrodite/metal/stt/qwen3_asr/adapter.py b/aphrodite/metal/stt/qwen3_asr/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Qwen3-ASR runtime adapter for vLLM STT execution."""
+"""Qwen3-ASR runtime adapter for Aphrodite STT execution."""
 
 from __future__ import annotations
 

diff --git a/aphrodite/metal/stt/qwen3_asr/config.py b/aphrodite/metal/stt/qwen3_asr/config.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Qwen3-ASR configuration (MLX-free).
 
-Keep this module free of MLX imports so vLLM compat code can import config and
+Keep this module free of MLX imports so Aphrodite compat code can import config and
 shape helpers during planning/registration without pulling in the model stack.
 """
 
@@ -10,7 +10,7 @@
 from dataclasses import dataclass, field
 
 from aphrodite.transformers_utils.configs.qwen3_asr import (
-    Qwen3ASRConfig as VllmQwen3ASRConfig,
+    Qwen3ASRConfig as AphroditeQwen3ASRConfig,
 )
 
 # Maximum decode tokens for Qwen3-ASR decode loop.
@@ -82,8 +82,8 @@ class Qwen3ASRConfig:
     n_audio_ctx: int = 1500
 
     @classmethod
-    def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
-        """Adapt the upstream vLLM/HF config into the local MLX model config."""
+    def _from_aphrodite_config(cls, config: AphroditeQwen3ASRConfig) -> Qwen3ASRConfig:
+        """Adapt the upstream Aphrodite/HF config into the local MLX model config."""
         thinker = config.thinker_config
         audio = thinker.audio_config
         text = thinker.text_config
@@ -131,4 +131,4 @@ def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
     @classmethod
     def from_dict(cls, d: dict) -> Qwen3ASRConfig:
         """Create config from config.json using the upstream schema owner."""
-        return cls._from_aphrodite_config(VllmQwen3ASRConfig.from_dict(d))
+        return cls._from_aphrodite_config(AphroditeQwen3ASRConfig.from_dict(d))
diff --git a/aphrodite/metal/stt/qwen3_asr/model.py b/aphrodite/metal/stt/qwen3_asr/model.py
@@ -558,7 +558,7 @@ def decode_step(
         return self.language_model.forward_embeds(embeds, cache)
 
     def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
-        """Create the model-owned runtime adapter used by the vLLM runner."""
+        """Create the model-owned runtime adapter used by the Aphrodite runner."""
         # Local import: avoid import-time cycles (adapter imports transcriber).
         from .adapter import Qwen3ASRRuntimeAdapter
 

diff --git a/aphrodite/metal/stt/runtime.py b/aphrodite/metal/stt/runtime.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
-"""STT runtime adapter contract used by the vLLM runner.
+"""STT runtime adapter contract used by the Aphrodite runner.
 
-The vLLM runner delegates STT execution to model-owned runtime adapters under
+The Aphrodite runner delegates STT execution to model-owned runtime adapters under
 `stt/<model>/adapter.py` so shared code does not accumulate per-model branches.
 """
 
@@ -26,7 +26,7 @@
 
 
 class STTRuntimeAdapter(ABC):
-    """Model-owned bridge between vLLM STT inputs and per-model STT execution.
+    """Model-owned bridge between Aphrodite STT inputs and per-model STT execution.
 
     Concrete implementations live under `stt/<model>/adapter.py` and own:
     - input_features normalization to the model's expected encoder input shape

diff --git a/aphrodite/metal/stt/whisper/adapter.py b/aphrodite/metal/stt/whisper/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Whisper runtime adapter for vLLM STT execution."""
+"""Whisper runtime adapter for Aphrodite STT execution."""
 
 from __future__ import annotations
 

diff --git a/aphrodite/metal/stt/whisper/model.py b/aphrodite/metal/stt/whisper/model.py
@@ -273,7 +273,7 @@
         self._alignment_heads = mx.array(np.asarray(all_heads.nonzero()).T)
 
     def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
-        """Create the model-owned runtime adapter used by the vLLM runner."""
+        """Create the model-owned runtime adapter used by the Aphrodite runner."""
         # Local import: avoid import-time cycles (adapter imports transcriber).
         from .adapter import WhisperRuntimeAdapter
 
@@ -379,8 +379,8 @@
                    continue

                # Transpose Conv1d weights: HF (out, in, kernel) -> MLX (out, kernel, in)
                if "conv1.weight" in k or "conv2.weight" in k:
                    if v.ndim == 3:
                        v = v.transpose(0, 2, 1)

            if v.dtype != self.dtype and v.dtype != mx.uint32:

diff --git a/aphrodite/metal/utils.py b/aphrodite/metal/utils.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal utility functions for vLLM Metal plugin."""
+"""Metal utility functions for the Aphrodite Metal plugin."""
 
 import logging
 import os
@@ -24,7 +24,7 @@
     Example:
 
     ```bash
-    APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache vllm serve Qwen/Qwen2.5-0.5B
+    APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache aphrodite run Qwen/Qwen2.5-0.5B
     ```
     """
     if Path(model_repo_name).exists():
@@ -38,9 +38,9 @@

            model_cache_dir = envs.APHRODITE_METAL_MODELSCOPE_CACHE

            logger.info(f"Downloading model {model_repo_name} from ModelScope...")
            model_path = snapshot_download(model_repo_name, cache_dir=model_cache_dir)
            logger.info(f"Model downloaded to {model_path}")
            return str(model_path)
        except ImportError:
            logger.warning(