diff --git a/aphrodite/metal/compat.py b/aphrodite/metal/compat.py
index 4dda3f30d3..5ae4589a24 100644
--- a/aphrodite/metal/compat.py
+++ b/aphrodite/metal/compat.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compatibility patches for vLLM + transformers version mismatches.
+"""Compatibility patches for Aphrodite + transformers version mismatches.
 
 Applied once at platform registration time. Optional missing dependencies are
 logged; unexpected runtime errors are allowed to surface so regressions remain
@@ -146,8 +146,7 @@ def _stack_qwen36_moe_per_expert_weights(
     and ``...mlp.experts.down_proj``, both stacked along axis 0 over experts.
 
     Mirrors the (scan -> validate -> walk) structure of upstream
-    ml-explore/mlx-lm#1224. Removable once vllm-metal's mlx-lm pin bumps
-    past that merge.
+    ml-explore/mlx-lm#1224.
 
     No-op when no per-expert keys are present (dense Qwen3.5/3.6 or already-
     stacked MoE checkpoints).
diff --git a/aphrodite/metal/config.py b/aphrodite/metal/config.py
index ac84ad9088..2cc3396e30 100644
--- a/aphrodite/metal/config.py
+++ b/aphrodite/metal/config.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Configuration for vLLM Metal plugin via environment variables."""
+"""Configuration for Aphrodite Metal plugin via environment variables."""
 
 import os
 from dataclasses import dataclass
@@ -37,7 +37,7 @@
 
 @dataclass
 class MetalConfig:
-    """Configuration for vLLM Metal plugin."""
+    """Configuration for Aphrodite Metal plugin."""
 
     memory_fraction: float  # -1.0 means "auto" (calculate minimal needed)
     use_mlx: bool
diff --git a/aphrodite/metal/envs.py b/aphrodite/metal/envs.py
index 2c3462834e..ca09050a70 100644
--- a/aphrodite/metal/envs.py
+++ b/aphrodite/metal/envs.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Environment variable definitions for the vLLM Metal plugin.
+"""Environment variable definitions for the Aphrodite Metal plugin.
 
 This module is the single source of truth for all ``APHRODITE_METAL_*`` (and
 ``APHRODITE_MLX_*``) environment variables.  It mirrors the lazy-evaluation
-pattern used by ``vllm/envs.py``: each variable is read from
+pattern used by ``aphrodite/envs.py``: each variable is read from
 ``os.environ`` on access via ``__getattr__``, so values are never stale
 and ``monkeypatch.setenv`` works in tests without extra resets.
 
 During plugin registration (``aphrodite.metal._register``), the
 ``environment_variables`` dict is merged into
 ``aphrodite.envs.environment_variables`` so that ``validate_environ()``
-recognises our variables and does not emit spurious "Unknown vLLM
+recognises our variables and does not emit spurious "Unknown Aphrodite
 environment variable" warnings.
 """
 
@@ -84,5 +84,5 @@ def __getattr__(name: str) -> Any:
 
 
 def __dir__() -> list[str]:
-    # Mirrors vllm/envs.py; enables tab-completion and introspection.
+    # Mirrors aphrodite/envs.py; enables tab-completion and introspection.
     return list(environment_variables.keys())
diff --git a/aphrodite/metal/metal_backend.py b/aphrodite/metal/metal_backend.py
index 2098c26b89..090cd0d282 100644
--- a/aphrodite/metal/metal_backend.py
+++ b/aphrodite/metal/metal_backend.py
@@ -18,7 +18,7 @@ class MetalBackend(AttentionBackend):
     block_size, and the hybrid-block-size math via
     Platform._align_hybrid_block_size) can read Metal's MultipleOf(16)
     alignment constraint. The Metal paged-attention kernels are tuned for
-    block_size=16; advertising MultipleOf(16) makes vLLM's selector default
+    block_size=16; advertising MultipleOf(16) makes Aphrodite's selector default
     to 16 and lets hybrid models align to multiples of 16. It is never
     dispatched to as a real attention backend — the actual Metal paged
     attention lives in metal_kernel_backend/paged_attention.py. The
diff --git a/aphrodite/metal/metal_kernel_backend/attention_sdpa.py b/aphrodite/metal/metal_kernel_backend/attention_sdpa.py
index a6bf34effa..e63c6f7686 100644
--- a/aphrodite/metal/metal_kernel_backend/attention_sdpa.py
+++ b/aphrodite/metal/metal_kernel_backend/attention_sdpa.py
@@ -110,7 +110,7 @@ def _build_block_tables(
     """Build kernel-compatible block tables, translating if necessary.
 
     When ``cache_block_size`` exceeds the kernel's compiled block sizes,
-    each vLLM block ``b`` is expanded into ``ratio`` kernel blocks
+    each Aphrodite block ``b`` is expanded into ``ratio`` kernel blocks
     ``[b*ratio, b*ratio+ratio)``.  The cache is reshaped later to
     match (zero-copy).
 
@@ -136,7 +136,7 @@ def _build_block_tables(
         return result
 
     # Hybrid path — translate large block_size to a kernel-compatible one.
-    # Vectorized: each vLLM block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
+    # Vectorized: each Aphrodite block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
     kernel_bs = _pick_kernel_block_size(cache_block_size)
     ratio = cache_block_size // kernel_bs
 
@@ -447,10 +447,10 @@ def sdpa_forward(
     max_seq_len = ctx.max_context_len or max(ctx.context_lens)
 
     # --- Block tables (with hybrid block-size translation) ---
-    # vLLM may inflate block_size (e.g. 544) to align attention pages with
+    # Aphrodite may inflate block_size (e.g. 544) to align attention pages with
     # mamba pages in hybrid models.  The Metal kernel only supports small
     # block sizes (8, 16, 32).  _build_block_tables handles the translation:
-    # it expands each vLLM block into multiple kernel blocks and returns the
+    # it expands each Aphrodite block into multiple kernel blocks and returns the
     # kernel-compatible block_size.  The cache is reshaped to match (zero-copy).
     block_tables, kernel_block_size = _build_block_tables(ctx, kv_cache.block_size)
 
diff --git a/aphrodite/metal/multimodal/__init__.py b/aphrodite/metal/multimodal/__init__.py
index a9f7e1583a..482d716b17 100644
--- a/aphrodite/metal/multimodal/__init__.py
+++ b/aphrodite/metal/multimodal/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Generic multimodal helpers for vLLM Metal."""
+"""Generic multimodal helpers for Aphrodite Metal."""
 
 from __future__ import annotations
 
diff --git a/aphrodite/metal/multimodal/embeddings.py b/aphrodite/metal/multimodal/embeddings.py
index 40e9aafde4..acdd8b2ff8 100644
--- a/aphrodite/metal/multimodal/embeddings.py
+++ b/aphrodite/metal/multimodal/embeddings.py
@@ -16,7 +16,7 @@ def merge_multimodal_embeddings(
 ) -> mx.array:
     """Splice multimodal embeddings into placeholder positions.
 
-    Mirrors ``vllm/model_executor/models/utils.py``
+    Mirrors ``aphrodite/model_executor/models/utils.py``
     ``_merge_multimodal_embeddings`` for MLX arrays.  Returns a new array;
     ``inputs_embeds`` is not mutated.
     """
diff --git a/aphrodite/metal/multimodal/qwen3_vl/adapter.py b/aphrodite/metal/multimodal/qwen3_vl/adapter.py
index d16c5bcbd5..41577e1c2c 100644
--- a/aphrodite/metal/multimodal/qwen3_vl/adapter.py
+++ b/aphrodite/metal/multimodal/qwen3_vl/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Qwen3-VL multimodal adapter for vLLM Metal."""
+"""Qwen3-VL multimodal adapter for Aphrodite Metal."""
 
 from __future__ import annotations
 
@@ -30,10 +30,10 @@ def get_mrope_input_positions(
     ) -> tuple[mx.array, int]:
         """Return ``((3, seq_len) int32 positions, mrope_position_delta)``.
 
-        Calls upstream vLLM's mm_features-driven Qwen3-VL M-RoPE helper with a
+        Calls upstream Aphrodite's mm_features-driven Qwen3-VL M-RoPE helper with a
         minimal image-only config shim, then converts the returned torch tensor
         to an MLX array.  This keeps the position-building policy upstream-owned
-        while the vllm-metal runner can consume MLX arrays.
+        while the Aphrodite Metal runner can consume MLX arrays.
         """
         if not input_tokens:
             return mx.zeros((3, 0), dtype=mx.int32), 0
diff --git a/aphrodite/metal/platform.py b/aphrodite/metal/platform.py
index 7e1ab6cfc5..567a956aa8 100644
--- a/aphrodite/metal/platform.py
+++ b/aphrodite/metal/platform.py
@@ -220,10 +220,10 @@ def get_torch_device(cls, device_id: int = 0) -> torch.device:
 
     @classmethod
     def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
-        """Check and update vLLM configuration for Metal compatibility.
+        """Check and update Aphrodite configuration for Metal compatibility.
 
         Args:
-            aphrodite_config: vLLM configuration object
+            aphrodite_config: Aphrodite configuration object
         """
         config = get_config()
         parallel_config = aphrodite_config.parallel_config
@@ -231,13 +231,10 @@ def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
         compilation_config = aphrodite_config.compilation_config
 
         # Metal execution is MLX-backed. Torch Inductor/CUDAGraph settings do
-        # not apply to the actual model path, so normalize them here rather
-        # than requiring users to pass --enforce-eager.
+        # not apply to the actual model path, so disable those compilation
+        # surfaces without overriding the user's eager-mode flag here.
         from aphrodite.config.compilation import CompilationMode, CUDAGraphMode
 
-        if model_config is not None and not model_config.enforce_eager:
-            logger.info("Metal: forcing eager mode; torch.compile/CUDAGraphs are not used on MLX.")
-            model_config.enforce_eager = True
         compilation_config.mode = CompilationMode.NONE
         compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         compilation_config.max_cudagraph_capture_size = 0
@@ -385,7 +382,7 @@ def _find_non_ssm_backend(
     def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> None:
         """Update block_size for Metal platform.
 
-        Delegates to vLLM's base implementation, which reads the Metal kernel
+        Delegates to Aphrodite's base implementation, which reads the Metal kernel
         alignment (MultipleOf(16)) from our :meth:`_find_non_ssm_backend`
         override. Adds a one-time warning when paged attention is enabled for
         a hybrid model, explaining the cache-block-size translation mechanism
@@ -403,16 +400,16 @@ def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> N
         # block-size translation mechanism.
         #
         # Background:
-        # - vLLM requires block_size=160 (or larger) for hybrid models to satisfy
+        # - Aphrodite requires block_size=160 (or larger) for hybrid models to satisfy
         #   page size divisibility validation between SDPA and Mamba layers.
         #
-        # Solution (PR #235):
-        # - vLLM sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
+        # Solution:
+        # - Aphrodite sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
         #   validation.
         # - The Metal kernel uses a translated block_size (16, the kernel sweet
         #   spot) that it supports.
-        # - Each vLLM block is split into ratio = cache_block_size / kernel_block_size
-        #   kernel blocks. For example, one vLLM block of 144 tokens becomes 9 kernel
+        # - Each Aphrodite block is split into ratio = cache_block_size / kernel_block_size
+        #   kernel blocks. For example, one Aphrodite block of 144 tokens becomes 9 kernel
         #   blocks of 16 tokens each.
         # - The KV cache is reshaped (zero-copy) to match: [num_blocks, 144, ...] →
         #   [num_blocks*9, 16, ...]. The physical memory layout is unchanged.
@@ -423,17 +420,17 @@ def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> N
         if model_config.is_hybrid and metal_config.use_paged_attention:
             logger.warning(
                 "Hybrid model (e.g., Qwen3.5) with paged attention enabled. "
-                "Using block-size translation (PR #235) to convert vLLM's large "
+                "Using block-size translation (PR #235) to convert Aphrodite's large "
                 "block_size to a Metal kernel-compatible size.\n"
-                "  Mechanism: Each vLLM block is split into multiple kernel blocks.\n"
-                "  Example: vLLM block_size=144 → kernel block_size=16 (ratio=9).\n"
+                "  Mechanism: Each Aphrodite block is split into multiple kernel blocks.\n"
+                "  Example: Aphrodite block_size=144 → kernel block_size=16 (ratio=9).\n"
                 "  The KV cache is reshaped (zero-copy) and block tables are expanded.\n"
                 "  This is a logical transformation — physical memory is unchanged."
             )
 
         # Delegate the rest to upstream. With our ``_find_non_ssm_backend``
         # returning :class:`MetalBackend` (which advertises ``MultipleOf(16)``),
-        # vLLM's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
+        # Aphrodite's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
         # models (matching the kernel sweet spot), and Phase 2
         # (``_align_hybrid_block_size``) handles hybrid alignment. The kernel
         # layer (``_pick_kernel_block_size``) validates the final
@@ -453,10 +450,10 @@ def get_attn_backend_cls(
         if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
             logger.info(f"Cannot use {selected_backend} backend on Metal/MLX.")
         if attn_selector_config.use_mla:
-            # MLA attention is handled by the vllm-metal model runner (MLAPagedAttentionWrapper),
-            # not by vLLM's attention backend selector. Continue to return CPU_ATTN below.
+            # MLA attention is handled by the Aphrodite Metal model runner (MLAPagedAttentionWrapper),
+            # not by Aphrodite's attention backend selector. Continue to return CPU_ATTN below.
             logger.info(
-                "MLA model detected; attention handled by vllm-metal model runner"
+                "MLA model detected; attention handled by aphrodite metal model runner"
             )
         if attn_selector_config.use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on Metal/MLX.")
diff --git a/aphrodite/metal/profiler/__init__.py b/aphrodite/metal/profiler/__init__.py
index addeb5b607..322d191c01 100644
--- a/aphrodite/metal/profiler/__init__.py
+++ b/aphrodite/metal/profiler/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal frame-capture profiler for vLLM Metal."""
+"""Metal frame-capture profiler for Aphrodite Metal."""
 
 from aphrodite.metal.profiler.wrapper import MetalProfilerWrapper
 
diff --git a/aphrodite/metal/profiler/wrapper.py b/aphrodite/metal/profiler/wrapper.py
index 0688da45cb..e5ce81ac26 100644
--- a/aphrodite/metal/profiler/wrapper.py
+++ b/aphrodite/metal/profiler/wrapper.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal frame-capture wrapper for vLLM's WorkerProfiler abstraction.
+"""Metal frame-capture wrapper for Aphrodite's WorkerProfiler abstraction.
 
 Subclasses ``aphrodite.profiler.wrapper.WorkerProfiler`` so that the manual
 start/stop surface — ``LLM.start_profile`` / ``LLM.stop_profile``, the
@@ -35,7 +35,7 @@
 
 
 class MetalProfilerWrapper(WorkerProfiler):
-    """Metal frame-capture flavor of vLLM's WorkerProfiler.
+    """Metal frame-capture flavor of Aphrodite's WorkerProfiler.
 
     Trace output: ``<profiler_config.torch_profiler_dir>/<trace_name>.gputrace``
     """
diff --git a/aphrodite/metal/pytorch_backend/tensor_bridge.py b/aphrodite/metal/pytorch_backend/tensor_bridge.py
index fd0424b15d..e0a185f101 100644
--- a/aphrodite/metal/pytorch_backend/tensor_bridge.py
+++ b/aphrodite/metal/pytorch_backend/tensor_bridge.py
@@ -15,7 +15,6 @@
 # MPS has a 4GB (2^32 bytes) limit for MPSTemporaryNDArray allocations.
 # Metal may allocate multiple temporary buffers internally, so we use a
 # conservative threshold of 1GB to avoid hitting the limit.
-# See: https://github.com/anthropics/vllm-metal/issues/43
 _MPS_SAFE_SIZE_BYTES = 1 << 30  # 1GB
 
 # MLX to PyTorch dtype mapping
@@ -150,7 +149,6 @@ def mlx_to_torch(
             tensor = tensor.to(device)
         else:
             # Large tensor - keep on CPU to avoid MPS 4GB limit crash
-            # See: https://github.com/anthropics/vllm-metal/issues/43
             logger.debug(
                 "Tensor too large for MPS (%d bytes > %d limit), keeping on CPU",
                 _get_tensor_size_bytes(array),
diff --git a/aphrodite/metal/stt/__init__.py b/aphrodite/metal/stt/__init__.py
index 638fb425c3..71fe4f5b1d 100644
--- a/aphrodite/metal/stt/__init__.py
+++ b/aphrodite/metal/stt/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Speech-to-Text support for vLLM Metal."""
+"""Speech-to-Text support for Aphrodite Metal."""
 
 from aphrodite.metal.stt.loader import load_model
 from aphrodite.metal.stt.protocol import TranscriptionResult, TranscriptionSegment
diff --git a/aphrodite/metal/stt/policy.py b/aphrodite/metal/stt/policy.py
index b192fce4ae..66c5879051 100644
--- a/aphrodite/metal/stt/policy.py
+++ b/aphrodite/metal/stt/policy.py
@@ -5,15 +5,15 @@
 
 from typing import Protocol
 
-# Nominal memory reported to vLLM scheduler for STT models.
+# Nominal memory reported to Aphrodite scheduler for STT models.
 # No KV cache is actually allocated; this just passes minimum-memory checks.
 STT_SCHED_AVAILABLE_BYTES = 1 << 30  # 1 GiB
 
-# Block size reported to vLLM for STT models (minimal, no real KV cache).
+# Block size reported to Aphrodite for STT models (minimal, no real KV cache).
 STT_SCHED_BLOCK_BYTES = 1
 
 # Nominal head size for the placeholder KV spec used only to satisfy
-# vLLM scheduler initialization for STT models.
+# Aphrodite scheduler initialization for STT models.
 STT_SCHED_NOMINAL_HEAD_SIZE = 64
 
 
diff --git a/aphrodite/metal/stt/qwen3_asr/adapter.py b/aphrodite/metal/stt/qwen3_asr/adapter.py
index e77faa1fc5..b88ff087c4 100644
--- a/aphrodite/metal/stt/qwen3_asr/adapter.py
+++ b/aphrodite/metal/stt/qwen3_asr/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Qwen3-ASR runtime adapter for vLLM STT execution."""
+"""Qwen3-ASR runtime adapter for Aphrodite STT execution."""
 
 from __future__ import annotations
 
diff --git a/aphrodite/metal/stt/qwen3_asr/config.py b/aphrodite/metal/stt/qwen3_asr/config.py
index f3e790b447..cf72580751 100644
--- a/aphrodite/metal/stt/qwen3_asr/config.py
+++ b/aphrodite/metal/stt/qwen3_asr/config.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Qwen3-ASR configuration (MLX-free).
 
-Keep this module free of MLX imports so vLLM compat code can import config and
+Keep this module free of MLX imports so Aphrodite compat code can import config and
 shape helpers during planning/registration without pulling in the model stack.
 """
 
@@ -10,7 +10,7 @@
 from dataclasses import dataclass, field
 
 from aphrodite.transformers_utils.configs.qwen3_asr import (
-    Qwen3ASRConfig as VllmQwen3ASRConfig,
+    Qwen3ASRConfig as AphroditeQwen3ASRConfig,
 )
 
 # Maximum decode tokens for Qwen3-ASR decode loop.
@@ -82,8 +82,8 @@ class Qwen3ASRConfig:
     n_audio_ctx: int = 1500
 
     @classmethod
-    def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
-        """Adapt the upstream vLLM/HF config into the local MLX model config."""
+    def _from_aphrodite_config(cls, config: AphroditeQwen3ASRConfig) -> Qwen3ASRConfig:
+        """Adapt the upstream Aphrodite/HF config into the local MLX model config."""
         thinker = config.thinker_config
         audio = thinker.audio_config
         text = thinker.text_config
@@ -131,4 +131,4 @@ def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
     @classmethod
     def from_dict(cls, d: dict) -> Qwen3ASRConfig:
         """Create config from config.json using the upstream schema owner."""
-        return cls._from_aphrodite_config(VllmQwen3ASRConfig.from_dict(d))
+        return cls._from_aphrodite_config(AphroditeQwen3ASRConfig.from_dict(d))
diff --git a/aphrodite/metal/stt/qwen3_asr/model.py b/aphrodite/metal/stt/qwen3_asr/model.py
index 09b122505c..044ee04342 100644
--- a/aphrodite/metal/stt/qwen3_asr/model.py
+++ b/aphrodite/metal/stt/qwen3_asr/model.py
@@ -558,7 +558,7 @@ def decode_step(
         return self.language_model.forward_embeds(embeds, cache)
 
     def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
-        """Create the model-owned runtime adapter used by the vLLM runner."""
+        """Create the model-owned runtime adapter used by the Aphrodite runner."""
         # Local import: avoid import-time cycles (adapter imports transcriber).
         from .adapter import Qwen3ASRRuntimeAdapter
 
diff --git a/aphrodite/metal/stt/runtime.py b/aphrodite/metal/stt/runtime.py
index cd6c4f6edd..3408210d59 100644
--- a/aphrodite/metal/stt/runtime.py
+++ b/aphrodite/metal/stt/runtime.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
-"""STT runtime adapter contract used by the vLLM runner.
+"""STT runtime adapter contract used by the Aphrodite runner.
 
-The vLLM runner delegates STT execution to model-owned runtime adapters under
+The Aphrodite runner delegates STT execution to model-owned runtime adapters under
 `stt/<model>/adapter.py` so shared code does not accumulate per-model branches.
 """
 
@@ -26,7 +26,7 @@
 
 
 class STTRuntimeAdapter(ABC):
-    """Model-owned bridge between vLLM STT inputs and per-model STT execution.
+    """Model-owned bridge between Aphrodite STT inputs and per-model STT execution.
 
     Concrete implementations live under `stt/<model>/adapter.py` and own:
     - input_features normalization to the model's expected encoder input shape
diff --git a/aphrodite/metal/stt/whisper/adapter.py b/aphrodite/metal/stt/whisper/adapter.py
index e398041a25..79a7630d3f 100644
--- a/aphrodite/metal/stt/whisper/adapter.py
+++ b/aphrodite/metal/stt/whisper/adapter.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Whisper runtime adapter for vLLM STT execution."""
+"""Whisper runtime adapter for Aphrodite STT execution."""
 
 from __future__ import annotations
 
diff --git a/aphrodite/metal/stt/whisper/model.py b/aphrodite/metal/stt/whisper/model.py
index 95857b7c8e..3680eae70f 100644
--- a/aphrodite/metal/stt/whisper/model.py
+++ b/aphrodite/metal/stt/whisper/model.py
@@ -273,7 +273,7 @@ def __init__(self, config: WhisperConfig, dtype: mx.Dtype = mx.float16):
         self._alignment_heads = mx.array(np.asarray(all_heads.nonzero()).T)
 
     def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
-        """Create the model-owned runtime adapter used by the vLLM runner."""
+        """Create the model-owned runtime adapter used by the Aphrodite runner."""
         # Local import: avoid import-time cycles (adapter imports transcriber).
         from .adapter import WhisperRuntimeAdapter
 
diff --git a/aphrodite/metal/utils.py b/aphrodite/metal/utils.py
index 767cfabb3a..7f716a2c46 100644
--- a/aphrodite/metal/utils.py
+++ b/aphrodite/metal/utils.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Metal utility functions for vLLM Metal plugin."""
+"""Metal utility functions for Aphrodite Metal plugin."""
 
 import logging
 import os
@@ -24,7 +24,7 @@ def get_model_download_path(model_repo_name: str) -> str:
     Example:
 
     ```bash
-    APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache vllm serve Qwen/Qwen2.5-0.5B
+    APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache aphrodite run Qwen/Qwen2.5-0.5B
     ```
     """
     if Path(model_repo_name).exists():
diff --git a/aphrodite/metal/v1/__init__.py b/aphrodite/metal/v1/__init__.py
index 5e738d79e1..474186a806 100644
--- a/aphrodite/metal/v1/__init__.py
+++ b/aphrodite/metal/v1/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""vLLM v1 compatibility module for Metal platform."""
+"""Aphrodite v1 compatibility module for Metal platform."""
 
 __all__ = ["MetalWorker"]
 
diff --git a/aphrodite/metal/v1/cache_policy.py b/aphrodite/metal/v1/cache_policy.py
index f8df7b7fe1..3efce09b51 100644
--- a/aphrodite/metal/v1/cache_policy.py
+++ b/aphrodite/metal/v1/cache_policy.py
@@ -66,7 +66,7 @@ class TurboQuantAttentionSpec(FullAttentionSpec):
     """FullAttentionSpec for TurboQuant-compressed KV cache.
 
     Reports the true packed byte count per page via an override of
-    ``real_page_size_bytes`` so vLLM's scheduler can budget more blocks
+    ``real_page_size_bytes`` so Aphrodite's scheduler can budget more blocks
     than the FP16 formula would allow — without lying about ``head_size``
     (the ``head_size_v`` reverse-engineering trick the previous version
     used produced negative values for aggressive 2-bit configs).
@@ -146,9 +146,9 @@ def _build_turboquant_attention_spec(
 
 
 def _register_turboquant_spec_manager() -> None:
-    """Register ``TurboQuantAttentionSpec`` in vLLM's spec→manager map.
+    """Register ``TurboQuantAttentionSpec`` in Aphrodite's spec→manager map.
 
-    vLLM's ``get_manager_for_kv_cache_spec`` uses strict-type lookup
+    Aphrodite's ``get_manager_for_kv_cache_spec`` uses strict-type lookup
     (``spec_manager_map[type(spec)]``), not ``isinstance``, so the
     ``FullAttentionSpec`` entry does not cover subclasses.  We reuse
     ``FullAttentionManager`` because a TurboQuant cache is accessed
@@ -157,7 +157,7 @@ def _register_turboquant_spec_manager() -> None:
     inside the Metal kernel).
 
     Mirrors the upstream registration for ``MLAAttentionSpec`` (which
-    vLLM also maps to ``FullAttentionManager``).
+    Aphrodite also maps to ``FullAttentionManager``).
     """
     try:
         from aphrodite.v1.core.single_type_kv_cache_manager import (
@@ -165,7 +165,7 @@ def _register_turboquant_spec_manager() -> None:
             spec_manager_map,
         )
     except ImportError:
-        # vLLM shape changed; let the scheduler raise its own clearer error.
+        # Aphrodite shape changed; let the scheduler raise its own clearer error.
         return
     spec_manager_map.setdefault(TurboQuantAttentionSpec, FullAttentionManager)
 
diff --git a/aphrodite/metal/v1/contiguous_cache.py b/aphrodite/metal/v1/contiguous_cache.py
index 088606a7f2..5da9912091 100644
--- a/aphrodite/metal/v1/contiguous_cache.py
+++ b/aphrodite/metal/v1/contiguous_cache.py
@@ -320,7 +320,7 @@ def _merge_rotating_kv_caches(
     mlx-lm <= 0.29.1 uses ``c.offset`` which can exceed the underlying array size
     after the cache has rotated, causing a broadcast shape error.
 
-    This workaround can be removed once vllm-metal can depend on an mlx-lm version
+    This workaround can be removed once Aphrodite Metal can depend on an mlx-lm version
     that includes the upstream fix (ml-explore/mlx-lm#738) and has been verified
     to work with gpt-oss models end-to-end.
     """
diff --git a/aphrodite/metal/v1/mm/__init__.py b/aphrodite/metal/v1/mm/__init__.py
index 740928870c..750a441114 100644
--- a/aphrodite/metal/v1/mm/__init__.py
+++ b/aphrodite/metal/v1/mm/__init__.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""v1 multimodal runtime helpers for vLLM Metal."""
+"""v1 multimodal runtime helpers for Aphrodite Metal."""
 
 from __future__ import annotations
 
diff --git a/aphrodite/metal/v1/mm/encoder_cache.py b/aphrodite/metal/v1/mm/encoder_cache.py
index c03e766815..27f32a0bf9 100644
--- a/aphrodite/metal/v1/mm/encoder_cache.py
+++ b/aphrodite/metal/v1/mm/encoder_cache.py
@@ -11,7 +11,7 @@
 class EncoderCache:
     """Store multimodal features and MLX encoder outputs by request/hash.
 
-    Mirrors upstream vLLM's v1 GPU ``EncoderCache`` with the tensor type
+    Mirrors upstream Aphrodite's v1 GPU ``EncoderCache`` with the tensor type
     changed from ``torch.Tensor`` to ``mlx.core.array``.
     """
 
@@ -29,7 +29,7 @@ def remove_request(self, req_id: str) -> None:
 
     def reset_mm_cache(self) -> None:
         """Mirror upstream's profiling-cache reset hook."""
-        # TODO: Implement when vllm-metal adds profiling-time MM cache state.
+        # TODO: Implement when Aphrodite Metal adds profiling-time MM cache state.
         pass
 
     def reset_encoder_cache(self) -> None:
diff --git a/aphrodite/metal/v1/model_adapter.py b/aphrodite/metal/v1/model_adapter.py
index 17dc24c403..aa8e978e8e 100644
--- a/aphrodite/metal/v1/model_adapter.py
+++ b/aphrodite/metal/v1/model_adapter.py
@@ -60,11 +60,11 @@ def build_sliding_window_per_layer(
         """Return per-layer sliding window sizes, or None for no enforcement."""
 
 
-# Models/configs that vLLM flags as multimodal but must be loaded via mlx_lm.
+# Models/configs that Aphrodite flags as multimodal but must be loaded via mlx_lm.
 # gemma4: mlx_vlm forward path produces garbled output vs mlx_lm.
 _TEXT_BACKBONE_OVERRIDE_TYPES: frozenset[str] = frozenset({"gemma4"})
 # Qwen3.5/Qwen3.6 conditional-generation wrappers expose a multimodal config,
-# but vllm-metal only serves them in text-only mode. Route them through
+# but Aphrodite Metal only serves them in text-only mode. Route them through
 # mlx_lm's qwen3_5 text loader; the mlx_vlm wrapper adds multimodal processing
 # overhead and some local text-only MLX checkpoints do not behave correctly
 # through the VLM forward path.
@@ -135,7 +135,7 @@ def normalize_model_config(self, model_config: ModelConfig) -> None:
 
         When the active serve mode routes a multimodal checkpoint through the
         text-only compatibility path, leaving ``multimodal_config`` populated
-        causes vLLM to eagerly initialize multimodal processors that the
+        causes Aphrodite to eagerly initialize multimodal processors that the
         compatibility path intentionally bypasses. Clearing it here makes
         ``is_multimodal_model`` ``False`` so the input processor skips that
         setup. The ``should_force_text_backbone`` predicate is the single
diff --git a/aphrodite/metal/v1/model_lifecycle.py b/aphrodite/metal/v1/model_lifecycle.py
index c393863d53..f52fb5d5a1 100644
--- a/aphrodite/metal/v1/model_lifecycle.py
+++ b/aphrodite/metal/v1/model_lifecycle.py
@@ -104,7 +104,7 @@ def _mlx_lm_compatible_model_path(model_name: str):
         yield model_name
         return
 
-    with TemporaryDirectory(prefix="vllm-metal-mlx-lm-") as tmpdir:
+    with TemporaryDirectory(prefix="aphrodite-metal-mlx-lm-") as tmpdir:
         compat_path = Path(tmpdir)
 
         for src in model_path.iterdir():
@@ -147,7 +147,7 @@ def load(self) -> None:
             return
 
         model_config = runner.model_config
-        # vLLM model_config shape varies across backends.
+        # Aphrodite model_config shape varies across backends.
         hf_config = getattr(model_config, "hf_config", None)
         is_vlm = bool(getattr(model_config, "is_multimodal_model", False))
         if self._model_adapter.should_force_text_backbone(hf_config):
diff --git a/aphrodite/metal/v1/model_runner.py b/aphrodite/metal/v1/model_runner.py
index 283eb7ccf4..ee7050b5f9 100644
--- a/aphrodite/metal/v1/model_runner.py
+++ b/aphrodite/metal/v1/model_runner.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-Metal vLLM v1 model runner.
+Metal Aphrodite v1 model runner.
 
 Orchestration only: coordinates scheduling, dispatch, and output assembly.
 Model-specific behavior belongs in adapters; backend-specific kernels live in
@@ -87,8 +87,8 @@ def _create_request_generator(
 ) -> torch.Generator | None:
     """Create a per-request generator for seeded sampling.
 
-    vLLM uses a per-request generator only when an explicit seed is provided.
-    For unseeded sampling, vLLM relies on the global RNG state.
+    Aphrodite uses a per-request generator only when an explicit seed is provided.
+    For unseeded sampling, Aphrodite relies on the global RNG state.
     """
     if sampling_params.seed is None:
         return None
@@ -105,7 +105,7 @@ class RequestState:
 
     token_ids: list[int]
     # Length of the original prompt (prefix) within `token_ids`.
-    # vLLM applies repetition penalties to both prompt+output tokens, but applies
+    # Aphrodite applies repetition penalties to both prompt+output tokens, but applies
     # presence/frequency penalties only to generated (output) tokens.
     prompt_len: int
     cache: list[AnyCache]  # Per-layer caches (KVCache, RotatingKVCache, or ArraysCache)
@@ -315,7 +315,7 @@ def _slice_logprobs_row(
 class MetalModelRunner:
     """Model runner for MLX-based inference on Metal.
 
-    Implements the vLLM v1 model runner interface for Apple Silicon.
+    Implements the Aphrodite v1 model runner interface for Apple Silicon.
     Uses true batched decode with BatchKVCache for efficient parallel processing.
     """
 
@@ -327,7 +327,7 @@ def __init__(
         """Initialize model runner.
 
         Args:
-            aphrodite_config: vLLM configuration
+            aphrodite_config: Aphrodite configuration
             device: PyTorch device (CPU for Metal interop)
         """
         self.aphrodite_config = aphrodite_config
@@ -359,7 +359,7 @@ def __init__(
         self._gdn_free_slots: list[int] = []
         self._gdn_needs_materialize = False
 
-        # vLLM Sampler for token sampling with temperature, top_k, top_p support
+        # Aphrodite Sampler for token sampling with temperature, top_k, top_p support
         self._sampler = Sampler()
 
         # Build logits processors (includes custom plugins from entry-points)
@@ -375,7 +375,7 @@ def __init__(
             custom_logitsprocs,
         )
 
-        # vLLM v1 async scheduling calls sample_tokens after execute_model.
+        # Aphrodite v1 async scheduling calls sample_tokens after execute_model.
         # Keep the latest execution output so sample_tokens can return it.
         self._pending_output: ModelRunnerOutput | None = None
 
@@ -1705,7 +1705,7 @@ def execute_model(
             )
             return None
 
-        # Defensive invariant: the vLLM scheduler sets has_structured_output_requests
+        # Defensive invariant: the Aphrodite scheduler sets has_structured_output_requests
         # only when at least one SO request is present in the *current* scheduled
         # batch (not the global queue). Any such request on the paged path must
         # contribute a paged decode or prefill entry, so has_paged_work() must be
@@ -1736,7 +1736,7 @@ def sample_tokens(
     ) -> ModelRunnerOutput | None:
         """Wait for GPU forward, sample tokens, and postprocess.
 
-        Called by the vLLM v1 engine after ``execute_model`` returns ``None``.
+        Called by the Aphrodite v1 engine after ``execute_model`` returns ``None``.
         For the paged path, this is where the actual GPU synchronization,
         token sampling, and request state updates happen — allowing the
         scheduler to run while the GPU was computing the forward pass.
@@ -1755,7 +1755,7 @@ def sample_tokens(
             return output
 
         # Async scheduling: execute_model may have failed; return None so
-        # vLLM can surface the original exception.
+        # Aphrodite can surface the original exception.
         logger.error(
             "sample_tokens called with no pending state — "
             "neither _execute_model_state nor _pending_output was set."
diff --git a/aphrodite/metal/v1/sampling_batch.py b/aphrodite/metal/v1/sampling_batch.py
index 0d02eeb806..d46f1c47fb 100644
--- a/aphrodite/metal/v1/sampling_batch.py
+++ b/aphrodite/metal/v1/sampling_batch.py
@@ -9,15 +9,15 @@
 
 import mlx.core as mx
 import torch
+
+from aphrodite.metal.pytorch_backend.tensor_bridge import mlx_to_torch
 from aphrodite.sampling_params import SamplingParams
 from aphrodite.utils.torch_utils import make_tensor_with_pad
+from aphrodite.v1.outputs import LogprobsTensors
 from aphrodite.v1.sample.logits_processor import LogitsProcessors
 from aphrodite.v1.sample.logits_processor.interface import BatchUpdate
 from aphrodite.v1.sample.metadata import SamplingMetadata
 from aphrodite.v1.sample.sampler import Sampler
-from aphrodite.v1.outputs import LogprobsTensors
-
-from aphrodite.metal.pytorch_backend.tensor_bridge import mlx_to_torch
 
 GREEDY_TEMPERATURE_EPS = 1e-5
 
@@ -74,85 +74,46 @@ def __init__(
         self.logitsprocs = logitsprocs or LogitsProcessors()
         self.generators = {} if generators is None else generators
         self.all_greedy = all(
-            sampling_params.temperature < GREEDY_TEMPERATURE_EPS
-            for sampling_params in self.sampling_params_list
+            sampling_params.temperature < GREEDY_TEMPERATURE_EPS for sampling_params in self.sampling_params_list
         )
         self.all_random = not self.all_greedy and all(
-            sampling_params.temperature >= GREEDY_TEMPERATURE_EPS
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_top_p = all(
-            sampling_params.top_p == 1.0
-            for sampling_params in self.sampling_params_list
+            sampling_params.temperature >= GREEDY_TEMPERATURE_EPS for sampling_params in self.sampling_params_list
         )
+        self.no_top_p = all(sampling_params.top_p == 1.0 for sampling_params in self.sampling_params_list)
         self.no_top_k = all(
-            not (0 < sampling_params.top_k < self.vocab_size)
-            for sampling_params in self.sampling_params_list
+            not (0 < sampling_params.top_k < self.vocab_size) for sampling_params in self.sampling_params_list
         )
         self.no_dynatemp = all(
-            sampling_params.dynatemp_min <= 0.0
-            and sampling_params.dynatemp_max <= 0.0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_top_a = all(
-            sampling_params.top_a <= 0.0 for sampling_params in self.sampling_params_list
-        )
-        self.no_dry = all(
-            sampling_params.dry_multiplier <= 0.0
+            sampling_params.dynatemp_min <= 0.0 and sampling_params.dynatemp_max <= 0.0
             for sampling_params in self.sampling_params_list
         )
+        self.no_top_a = all(sampling_params.top_a <= 0.0 for sampling_params in self.sampling_params_list)
+        self.no_dry = all(sampling_params.dry_multiplier <= 0.0 for sampling_params in self.sampling_params_list)
         self.no_no_repeat_ngram = all(
-            sampling_params.no_repeat_ngram_size <= 0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_tfs = all(
-            sampling_params.tfs >= 1.0 for sampling_params in self.sampling_params_list
-        )
-        self.no_eta_cutoff = all(
-            sampling_params.eta_cutoff <= 0.0
-            for sampling_params in self.sampling_params_list
+            sampling_params.no_repeat_ngram_size <= 0 for sampling_params in self.sampling_params_list
         )
+        self.no_tfs = all(sampling_params.tfs >= 1.0 for sampling_params in self.sampling_params_list)
+        self.no_eta_cutoff = all(sampling_params.eta_cutoff <= 0.0 for sampling_params in self.sampling_params_list)
         self.no_epsilon_cutoff = all(
-            sampling_params.epsilon_cutoff <= 0.0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_typical_p = all(
-            sampling_params.typical_p >= 1.0
-            for sampling_params in self.sampling_params_list
+            sampling_params.epsilon_cutoff <= 0.0 for sampling_params in self.sampling_params_list
         )
+        self.no_typical_p = all(sampling_params.typical_p >= 1.0 for sampling_params in self.sampling_params_list)
         self.no_quadratic = all(
-            sampling_params.smoothing_factor <= 0.0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_xtc = all(
-            sampling_params.xtc_probability <= 0.0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_top_nsigma = all(
-            sampling_params.nsigma <= 0.0 for sampling_params in self.sampling_params_list
-        )
-        self.no_mirostat = all(
-            sampling_params.mirostat_mode == 0
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_skew = all(
-            sampling_params.skew == 0.0 for sampling_params in self.sampling_params_list
+            sampling_params.smoothing_factor <= 0.0 for sampling_params in self.sampling_params_list
         )
+        self.no_xtc = all(sampling_params.xtc_probability <= 0.0 for sampling_params in self.sampling_params_list)
+        self.no_top_nsigma = all(sampling_params.nsigma <= 0.0 for sampling_params in self.sampling_params_list)
+        self.no_mirostat = all(sampling_params.mirostat_mode == 0 for sampling_params in self.sampling_params_list)
+        self.no_skew = all(sampling_params.skew == 0.0 for sampling_params in self.sampling_params_list)
         self.no_allowed_token_ids = all(
-            not sampling_params.allowed_token_ids
-            for sampling_params in self.sampling_params_list
+            not sampling_params.allowed_token_ids for sampling_params in self.sampling_params_list
         )
         self.no_bad_words = all(
-            not sampling_params.bad_words_token_ids
-            for sampling_params in self.sampling_params_list
-        )
-        self.no_logit_bias = all(
-            not sampling_params.logit_bias
-            for sampling_params in self.sampling_params_list
+            not sampling_params.bad_words_token_ids for sampling_params in self.sampling_params_list
         )
+        self.no_logit_bias = all(not sampling_params.logit_bias for sampling_params in self.sampling_params_list)
         self.no_logprob_token_ids = all(
-            sampling_params.logprob_token_ids is None
-            for sampling_params in self.sampling_params_list
+            sampling_params.logprob_token_ids is None for sampling_params in self.sampling_params_list
         )
         self.no_penalties = all(
             sampling_params.frequency_penalty == 0.0
@@ -170,10 +131,7 @@ def can_use_native_greedy(
         """Return whether MLX argmax matches the requested sampling behavior."""
         return all(
             sampling_params.temperature < GREEDY_TEMPERATURE_EPS
-            and (
-                sampling_params.top_k <= 0
-                or (vocab_size is not None and sampling_params.top_k >= vocab_size)
-            )
+            and (sampling_params.top_k <= 0 or (vocab_size is not None and sampling_params.top_k >= vocab_size))
             and sampling_params.top_p == 1.0
             and sampling_params.min_p == 0.0
             and sampling_params.top_a == 0.0
@@ -207,10 +165,7 @@ def _make_temperature(self) -> torch.Tensor | None:
             return None
 
         return torch.tensor(
-            [
-                sampling_params.temperature
-                for sampling_params in self.sampling_params_list
-            ],
+            [sampling_params.temperature for sampling_params in self.sampling_params_list],
             dtype=torch.float32,
             device=self.device,
         )
@@ -231,9 +186,7 @@ def _make_top_k(self) -> torch.Tensor | None:
 
         return torch.tensor(
             [
-                sampling_params.top_k
-                if 0 < sampling_params.top_k < self.vocab_size
-                else self.vocab_size
+                sampling_params.top_k if 0 < sampling_params.top_k < self.vocab_size else self.vocab_size
                 for sampling_params in self.sampling_params_list
             ],
             dtype=torch.int32,
@@ -251,10 +204,7 @@ def _make_float_tensor(
             return None
         field_name = source_attr or attr
         return torch.tensor(
-            [
-                float(getattr(sampling_params, field_name))
-                for sampling_params in self.sampling_params_list
-            ],
+            [float(getattr(sampling_params, field_name)) for sampling_params in self.sampling_params_list],
             dtype=torch.float32,
             device=self.device,
         )
@@ -271,10 +221,7 @@ def _make_int_tensor(
             return None
         field_name = source_attr or attr
         return torch.tensor(
-            [
-                int(getattr(sampling_params, field_name))
-                for sampling_params in self.sampling_params_list
-            ],
+            [int(getattr(sampling_params, field_name)) for sampling_params in self.sampling_params_list],
             dtype=dtype,
             device=self.device,
         )
@@ -295,26 +242,17 @@ def _make_penalty_tensors(
         self,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         frequency_penalties = torch.tensor(
-            [
-                sampling_params.frequency_penalty
-                for sampling_params in self.sampling_params_list
-            ],
+            [sampling_params.frequency_penalty for sampling_params in self.sampling_params_list],
             dtype=torch.float32,
             device=self.device,
         )
         presence_penalties = torch.tensor(
-            [
-                sampling_params.presence_penalty
-                for sampling_params in self.sampling_params_list
-            ],
+            [sampling_params.presence_penalty for sampling_params in self.sampling_params_list],
             dtype=torch.float32,
             device=self.device,
         )
         repetition_penalties = torch.tensor(
-            [
-                sampling_params.repetition_penalty
-                for sampling_params in self.sampling_params_list
-            ],
+            [sampling_params.repetition_penalty for sampling_params in self.sampling_params_list],
             dtype=torch.float32,
             device=self.device,
         )
@@ -460,7 +398,7 @@ def can_use_native_random_for_batch(self) -> bool:
         )
 
     def make_sampling_metadata(self) -> SamplingMetadata:
-        """Create vLLM ``SamplingMetadata`` for this batch."""
+        """Create Aphrodite ``SamplingMetadata`` for this batch."""
         self._refresh_logits_processors()
         (
             frequency_penalties,
@@ -482,41 +420,23 @@ def make_sampling_metadata(self) -> SamplingMetadata:
             top_p=self._make_top_p(),
             top_k=self._make_top_k(),
             top_a=self._make_float_tensor("top_a", disabled=self.no_top_a),
-            dry_multiplier=self._make_float_tensor(
-                "dry_multiplier", disabled=self.no_dry
-            ),
+            dry_multiplier=self._make_float_tensor("dry_multiplier", disabled=self.no_dry),
             dry_base=self._make_float_tensor("dry_base", disabled=self.no_dry),
-            dry_allowed_length=self._make_int_tensor(
-                "dry_allowed_length", disabled=self.no_dry
-            ),
+            dry_allowed_length=self._make_int_tensor("dry_allowed_length", disabled=self.no_dry),
             dry_sequence_breaker_ids=self._make_dry_sequence_breaker_ids(),
             dry_ranges=self._make_int_tensor(
                 "dry_ranges",
                 disabled=self.no_dry,
                 source_attr="dry_range",
             ),
-            dry_max_ngram=self._make_int_tensor(
-                "dry_max_ngram", disabled=self.no_dry
-            ),
-            dry_max_occurrences=self._make_int_tensor(
-                "dry_max_occurrences", disabled=self.no_dry
-            ),
-            dry_early_exit_match_len=self._make_int_tensor(
-                "dry_early_exit_match_len", disabled=self.no_dry
-            ),
-            no_repeat_ngram_size=self._make_int_tensor(
-                "no_repeat_ngram_size", disabled=self.no_no_repeat_ngram
-            ),
+            dry_max_ngram=self._make_int_tensor("dry_max_ngram", disabled=self.no_dry),
+            dry_max_occurrences=self._make_int_tensor("dry_max_occurrences", disabled=self.no_dry),
+            dry_early_exit_match_len=self._make_int_tensor("dry_early_exit_match_len", disabled=self.no_dry),
+            no_repeat_ngram_size=self._make_int_tensor("no_repeat_ngram_size", disabled=self.no_no_repeat_ngram),
             tfs=self._make_float_tensor("tfs", disabled=self.no_tfs),
-            eta_cutoff=self._make_float_tensor(
-                "eta_cutoff", disabled=self.no_eta_cutoff
-            ),
-            epsilon_cutoff=self._make_float_tensor(
-                "epsilon_cutoff", disabled=self.no_epsilon_cutoff
-            ),
-            typical_p=self._make_float_tensor(
-                "typical_p", disabled=self.no_typical_p
-            ),
+            eta_cutoff=self._make_float_tensor("eta_cutoff", disabled=self.no_eta_cutoff),
+            epsilon_cutoff=self._make_float_tensor("epsilon_cutoff", disabled=self.no_epsilon_cutoff),
+            typical_p=self._make_float_tensor("typical_p", disabled=self.no_typical_p),
             quadratic_smoothing_factor=self._make_float_tensor(
                 "quadratic_smoothing_factor",
                 disabled=self.no_quadratic,
@@ -527,26 +447,16 @@ def make_sampling_metadata(self) -> SamplingMetadata:
                 disabled=self.no_quadratic,
                 source_attr="smoothing_curve",
             ),
-            xtc_threshold=self._make_float_tensor(
-                "xtc_threshold", disabled=self.no_xtc
-            ),
-            xtc_probability=self._make_float_tensor(
-                "xtc_probability", disabled=self.no_xtc
-            ),
+            xtc_threshold=self._make_float_tensor("xtc_threshold", disabled=self.no_xtc),
+            xtc_probability=self._make_float_tensor("xtc_probability", disabled=self.no_xtc),
             top_nsigma=self._make_float_tensor(
                 "top_nsigma",
                 disabled=self.no_top_nsigma,
                 source_attr="nsigma",
             ),
-            mirostat_mode=self._make_int_tensor(
-                "mirostat_mode", disabled=self.no_mirostat
-            ),
-            mirostat_tau=self._make_float_tensor(
-                "mirostat_tau", disabled=self.no_mirostat
-            ),
-            mirostat_eta=self._make_float_tensor(
-                "mirostat_eta", disabled=self.no_mirostat
-            ),
+            mirostat_mode=self._make_int_tensor("mirostat_mode", disabled=self.no_mirostat),
+            mirostat_tau=self._make_float_tensor("mirostat_tau", disabled=self.no_mirostat),
+            mirostat_eta=self._make_float_tensor("mirostat_eta", disabled=self.no_mirostat),
             skew=self._make_float_tensor("skew", disabled=self.no_skew),
             generators=self.generators,
             max_num_logprobs=self._make_max_num_logprobs(),
@@ -561,13 +471,8 @@ def make_sampling_metadata(self) -> SamplingMetadata:
             logit_bias=self._make_logit_bias(),
             logitsprocs=self.logitsprocs,
             logprob_token_ids=self._make_logprob_token_ids(),
-            temperature_last=[
-                sampling_params.temperature_last
-                for sampling_params in self.sampling_params_list
-            ],
-            persistent_data={
-                index: {} for index in range(len(self.sampling_params_list))
-            },
+            temperature_last=[sampling_params.temperature_last for sampling_params in self.sampling_params_list],
+            persistent_data={index: {} for index in range(len(self.sampling_params_list))},
             spec_token_ids=None,
         )
 
@@ -593,16 +498,12 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array:
 
     if not batch.no_top_k:
         top_ks = [
-            sampling_params.top_k
-            if 0 < sampling_params.top_k < batch.vocab_size
-            else batch.vocab_size
+            sampling_params.top_k if 0 < sampling_params.top_k < batch.vocab_size else batch.vocab_size
             for sampling_params in batch.sampling_params_list
         ]
         max_top_k = max(top_ks)
         if max_top_k < batch.vocab_size:
-            topk_indices = mx.argpartition(-logits, max_top_k - 1, axis=-1)[
-                :, :max_top_k
-            ]
+            topk_indices = mx.argpartition(-logits, max_top_k - 1, axis=-1)[:, :max_top_k]
             logits = mx.take_along_axis(logits, topk_indices, axis=-1)
             if len(set(top_ks)) != 1:
                 positions = mx.arange(max_top_k)[None, :]
@@ -612,15 +513,10 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array:
             if not batch.no_top_p:
                 sorted_positions = mx.argsort(-logits, axis=-1)
                 sorted_logits = mx.take_along_axis(logits, sorted_positions, axis=-1)
-                sorted_indices = mx.take_along_axis(
-                    topk_indices, sorted_positions, axis=-1
-                )
+                sorted_indices = mx.take_along_axis(topk_indices, sorted_positions, axis=-1)
                 sorted_probs = mx.softmax(sorted_logits, axis=-1)
                 top_ps = mx.array(
-                    [
-                        sampling_params.top_p
-                        for sampling_params in batch.sampling_params_list
-                    ],
+                    [sampling_params.top_p for sampling_params in batch.sampling_params_list],
                     dtype=mx.float32,
                 )[:, None]
                 # Keep the first token that crosses top-p, matching nucleus
@@ -629,14 +525,10 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array:
                 remove = (mx.cumsum(sorted_probs, axis=-1) - sorted_probs) > top_ps
                 sorted_logits = mx.where(remove, -float("inf"), sorted_logits)
                 sampled_positions = mx.random.categorical(sorted_logits, axis=-1)
-                return mx.take_along_axis(
-                    sorted_indices, sampled_positions[:, None], axis=-1
-                )[:, 0]
+                return mx.take_along_axis(sorted_indices, sampled_positions[:, None], axis=-1)[:, 0]
 
             sampled_positions = mx.random.categorical(logits, axis=-1)
-            return mx.take_along_axis(
-                topk_indices, sampled_positions[:, None], axis=-1
-            )[:, 0]
+            return mx.take_along_axis(topk_indices, sampled_positions[:, None], axis=-1)[:, 0]
 
         topk_values = mx.topk(logits, max_top_k, axis=-1)
         topk_thresholds = mx.min(topk_values, axis=-1, keepdims=True)
@@ -655,9 +547,7 @@ def _mlx_random_sample(logits: mx.array, batch: SamplingBatch) -> mx.array:
         remove = (mx.cumsum(sorted_probs, axis=-1) - sorted_probs) > top_ps
         sorted_logits = mx.where(remove, -float("inf"), sorted_logits)
         sampled_positions = mx.random.categorical(sorted_logits, axis=-1)
-        return mx.take_along_axis(
-            sorted_indices, sampled_positions[:, None], axis=-1
-        )[:, 0]
+        return mx.take_along_axis(sorted_indices, sampled_positions[:, None], axis=-1)[:, 0]
 
     return mx.random.categorical(logits, axis=-1)
 
@@ -671,7 +561,7 @@ def sample_from_logits(
     """Sample tokens from pre-sliced 2D logits ``(batch_size, vocab)``.
 
     Single entry point for all sampling paths.  Chooses native MLX greedy
-    when possible, otherwise bridges to the vLLM torch sampler.
+    when possible, otherwise bridges to the Aphrodite torch sampler.
     """
     if batch.can_use_native_greedy_for_batch():
         tokens = _mlx_greedy_sample(logits_2d)
@@ -713,7 +603,7 @@ def sample_decode_tokens(
         logits: Full logits array, shape ``(1, total_tokens, vocab)``.
         decode_reqs: ``(req_id, RequestState)`` pairs for decode requests.
         num_decode: Number of decode requests (prefix of the token dimension).
-        sampler: vLLM Sampler instance.
+        sampler: Aphrodite Sampler instance.
         device: PyTorch device for the torch bridge path.
         vocab_size: Model vocabulary size.
         logitsprocs: Optional logits processors.
@@ -727,17 +617,9 @@ def sample_decode_tokens(
     decode_logits = logits[0, :num_decode, :]  # (num_decode, vocab)
 
     sampling_params_list = [state.sampling_params for _, state in decode_reqs]
-    prompt_token_ids_list = [
-        state.token_ids[: state.prompt_len] for _, state in decode_reqs
-    ]
-    output_tokens_list = [
-        state.token_ids[state.prompt_len :] for _, state in decode_reqs
-    ]
-    generators = {
-        i: state.generator
-        for i, (_, state) in enumerate(decode_reqs)
-        if state.generator is not None
-    }
+    prompt_token_ids_list = [state.token_ids[: state.prompt_len] for _, state in decode_reqs]
+    output_tokens_list = [state.token_ids[state.prompt_len :] for _, state in decode_reqs]
+    generators = {i: state.generator for i, (_, state) in enumerate(decode_reqs) if state.generator is not None}
 
     batch = SamplingBatch(
         sampling_params_list,
@@ -769,7 +651,7 @@ def sample_prefill_tokens(
         prefill_reqs: List of ``PrefillRequest`` objects.
         cu_seqlens: Cumulative sequence lengths for logit position lookup.
         num_decode: Number of decode requests (offset into cu_seqlens).
-        sampler: vLLM Sampler instance.
+        sampler: Aphrodite Sampler instance.
         device: PyTorch device for the torch bridge path.
         vocab_size: Model vocabulary size.
         logitsprocs: Optional logits processors.
@@ -790,11 +672,7 @@ def sample_prefill_tokens(
         else:
             prompt_len = len(pr.token_ids)
 
-        prompt_for_meta = (
-            pr.full_prompt_token_ids
-            if pr.full_prompt_token_ids is not None
-            else pr.token_ids
-        )
+        prompt_for_meta = pr.full_prompt_token_ids if pr.full_prompt_token_ids is not None else pr.token_ids
         generators = {} if pr.generator is None else {0: pr.generator}
 
         batch = SamplingBatch(
@@ -839,9 +717,7 @@ def _merge_single_row_logprobs(
             continue
         pad = width - row.logprobs.shape[1]
         if pad:
-            token_rows.append(
-                torch.nn.functional.pad(row.logprob_token_ids[:1], (0, pad))
-            )
+            token_rows.append(torch.nn.functional.pad(row.logprob_token_ids[:1], (0, pad)))
             logprob_rows.append(torch.nn.functional.pad(row.logprobs[:1], (0, pad)))
         else:
             token_rows.append(row.logprob_token_ids[:1])
diff --git a/aphrodite/platforms/cpu.py b/aphrodite/platforms/cpu.py
index 0ae45499a0..74b3af119a 100644
--- a/aphrodite/platforms/cpu.py
+++ b/aphrodite/platforms/cpu.py
@@ -18,7 +18,12 @@
 from aphrodite.utils.mem_constants import GiB_bytes
 from aphrodite.v1.attention.backends.registry import AttentionBackendEnum
 
-from .interface import CpuArchEnum, Platform, PlatformEnum
+from .interface import (
+    CpuArchEnum,
+    Platform,
+    PlatformEnum,
+    log_extension_import_failure,
+)
 
 logger = init_logger(__name__)
 
@@ -371,24 +376,32 @@ def import_kernels(cls) -> None:
                     try:
                         import aphrodite._C  # noqa: F401
                     except ImportError as e:
-                        logger.warning("Failed to import from aphrodite._C: %r", e)
+                        log_extension_import_failure(
+                            "aphrodite._C", e, target_logger=logger
+                        )
                 else:
                     try:
                         import aphrodite._C_AVX512  # noqa: F401
                     except ImportError as e:
                         if ignored_msg not in e.msg:
-                            logger.warning("Failed to import from aphrodite._C_AVX512: %r", e)
+                            log_extension_import_failure(
+                                "aphrodite._C_AVX512", e, target_logger=logger
+                            )
             else:
                 try:
                     import aphrodite._C_AVX2  # noqa: F401
                 except ImportError as e:
                     if ignored_msg not in e.msg:
-                        logger.warning("Failed to import from aphrodite._C_AVX2: %r", e)
+                        log_extension_import_failure(
+                            "aphrodite._C_AVX2", e, target_logger=logger
+                        )
         else:
             try:
                 import aphrodite._C  # noqa: F401
             except ImportError as e:
-                logger.warning("Failed to import from aphrodite._C: %r", e)
+                log_extension_import_failure(
+                    "aphrodite._C", e, target_logger=logger
+                )
 
     @classmethod
     def pack_kv_cache(
diff --git a/aphrodite/platforms/interface.py b/aphrodite/platforms/interface.py
index b6f673cb2b..8af685390c 100644
--- a/aphrodite/platforms/interface.py
+++ b/aphrodite/platforms/interface.py
@@ -29,6 +29,28 @@
 
 logger = init_logger(__name__)
 
+_EXTENSION_IMPORT_FAILURES_WARNED: set[str] = set()
+
+
+def log_extension_import_failure(
+    module_name: str,
+    exc: ImportError,
+    *,
+    target_logger: Any = logger,
+) -> None:
+    """Report a failed extension import, warning once per module name.
+
+    Startup probes may attempt the same optional extension more than once.
+    The first failure for a ``module_name`` is logged at warning level; later
+    failures for that name go to debug so server logs are not flooded.
+    """
+    first_failure = module_name not in _EXTENSION_IMPORT_FAILURES_WARNED
+    if first_failure:
+        # Mark the name as reported before emitting the warning itself.
+        _EXTENSION_IMPORT_FAILURES_WARNED.add(module_name)
+    emit = target_logger.warning if first_failure else target_logger.debug
+    emit("Failed to import from %s: %r", module_name, exc)
+
 
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
@@ -241,7 +263,7 @@ def import_kernels(cls) -> None:
         try:
             import aphrodite._C  # noqa: F401
         except ImportError as e:
-            logger.warning("Failed to import from aphrodite._C: %r", e)
+            log_extension_import_failure("aphrodite._C", e)
         with contextlib.suppress(ImportError):
             import aphrodite._moe_C  # noqa: F401
 
diff --git a/aphrodite/platforms/rocm.py b/aphrodite/platforms/rocm.py
index 1b6eb43dcb..abd07f16d2 100644
--- a/aphrodite/platforms/rocm.py
+++ b/aphrodite/platforms/rocm.py
@@ -15,7 +15,12 @@
 from aphrodite.logger import init_logger
 from aphrodite.v1.attention.backends.registry import AttentionBackendEnum
 
-from .interface import DeviceCapability, Platform, PlatformEnum
+from .interface import (
+    DeviceCapability,
+    Platform,
+    PlatformEnum,
+    log_extension_import_failure,
+)
 
 if TYPE_CHECKING:
     from aphrodite.config import AphroditeConfig
@@ -36,18 +41,18 @@
         amdsmi_topo_get_numa_node_number,
     )
 except ImportError as e:
-    logger.warning("Failed to import from amdsmi with %r", e)
+    logger.warning_once("Failed to import from amdsmi with %r", e)
 
 try:
     import aphrodite._C  # noqa: F401
 except ImportError as e:
-    logger.warning("Failed to import from aphrodite._C with %r", e)
+    log_extension_import_failure("aphrodite._C", e, target_logger=logger)
 
 # import custom ops, trigger op registration
 try:
     import aphrodite._rocm_C  # noqa: F401
 except ImportError as e:
-    logger.warning("Failed to import from aphrodite._rocm_C with %r", e)
+    log_extension_import_failure("aphrodite._rocm_C", e, target_logger=logger)
 
 # Models not supported by ROCm.
 _ROCM_UNSUPPORTED_MODELS: list[str] = []