Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions aphrodite/metal/compat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Compatibility patches for vLLM + transformers version mismatches.
"""Compatibility patches for Aphrodite + transformers version mismatches.

Applied once at platform registration time. Optional missing dependencies are
logged; unexpected runtime errors are allowed to surface so regressions remain
Expand Down Expand Up @@ -146,8 +146,7 @@ def _stack_qwen36_moe_per_expert_weights(
and ``...mlp.experts.down_proj``, both stacked along axis 0 over experts.

Mirrors the (scan -> validate -> walk) structure of upstream
ml-explore/mlx-lm#1224. Removable once vllm-metal's mlx-lm pin bumps
past that merge.
ml-explore/mlx-lm#1224.

No-op when no per-expert keys are present (dense Qwen3.5/3.6 or already-
stacked MoE checkpoints).
Expand Down
4 changes: 2 additions & 2 deletions aphrodite/metal/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Configuration for vLLM Metal plugin via environment variables."""
"""Configuration for Aphrodite Metal plugin via environment variables."""

import os
from dataclasses import dataclass
Expand Down Expand Up @@ -37,7 +37,7 @@

@dataclass
class MetalConfig:
"""Configuration for vLLM Metal plugin."""
"""Configuration for Aphrodite Metal plugin."""

memory_fraction: float # -1.0 means "auto" (calculate minimal needed)
use_mlx: bool
Expand Down Expand Up @@ -66,8 +66,8 @@
"APHRODITE_METAL_KV_SHARING_FAST_PREFILL=0."
)

if self.use_paged_attention and not self.is_auto_memory:
if not (0 < self.memory_fraction <= 1):

Check failure on line 70 in aphrodite/metal/config.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (SIM102)

aphrodite/metal/config.py:69:9: SIM102 Use a single `if` statement instead of nested `if` statements
raise ValueError(
f"Invalid APHRODITE_METAL_MEMORY_FRACTION={self.memory_fraction}. "
"Must be a finite value in (0, 1] when paged attention is enabled."
Expand Down
8 changes: 4 additions & 4 deletions aphrodite/metal/envs.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
"""Environment variable definitions for the vLLM Metal plugin.
"""Environment variable definitions for the Aphrodite Metal plugin.

This module is the single source of truth for all ``APHRODITE_METAL_*`` (and
``APHRODITE_MLX_*``) environment variables. It mirrors the lazy-evaluation
pattern used by ``vllm/envs.py``: each variable is read from
pattern used by ``aphrodite/envs.py``: each variable is read from
``os.environ`` on access via ``__getattr__``, so values are never stale
and ``monkeypatch.setenv`` works in tests without extra resets.

During plugin registration (``aphrodite.metal._register``), the
``environment_variables`` dict is merged into
``aphrodite.envs.environment_variables`` so that ``validate_environ()``
recognises our variables and does not emit spurious "Unknown vLLM
recognises our variables and does not emit spurious "Unknown Aphrodite
environment variable" warnings.
"""

Expand Down Expand Up @@ -84,5 +84,5 @@ def __getattr__(name: str) -> Any:


def __dir__() -> list[str]:
# Mirrors vllm/envs.py; enables tab-completion and introspection.
# Mirrors aphrodite/envs.py; enables tab-completion and introspection.
return list(environment_variables.keys())
2 changes: 1 addition & 1 deletion aphrodite/metal/metal_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class MetalBackend(AttentionBackend):
block_size, and the hybrid-block-size math via
Platform._align_hybrid_block_size) can read Metal's MultipleOf(16)
alignment constraint. The Metal paged-attention kernels are tuned for
block_size=16; advertising MultipleOf(16) makes vLLM's selector default
block_size=16; advertising MultipleOf(16) makes Aphrodite's selector default
to 16 and lets hybrid models align to multiples of 16. It is never
dispatched to as a real attention backend — the actual Metal paged
attention lives in metal_kernel_backend/paged_attention.py. The
Expand Down
8 changes: 4 additions & 4 deletions aphrodite/metal/metal_kernel_backend/attention_sdpa.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def _build_block_tables(
"""Build kernel-compatible block tables, translating if necessary.

When ``cache_block_size`` exceeds the kernel's compiled block sizes,
each vLLM block ``b`` is expanded into ``ratio`` kernel blocks
each Aphrodite block ``b`` is expanded into ``ratio`` kernel blocks
``[b*ratio, b*ratio+ratio)``. The cache is reshaped later to
match (zero-copy).

Expand All @@ -136,7 +136,7 @@ def _build_block_tables(
return result

# Hybrid path — translate large block_size to a kernel-compatible one.
# Vectorized: each vLLM block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
# Vectorized: each Aphrodite block b → [b*ratio, b*ratio+1, …, b*ratio+ratio-1].
kernel_bs = _pick_kernel_block_size(cache_block_size)
ratio = cache_block_size // kernel_bs

Expand Down Expand Up @@ -447,10 +447,10 @@ def sdpa_forward(
max_seq_len = ctx.max_context_len or max(ctx.context_lens)

# --- Block tables (with hybrid block-size translation) ---
# vLLM may inflate block_size (e.g. 544) to align attention pages with
# Aphrodite may inflate block_size (e.g. 544) to align attention pages with
# mamba pages in hybrid models. The Metal kernel only supports small
# block sizes (8, 16, 32). _build_block_tables handles the translation:
# it expands each vLLM block into multiple kernel blocks and returns the
# it expands each Aphrodite block into multiple kernel blocks and returns the
# kernel-compatible block_size. The cache is reshaped to match (zero-copy).
block_tables, kernel_block_size = _build_block_tables(ctx, kv_cache.block_size)

Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/multimodal/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Generic multimodal helpers for vLLM Metal."""
"""Generic multimodal helpers for Aphrodite Metal."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/multimodal/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def merge_multimodal_embeddings(
) -> mx.array:
"""Splice multimodal embeddings into placeholder positions.

Mirrors ``vllm/model_executor/models/utils.py``
Mirrors ``aphrodite/model_executor/models/utils.py``
``_merge_multimodal_embeddings`` for MLX arrays. Returns a new array;
``inputs_embeds`` is not mutated.
"""
Expand Down
6 changes: 3 additions & 3 deletions aphrodite/metal/multimodal/qwen3_vl/adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Qwen3-VL multimodal adapter for vLLM Metal."""
"""Qwen3-VL multimodal adapter for Aphrodite Metal."""

from __future__ import annotations

Expand Down Expand Up @@ -30,10 +30,10 @@ def get_mrope_input_positions(
) -> tuple[mx.array, int]:
"""Return ``((3, seq_len) int32 positions, mrope_position_delta)``.

Calls upstream vLLM's mm_features-driven Qwen3-VL M-RoPE helper with a
Calls upstream Aphrodite's mm_features-driven Qwen3-VL M-RoPE helper with a
minimal image-only config shim, then converts the returned torch tensor
to an MLX array. This keeps the position-building policy upstream-owned
while the vllm-metal runner can consume MLX arrays.
while the Aphrodite Metal runner can consume MLX arrays.
"""
if not input_tokens:
return mx.zeros((3, 0), dtype=mx.int32), 0
Expand Down
37 changes: 17 additions & 20 deletions aphrodite/metal/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,24 +220,21 @@

@classmethod
def check_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
"""Check and update vLLM configuration for Metal compatibility.
"""Check and update Aphrodite configuration for Metal compatibility.

Args:
aphrodite_config: vLLM configuration object
aphrodite_config: Aphrodite configuration object
"""
config = get_config()
parallel_config = aphrodite_config.parallel_config
model_config = aphrodite_config.model_config
compilation_config = aphrodite_config.compilation_config

# Metal execution is MLX-backed. Torch Inductor/CUDAGraph settings do
# not apply to the actual model path, so normalize them here rather
# than requiring users to pass --enforce-eager.
# not apply to the actual model path, so disable those compilation
# surfaces without overriding the user's eager-mode flag here.
from aphrodite.config.compilation import CompilationMode, CUDAGraphMode

if model_config is not None and not model_config.enforce_eager:
logger.info("Metal: forcing eager mode; torch.compile/CUDAGraphs are not used on MLX.")
model_config.enforce_eager = True
compilation_config.mode = CompilationMode.NONE
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
compilation_config.max_cudagraph_capture_size = 0
Expand All @@ -253,8 +250,8 @@
config.v_quant = add.get("v_quant", "q3_0")
config._validate_turboquant()
logger.info(
f"TurboQuant enabled via --additional-config: "
f"k_quant={config.k_quant}, v_quant={config.v_quant}"

Check failure on line 254 in aphrodite/metal/platform.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/platform.py:253:17: G004 Logging statement uses f-string
)

scheduler_config = aphrodite_config.scheduler_config
Expand All @@ -272,7 +269,7 @@
)

if config.debug:
logger.info(f"Metal config: {config}")

Check failure on line 272 in aphrodite/metal/platform.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/platform.py:272:25: G004 Logging statement uses f-string

# Set worker class for Metal
if parallel_config.worker_cls == "auto":
Expand Down Expand Up @@ -356,8 +353,8 @@
total_mem = cls.get_device_total_memory()
available_mem = cls.get_device_available_memory()
logger.info(
f"Metal memory: {total_mem / 1e9:.1f}GB total, "
f"{available_mem / 1e9:.1f}GB available"

Check failure on line 357 in aphrodite/metal/platform.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/platform.py:356:13: G004 Logging statement uses f-string
)

@classmethod
Expand Down Expand Up @@ -385,7 +382,7 @@
def update_block_size_for_backend(cls, aphrodite_config: "AphroditeConfig") -> None:
"""Update block_size for Metal platform.

Delegates to vLLM's base implementation, which reads the Metal kernel
Delegates to Aphrodite's base implementation, which reads the Metal kernel
alignment (MultipleOf(16)) from our :meth:`_find_non_ssm_backend`
override. Adds a one-time warning when paged attention is enabled for
a hybrid model, explaining the cache-block-size translation mechanism
Expand All @@ -403,16 +400,16 @@
# block-size translation mechanism.
#
# Background:
# - vLLM requires block_size=160 (or larger) for hybrid models to satisfy
# - Aphrodite requires block_size=160 (or larger) for hybrid models to satisfy
# page size divisibility validation between SDPA and Mamba layers.
#
# Solution (PR #235):
# - vLLM sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
# Solution:
# - Aphrodite sees a large block_size (e.g., 144 = 16 * 9) for its scheduler
# validation.
# - The Metal kernel uses a translated block_size (16, the kernel sweet
# spot) that it supports.
# - Each vLLM block is split into ratio = cache_block_size / kernel_block_size
# kernel blocks. For example, one vLLM block of 144 tokens becomes 9 kernel
# - Each Aphrodite block is split into ratio = cache_block_size / kernel_block_size
# kernel blocks. For example, one Aphrodite block of 144 tokens becomes 9 kernel
# blocks of 16 tokens each.
# - The KV cache is reshaped (zero-copy) to match: [num_blocks, 144, ...] →
# [num_blocks*9, 16, ...]. The physical memory layout is unchanged.
Expand All @@ -423,17 +420,17 @@
if model_config.is_hybrid and metal_config.use_paged_attention:
logger.warning(
"Hybrid model (e.g., Qwen3.5) with paged attention enabled. "
"Using block-size translation (PR #235) to convert vLLM's large "
"Using block-size translation (PR #235) to convert Aphrodite's large "
"block_size to a Metal kernel-compatible size.\n"
" Mechanism: Each vLLM block is split into multiple kernel blocks.\n"
" Example: vLLM block_size=144 → kernel block_size=16 (ratio=9).\n"
" Mechanism: Each Aphrodite block is split into multiple kernel blocks.\n"
" Example: Aphrodite block_size=144 → kernel block_size=16 (ratio=9).\n"
" The KV cache is reshaped (zero-copy) and block tables are expanded.\n"
" This is a logical transformation — physical memory is unchanged."
)

# Delegate the rest to upstream. With our ``_find_non_ssm_backend``
# returning :class:`MetalBackend` (which advertises ``MultipleOf(16)``),
# vLLM's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
# Aphrodite's Phase 1 picks a kernel-aligned default of 16 for non-hybrid
# models (matching the kernel sweet spot), and Phase 2
# (``_align_hybrid_block_size``) handles hybrid alignment. The kernel
# layer (``_pick_kernel_block_size``) validates the final
Expand All @@ -451,12 +448,12 @@
from aphrodite.v1.attention.backends.registry import AttentionBackendEnum

if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
logger.info(f"Cannot use {selected_backend} backend on Metal/MLX.")

Check failure on line 451 in aphrodite/metal/platform.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/platform.py:451:25: G004 Logging statement uses f-string
if attn_selector_config.use_mla:
# MLA attention is handled by the vllm-metal model runner (MLAPagedAttentionWrapper),
# not by vLLM's attention backend selector. Continue to return CPU_ATTN below.
# MLA attention is handled by the Aphrodite Metal model runner (MLAPagedAttentionWrapper),
# not by Aphrodite's attention backend selector. Continue to return CPU_ATTN below.
logger.info(
"MLA model detected; attention handled by vllm-metal model runner"
"MLA model detected; attention handled by aphrodite metal model runner"
)
if attn_selector_config.use_sparse:
raise NotImplementedError("Sparse Attention is not supported on Metal/MLX.")
Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/profiler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Metal frame-capture profiler for vLLM Metal."""
"""Metal frame-capture profiler for Aphrodite Metal."""

from aphrodite.metal.profiler.wrapper import MetalProfilerWrapper

Expand Down
4 changes: 2 additions & 2 deletions aphrodite/metal/profiler/wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Metal frame-capture wrapper for vLLM's WorkerProfiler abstraction.
"""Metal frame-capture wrapper for Aphrodite's WorkerProfiler abstraction.

Subclasses ``aphrodite.profiler.wrapper.WorkerProfiler`` so that the manual
start/stop surface — ``LLM.start_profile`` / ``LLM.stop_profile``, the
Expand Down Expand Up @@ -35,7 +35,7 @@


class MetalProfilerWrapper(WorkerProfiler):
"""Metal frame-capture flavor of vLLM's WorkerProfiler.
"""Metal frame-capture flavor of Aphrodite's WorkerProfiler.

Trace output: ``<profiler_config.torch_profiler_dir>/<trace_name>.gputrace``
"""
Expand Down
2 changes: 0 additions & 2 deletions aphrodite/metal/pytorch_backend/tensor_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# MPS has a 4GB (2^32 bytes) limit for MPSTemporaryNDArray allocations.
# Metal may allocate multiple temporary buffers internally, so we use a
# conservative threshold of 1GB to avoid hitting the limit.
# See: https://github.com/anthropics/vllm-metal/issues/43
_MPS_SAFE_SIZE_BYTES = 1 << 30 # 1GB

# MLX to PyTorch dtype mapping
Expand Down Expand Up @@ -150,7 +149,6 @@ def mlx_to_torch(
tensor = tensor.to(device)
else:
# Large tensor - keep on CPU to avoid MPS 4GB limit crash
# See: https://github.com/anthropics/vllm-metal/issues/43
logger.debug(
"Tensor too large for MPS (%d bytes > %d limit), keeping on CPU",
_get_tensor_size_bytes(array),
Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/stt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Speech-to-Text support for vLLM Metal."""
"""Speech-to-Text support for Aphrodite Metal."""

from aphrodite.metal.stt.loader import load_model
from aphrodite.metal.stt.protocol import TranscriptionResult, TranscriptionSegment
Expand Down
6 changes: 3 additions & 3 deletions aphrodite/metal/stt/policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

from typing import Protocol

# Nominal memory reported to vLLM scheduler for STT models.
# Nominal memory reported to Aphrodite scheduler for STT models.
# No KV cache is actually allocated; this just passes minimum-memory checks.
STT_SCHED_AVAILABLE_BYTES = 1 << 30 # 1 GiB

# Block size reported to vLLM for STT models (minimal, no real KV cache).
# Block size reported to Aphrodite for STT models (minimal, no real KV cache).
STT_SCHED_BLOCK_BYTES = 1

# Nominal head size for the placeholder KV spec used only to satisfy
# vLLM scheduler initialization for STT models.
# Aphrodite scheduler initialization for STT models.
STT_SCHED_NOMINAL_HEAD_SIZE = 64


Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/stt/qwen3_asr/adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Qwen3-ASR runtime adapter for vLLM STT execution."""
"""Qwen3-ASR runtime adapter for Aphrodite STT execution."""

from __future__ import annotations

Expand Down
10 changes: 5 additions & 5 deletions aphrodite/metal/stt/qwen3_asr/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
"""Qwen3-ASR configuration (MLX-free).

Keep this module free of MLX imports so vLLM compat code can import config and
Keep this module free of MLX imports so Aphrodite compat code can import config and
shape helpers during planning/registration without pulling in the model stack.
"""

Expand All @@ -10,7 +10,7 @@
from dataclasses import dataclass, field

from aphrodite.transformers_utils.configs.qwen3_asr import (
Qwen3ASRConfig as VllmQwen3ASRConfig,
Qwen3ASRConfig as AphroditeQwen3ASRConfig,
)

# Maximum decode tokens for Qwen3-ASR decode loop.
Expand Down Expand Up @@ -82,8 +82,8 @@ class Qwen3ASRConfig:
n_audio_ctx: int = 1500

@classmethod
def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
"""Adapt the upstream vLLM/HF config into the local MLX model config."""
def _from_aphrodite_config(cls, config: AphroditeQwen3ASRConfig) -> Qwen3ASRConfig:
"""Adapt the upstream Aphrodite/HF config into the local MLX model config."""
thinker = config.thinker_config
audio = thinker.audio_config
text = thinker.text_config
Expand Down Expand Up @@ -131,4 +131,4 @@ def _from_aphrodite_config(cls, config: VllmQwen3ASRConfig) -> Qwen3ASRConfig:
@classmethod
def from_dict(cls, d: dict) -> Qwen3ASRConfig:
"""Create config from config.json using the upstream schema owner."""
return cls._from_aphrodite_config(VllmQwen3ASRConfig.from_dict(d))
return cls._from_aphrodite_config(AphroditeQwen3ASRConfig.from_dict(d))
2 changes: 1 addition & 1 deletion aphrodite/metal/stt/qwen3_asr/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ def decode_step(
return self.language_model.forward_embeds(embeds, cache)

def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
"""Create the model-owned runtime adapter used by the vLLM runner."""
"""Create the model-owned runtime adapter used by the Aphrodite runner."""
# Local import: avoid import-time cycles (adapter imports transcriber).
from .adapter import Qwen3ASRRuntimeAdapter

Expand Down
6 changes: 3 additions & 3 deletions aphrodite/metal/stt/runtime.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
"""STT runtime adapter contract used by the vLLM runner.
"""STT runtime adapter contract used by the Aphrodite runner.

The vLLM runner delegates STT execution to model-owned runtime adapters under
The Aphrodite runner delegates STT execution to model-owned runtime adapters under
`stt/<model>/adapter.py` so shared code does not accumulate per-model branches.
"""

Expand All @@ -26,7 +26,7 @@


class STTRuntimeAdapter(ABC):
"""Model-owned bridge between vLLM STT inputs and per-model STT execution.
"""Model-owned bridge between Aphrodite STT inputs and per-model STT execution.

Concrete implementations live under `stt/<model>/adapter.py` and own:
- input_features normalization to the model's expected encoder input shape
Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/stt/whisper/adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Whisper runtime adapter for vLLM STT execution."""
"""Whisper runtime adapter for Aphrodite STT execution."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion aphrodite/metal/stt/whisper/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@
self._alignment_heads = mx.array(np.asarray(all_heads.nonzero()).T)

def create_runtime_adapter(self, model_path: str) -> STTRuntimeAdapter:
"""Create the model-owned runtime adapter used by the vLLM runner."""
"""Create the model-owned runtime adapter used by the Aphrodite runner."""
# Local import: avoid import-time cycles (adapter imports transcriber).
from .adapter import WhisperRuntimeAdapter

Expand Down Expand Up @@ -379,8 +379,8 @@
continue

# Transpose Conv1d weights: HF (out, in, kernel) -> MLX (out, kernel, in)
if "conv1.weight" in k or "conv2.weight" in k:
if v.ndim == 3:

Check failure on line 383 in aphrodite/metal/stt/whisper/model.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (SIM102)

aphrodite/metal/stt/whisper/model.py:382:17: SIM102 Use a single `if` statement instead of nested `if` statements
v = v.transpose(0, 2, 1)

if v.dtype != self.dtype and v.dtype != mx.uint32:
Expand Down
4 changes: 2 additions & 2 deletions aphrodite/metal/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""Metal utility functions for vLLM Metal plugin."""
"""Metal utility functions for Aphrodite Metal plugin."""

import logging
import os
Expand All @@ -24,7 +24,7 @@
Example:

```bash
APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache vllm serve Qwen/Qwen2.5-0.5B
APHRODITE_USE_MODELSCOPE=True APHRODITE_METAL_MODELSCOPE_CACHE=/path/to/cache aphrodite run Qwen/Qwen2.5-0.5B
```
"""
if Path(model_repo_name).exists():
Expand All @@ -38,9 +38,9 @@

model_cache_dir = envs.APHRODITE_METAL_MODELSCOPE_CACHE

logger.info(f"Downloading model {model_repo_name} from ModelScope...")

Check failure on line 41 in aphrodite/metal/utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/utils.py:41:25: G004 Logging statement uses f-string
model_path = snapshot_download(model_repo_name, cache_dir=model_cache_dir)
logger.info(f"Model downloaded to {model_path}")

Check failure on line 43 in aphrodite/metal/utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

aphrodite/metal/utils.py:43:25: G004 Logging statement uses f-string
return str(model_path)
except ImportError:
logger.warning(
Expand Down
Loading
Loading