Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ inline void set_zeros(infinicore::Tensor &tensor) {
infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false);
}

inline void set_minus_one(infinicore::Tensor &tensor) {
    // Fill every byte of the tensor with 0xFF: for I32 tensors this is -1
    // in two's-complement representation.
    const size_t nbytes = tensor->nbytes();
    std::vector<uint8_t> fill_buf(nbytes, 0xFFu);
    infinicore::context::memcpyH2D(tensor->data(), fill_buf.data(), nbytes, false);
}

} // namespace
namespace infinilm::engine {
PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
Expand Down Expand Up @@ -35,10 +41,9 @@ void PagedCompiler::compile() {
size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
compiled_map_decode_.clear();
block_tables_holder_ = infinicore::Tensor::empty(
{nblocks}, infinicore::DataType::I32, infinicore::context::getDevice());
{nblocks * max_batch_size}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(block_tables_holder_);
for (size_t b : decode_batch_sizes_) {
size_t block_per_req = nblocks / b;
InfinilmModel::Input input;
input.input_ids = infinicore::Tensor::empty({1, b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.position_ids = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
Expand All @@ -56,6 +61,7 @@ void PagedCompiler::compile() {
infinicore::context::memcpyH2D(input.input_offsets.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
input.cu_seqlens = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
infinicore::context::memcpyH2D(input.cu_seqlens.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
const size_t block_per_req = nblocks;
input.block_tables = block_tables_holder_->as_strided({b, block_per_req}, {(ptrdiff_t)block_per_req, 1});
input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.slot_mapping.value());
Expand Down Expand Up @@ -105,6 +111,17 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());
graph_input.input_offsets.value()->copy_from(input.input_offsets.value());
graph_input.cu_seqlens.value()->copy_from(input.cu_seqlens.value());

const size_t compiled_block_per_req = graph_input.block_tables.value()->size(1);
if (block_per_req > compiled_block_per_req) {
// Runtime width exceeds compiled graph slot; fall back to eager path.
return {nullptr, nullptr};
}

// Initialize full padding to -1, then overwrite the narrowed logical region.
// This matches scheduler padding semantics without risking -1 access during graph recording.
auto &graph_block_tables = graph_input.block_tables.value();
set_minus_one(graph_block_tables);
graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());
Comment on lines +121 to 126
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

set_minus_one(graph_block_tables) does a full host allocation + H2D copy of b * compiled_block_per_req int32s on every get_compiled() call. This can become a significant per-step overhead (and scales with the compiled dim1, now nblocks). Prefer a device-side fill (if available), or only fill the padding region [block_per_req:compiled_block_per_req] instead of the entire tensor, or reuse a cached host buffer to avoid repeated allocations.

Copilot uses AI. Check for mistakes.

Expand Down
75 changes: 75 additions & 0 deletions python/infinilm/exception_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
import logging
from typing import Iterator

logger = logging.getLogger(__name__)


def _iter_exception_chain(
e: BaseException, *, max_depth: int = 6
) -> Iterator[BaseException]:
"""Iterate through exception chain with depth limit."""
cur: BaseException | None = e
depth = 0
seen: set[int] = set()
while cur is not None and depth < max_depth:
cur_id = id(cur)
if cur_id in seen:
break
seen.add(cur_id)
yield cur
depth += 1
cur = cur.__cause__ or cur.__context__


def is_oom_exception(e: BaseException) -> bool:
    """
    Conservative OOM detector for MetaX allocator failures and CUDA/PyTorch OOMs.
    Checks exception type (when available) and message substrings across chained exceptions.
    """
    # Type-based check: torch.OutOfMemoryError, only when torch is importable
    # in this environment. Any failure here degrades to the message check.
    try:
        import torch  # type: ignore

        oom_cls = getattr(torch, "OutOfMemoryError", None)
        if oom_cls is not None and any(
            isinstance(ex, oom_cls) for ex in _iter_exception_chain(e)
        ):
            return True
    except Exception:
        pass

    # Message-based check. The allowlist is deliberately small so unrelated
    # errors do not trigger a hard worker exit.
    oom_markers = (
        # MetaX / infinirt allocator
        "hcmalloc",
        "infinirtmalloc",
        "out of memory",
        # CUDA / driver / runtime alloc failures
        "cuda out of memory",
        "cumemalloc",
        "cublas_status_alloc_failed",
        "cudnn_status_alloc_failed",
    )
    return any(
        marker in str(ex).lower()
        for ex in _iter_exception_chain(e)
        for marker in oom_markers
    )


def handle_oom_and_exit(e: BaseException, exit_code: int = 137) -> None:
    """Hard-exit the worker process when ``e`` looks like an OOM condition.

    Non-OOM exceptions are ignored; callers are expected to re-raise them.
    Uses ``os._exit`` so no further Python cleanup runs after an OOM.
    """
    if not is_oom_exception(e):
        return
    logger.error(
        "OOM-like exception: exiting worker with code %d: %r",
        exit_code,
        e,
        exc_info=False,
    )
    os._exit(exit_code)
83 changes: 49 additions & 34 deletions python/infinilm/infer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from infinilm.lib import _infinilm

from .modeling_utils import parse_dtype
from .exception_utils import handle_oom_and_exit


@dataclass
Expand Down Expand Up @@ -46,9 +47,11 @@ def __init__(
cache_config,
enable_graph_compiling,
attention_backend,
parse_dtype(kv_cache_dtype)._underlying
if kv_cache_dtype is not None
else None,
(
parse_dtype(kv_cache_dtype)._underlying
if kv_cache_dtype is not None
else None
),
)
self.use_cache = False

Expand All @@ -72,39 +75,51 @@ def forward(
top_k=None,
top_p=None,
):
# TODO: Remove `_underlying` and simplify the corresponding code.
input_ids = input_ids._underlying if input_ids is not None else None
position_ids = position_ids._underlying if position_ids is not None else None
past_kv_lengths = (
past_kv_lengths._underlying if past_kv_lengths is not None else None
)
total_kv_lengths = (
total_kv_lengths._underlying if past_kv_lengths is not None else None
)
input_offsets = input_offsets._underlying if input_offsets is not None else None
block_tables = block_tables._underlying if block_tables is not None else None
cu_seqlens = cu_seqlens._underlying if cu_seqlens is not None else None
slot_mapping = slot_mapping._underlying if slot_mapping is not None else None

return infinicore.Tensor(
super()
.forward(
super().Input(
input_ids,
position_ids=position_ids,
past_sequence_lengths=past_kv_lengths,
total_sequence_lengths=total_kv_lengths,
input_offsets=input_offsets,
cu_seqlens=cu_seqlens,
block_tables=block_tables,
slot_mapping=slot_mapping,
temperature=temperature,
top_k=top_k,
top_p=top_p,
try:
# TODO: Remove `_underlying` and simplify the corresponding code.
input_ids = input_ids._underlying if input_ids is not None else None
position_ids = (
position_ids._underlying if position_ids is not None else None
)
past_kv_lengths = (
past_kv_lengths._underlying if past_kv_lengths is not None else None
)
total_kv_lengths = (
total_kv_lengths._underlying if total_kv_lengths is not None else None
)
input_offsets = (
input_offsets._underlying if input_offsets is not None else None
)
block_tables = (
block_tables._underlying if block_tables is not None else None
)
cu_seqlens = cu_seqlens._underlying if cu_seqlens is not None else None
slot_mapping = (
slot_mapping._underlying if slot_mapping is not None else None
)

return infinicore.Tensor(
super()
.forward(
super().Input(
input_ids,
position_ids=position_ids,
past_sequence_lengths=past_kv_lengths,
total_sequence_lengths=total_kv_lengths,
input_offsets=input_offsets,
cu_seqlens=cu_seqlens,
block_tables=block_tables,
slot_mapping=slot_mapping,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)
)
.output_ids
)
.output_ids
)
except BaseException as e:
handle_oom_and_exit(e)
raise

def generate(
self,
Expand Down