Add managed-memory advise, prefetch, and discard-prefetch free functions #1775
Status: Open. rparolin wants to merge 75 commits into NVIDIA:main from rparolin:rparolin/managed_mem_advise_prefetch.
Commits (75), all authored by rparolin:
abdec47  wip
c418050  wip
b879fa5  fixing ci compiler errors
04ee3de  skipping tests that aren't supported
9ab3f46  cu12 support
bd75bc3  Merge branch 'main' into rparolin/managed_mem_advise_prefetch
1b1343b  Merge branch 'main' into rparolin/managed_mem_advise_prefetch
a948066  Moving to function from Buffer class methods to free standing functio…
1457599  precommit format
acb4024  iterating on implementation
d10ab07  Simplify managed-memory helpers: remove long-form aliases, cache look…
ae1de36  Merge branch 'main' into rparolin/managed_mem_advise_prefetch
c250c92  fix(test): reset _V2_BINDINGS cache so legacy-signature tests take th…
89329d9  fix(test): require concurrent_managed_access for advise tests that hi…
8a75d1b  fix: validate managed buffer before checking discard_prefetch binding…
9e9b1e0  refactor: extract managed memory ops into dedicated _managed_memory_o…
90f0711  pre-commit fix
b4d252c  Removing blank file
faaa1d8  wip
18786be  Merge branch 'main' into rparolin/managed_mem_advise_prefetch
9766ddc  Merge remote-tracking branch 'upstream/main' into rparolin/managed_me…
cf2f20d  fix(cuda.core): update binding_version import after upstream merge
db3bac2  revert: drop managed_memory shim in cuda.core.experimental
20d036e  feat(cuda.core): add Location dataclass for managed memory
c2dae53  feat(cuda.core): add _coerce_location helper
935c8ba  test(cuda.core): update monkeypatch target after binding_version rename
dc46535  refactor(cuda.core): tighten memory-attr query
818f5d2  feat(cuda.core): unified 1..N managed_memory.prefetch with cydriver
e296e72  feat(cuda.core): add managed_memory.discard
e697131  feat(cuda.core): unified 1..N managed_memory.discard_prefetch with cy…
3bc1021  feat(cuda.core): unified 1..N managed_memory.advise + drop legacy app…
fa23869  refactor(cuda.core): use Buffer.is_managed property in managed_memory…
68bdd14  docs(cuda.core): document Location, discard, and 1..N managed_memory ops
b4d9cbf  chore(cuda.core): drop narrative comments and tighten _coerce_locatio…
ee96758  chore(cuda.core): satisfy pre-commit hooks
d6f60f2  refactor(cuda.core): move managed_memory ops to cuda.core.utils
3176271  chore(cuda.core): use __all__ in utils instead of per-import noqa
782f6a9  chore(cuda.core): collapse nested if in Location.__post_init__ (SIM102)
0789bf6  test(cuda.core): share one DummyUnifiedMemoryResource per batched test
e0c782a  test(cuda.core): query all buffers before closing in test_batched_sam…
10de998  review(cuda.core): address PR #1775 feedback
ab9a3ab  test(cuda.core): split managed-memory ops tests into tests/memory/
a3f342f  test(cuda.core): fix options regex for AdviseOptions ("an" vs "a")
c2a9662  chore(cuda.core): drop unused utils import + trailing blank lines
bede674  feat(cuda.core): add ManagedBuffer subclass + Host location
f59af4e  chore(cuda.core): simplify ManagedBuffer per /simplify review
5147a7d  ci: re-trigger CI (transient cuInit INVALID_DEVICE on l4 runner)
2151e61  refactor(cuda.core): use libcpp.vector for batched-op C arrays (R14)
5c6d054  fix(cuda.core): restore CUDA_ERROR_NOT_INITIALIZED auto-init in _quer…
47d5609  refactor(cuda.core): make Host a plain class instead of a dataclass (R1)
a40bb81  feat(cuda.core)!: drop int location shorthand from managed-memory ops…
c43e81e  docs(cuda.core): add AccessedBySet to api_private.rst (R5)
71e9daa  docs(cuda.core): note the legacy NUMA round-trip limitation on prefer…
df928a0  refactor(cuda.core): use collections.abc.Sequence for input checks (R…
f522916  refactor(cuda.core): narrow Buffer.from_handle to Buffer-only (R3)
6204c57  refactor(cuda.core): single API surface per operation (R9, R10, R11)
36012fd  refactor(cuda.core): build advise reverse-lookup eagerly at module lo…
067fb15  refactor(cuda.core): factor shared body of _do_batch_{prefetch,discar…
a9cd713  test(cuda.core): reuse production _get_int_attr in managed-memory tes…
d75a7bd  feat(cuda.core): cu12 fallback for prefetch_batch (N3)
0af5bd4  test(cuda.core): cover AccessedBySet read methods (N7)
b0d1a21  feat(cuda.core): cu13 NUMA round-trip for ManagedBuffer.preferred_loc…
4c228eb  docs(cuda.core): replace stale utils autosummary entries
5743e05  feat(cuda.core): make Host a singleton class
7126324  refactor(cuda.core): rename AccessedBySet -> AccessedBySetProxy
238cb14  fix(cuda.core): silence ruff lints on Host singleton
d0b6621  fix(cuda.core): reject bool as Host(numa_id=...)
d0f9c7e  fix(cuda.core): hoist managed-buffer check in _advise_one
191f29d  fix(cuda.core): clarify CUDA 12 NUMA-host error message
bcc056b  fix(cuda.core): reject Host(numa_id=...) up-front on CUDA 12
1b66367  fix(cuda.core): make ManagedBuffer.accessed_by setter atomic
5efbe4e  style(cuda.core): apply ruff format
5e2c051  Merge remote-tracking branch 'upstream/main' into rparolin/managed_me…
8c35376  Skip NUMA-aware Host coerce tests on CUDA 12 builds
29235b9  Merge remote-tracking branch 'upstream/main' into rparolin/managed_me…
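Taken together, the commit messages outline the API shape: per-operation free functions (prefetch, advise, discard, discard_prefetch) that accept one buffer or a sequence of buffers (the "unified 1..N" commits) plus a Device or Host target location, moved to cuda.core.utils by commit d6f60f2. Below is a hypothetical usage sketch; the import paths, signatures, and option names are assumptions inferred from the commit titles, not the reviewed code itself:

```python
from cuda.core.experimental import Device

# Assumed import location per commit d6f60f2 ("move managed_memory ops
# to cuda.core.utils"); the exact names are guesses from commit titles.
from cuda.core.utils import Host, managed_memory


def demo(buf, bufs):
    # buf is assumed to be a managed (unified-memory) Buffer and bufs a
    # sequence of them, allocated elsewhere from a managed memory resource.
    dev = Device(0)
    dev.set_current()

    managed_memory.prefetch(buf, dev)          # migrate pages toward the GPU
    managed_memory.prefetch(bufs, Host())      # batched ("1..N"), toward the CPU
    managed_memory.discard(buf)                # drop current contents
    managed_memory.discard_prefetch(buf, dev)  # discard, then prefetch
    # advise takes options (AdviseOptions per commit a3f342f); its exact
    # shape is not shown in this excerpt.
```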
New file from the diff (97 lines, per the @@ -0,0 +1,97 @@ hunk), defining the Host location class:

```python
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import threading
from typing import ClassVar


class Host:
    """Host (CPU) location for managed-memory operations.

    Use one of the three forms:

    * ``Host()`` — generic host (any NUMA node).
    * ``Host(numa_id=N)`` — specific NUMA node ``N``.
    * ``Host.numa_current()`` — NUMA node of the calling thread.

    ``Host`` is the symmetric counterpart of :class:`~cuda.core.Device`
    for managed-memory `prefetch`, `advise`, and `discard_prefetch`
    targets. Pass either a ``Device`` or a ``Host`` to those operations
    and to ``ManagedBuffer.preferred_location`` / ``accessed_by``.

    ``Host`` is a singleton class, mirroring :class:`~cuda.core.Device`:
    constructor calls with the same arguments return the same instance,
    so ``Host() is Host()`` and ``Host(numa_id=1) is Host(numa_id=1)``.
    ``Host.numa_current()`` returns its own singleton, distinct from
    ``Host()`` because it represents a thread-relative location rather
    than a fixed one.
    """

    __slots__ = ("__weakref__", "_is_numa_current", "_numa_id")

    # Singleton cache keyed by (numa_id, is_numa_current).
    _instances: ClassVar[dict[tuple[int | None, bool], Host]] = {}
    _instances_lock: ClassVar[threading.Lock] = threading.Lock()

    def __new__(cls, numa_id: int | None = None) -> Host:
        if numa_id is not None and (isinstance(numa_id, bool) or not isinstance(numa_id, int) or numa_id < 0):
            raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}")
        return cls._get_or_create(numa_id, is_numa_current=False)

    @classmethod
    def _get_or_create(cls, numa_id: int | None, is_numa_current: bool) -> Host:
        key = (numa_id, is_numa_current)
        cache = cls._instances
        inst = cache.get(key)
        if inst is not None:
            return inst
        with cls._instances_lock:
            inst = cache.get(key)
            if inst is None:
                inst = object.__new__(cls)
                object.__setattr__(inst, "_numa_id", numa_id)
                object.__setattr__(inst, "_is_numa_current", is_numa_current)
                cache[key] = inst
            return inst

    @property
    def numa_id(self) -> int | None:
        return self._numa_id

    @property
    def is_numa_current(self) -> bool:
        return self._is_numa_current

    @classmethod
    def numa_current(cls) -> Host:
        """Construct a ``Host`` referring to the calling thread's NUMA node."""
        return cls._get_or_create(None, is_numa_current=True)

    def __setattr__(self, name: str, value) -> None:
        raise AttributeError(f"{type(self).__name__} is immutable; cannot set {name!r}")

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Host):
            return NotImplemented
        return self is other

    def __hash__(self) -> int:
        return hash((Host, self._numa_id, self._is_numa_current))

    def __reduce__(self):
        if self._is_numa_current:
            return (_reconstruct_numa_current, ())
        return (Host, (self._numa_id,))

    def __repr__(self) -> str:
        if self.is_numa_current:
            return "Host.numa_current()"
        if self.numa_id is None:
            return "Host()"
        return f"Host(numa_id={self.numa_id})"


def _reconstruct_numa_current() -> Host:
    return Host.numa_current()
```
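A short sketch of the singleton, validation, and pickling behavior this class guarantees, derived from the code above (the import path is an assumption; the diff does not show where this module lives):

```python
import pickle

# Hypothetical import path; not shown in the rendered diff.
from cuda.core.utils import Host

# Equal constructor arguments return the same cached instance.
assert Host() is Host()
assert Host(numa_id=1) is Host(numa_id=1)
assert Host(numa_id=1) is not Host(numa_id=2)

# numa_current() is its own singleton, distinct from the generic Host().
cur = Host.numa_current()
assert cur is Host.numa_current() and cur is not Host()

# __new__ rejects bool explicitly, since bool subclasses int.
try:
    Host(numa_id=True)
except ValueError:
    pass

# __reduce__ preserves the singleton property across pickling: fixed forms
# rebuild via Host(numa_id), numa_current() via _reconstruct_numa_current.
assert pickle.loads(pickle.dumps(Host(numa_id=1))) is Host(numa_id=1)
assert pickle.loads(pickle.dumps(cur)) is cur

# The __setattr__ override makes instances immutable.
try:
    Host().numa_id = 3
except AttributeError:
    pass
```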
Review comment (on lines +89 to +93): Q: Shouldn't we simply always specify both numa id and whether it is current? Maybe I miss something.
(The remaining changed files in the diff did not render.)
Review comment: nit: we already have __slots__; the language ensures that setting an attribute is not possible.
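For context on this nit: __slots__ restricts which attribute names can exist, while assignment to a declared slot still succeeds unless __setattr__ intervenes. A minimal standalone sketch (plain Python, not from this PR):

```python
class Slotted:
    __slots__ = ("_numa_id",)

s = Slotted()
s._numa_id = 7    # allowed: _numa_id is a declared slot
try:
    s.other = 1   # AttributeError: __slots__ blocks undeclared names only
except AttributeError:
    print("new attribute blocked by __slots__")
```

That distinction is why the explicit __setattr__ override in Host still adds something: it freezes even the declared slots, which _get_or_create populates by bypassing the override via object.__setattr__.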