Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,10 @@ def _import_versioned_module():
StreamOptions,
)
from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions
from cuda.core.graph import (
Graph,
GraphAllocOptions,
GraphBuilder,
GraphCompleteOptions,
GraphCondition,
GraphDebugPrintOptions,
GraphDefinition,
)

# isort: split
# Must come after the cuda.core._* extension imports above: loading graph
# earlier interacts badly with the merged-wheel __path__ rewrite and leaves
# Graph/GraphBuilder/GraphCompleteOptions/GraphDebugPrintOptions missing from
# cuda.core.graph.
import cuda.core.graph
6 changes: 0 additions & 6 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,3 @@ def _warn_deprecated():
from cuda.core._module import Kernel, ObjectCode
from cuda.core._program import Program, ProgramOptions
from cuda.core._stream import Stream, StreamOptions
from cuda.core.graph import (
Graph,
GraphBuilder,
GraphCompleteOptions,
GraphDebugPrintOptions,
)
46 changes: 5 additions & 41 deletions cuda_core/cuda/core/graph/_graph_definition.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,11 @@ from cuda.core._resource_handles cimport (
)
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

from dataclasses import dataclass

from cuda.core._utils.cuda_utils import driver

from cuda.core.typing import GraphMemoryType

__all__ = ['GraphCondition', 'GraphAllocOptions', 'GraphDefinition']
__all__ = ['GraphCondition', 'GraphDefinition']


cdef class GraphCondition:
Expand Down Expand Up @@ -71,42 +69,6 @@ cdef class GraphCondition:
return <unsigned long long>self._c_handle


@dataclass
class GraphAllocOptions:
"""Options for graph memory allocation nodes.

Attributes
----------
device : int or Device, optional
The device on which to allocate memory. If None (default),
uses the current CUDA context's device.
memory_type : GraphMemoryType | str, optional
Type of memory to allocate. One of:

- ``"device"`` (default): Pinned device memory, optimal for GPU kernels.
- ``"host"``: Pinned host memory, accessible from both host and device.
Useful for graphs containing host callback nodes. Note: may not be
supported on all systems/drivers.
- ``"managed"``: Managed/unified memory that automatically migrates
between host and device. Useful for mixed host/device access patterns.

peer_access : list of int or Device, optional
List of devices that should have read-write access to the
allocated memory. If None (default), only the allocating
device has access.

Notes
-----
- IPC (inter-process communication) is not supported for graph
memory allocation nodes per CUDA documentation.
- The allocation uses the device's default memory pool.
"""

device: int | "Device" | None = None
memory_type: GraphMemoryType = GraphMemoryType.DEVICE
peer_access: list | None = None


cdef class GraphDefinition:
"""A graph definition.

Expand Down Expand Up @@ -147,12 +109,14 @@ cdef class GraphDefinition:
n._h_node = create_graph_node_handle(<cydriver.CUgraphNode>NULL, self._h_graph)
return n

def allocate(self, size_t size, options: GraphAllocOptions | None = None) -> "AllocNode":
def allocate(self, size_t size, *, device: "Device" | int | None = None,
memory_type: GraphMemoryType = GraphMemoryType.DEVICE,
peer_access: list["Device" | int] | None = None) -> "AllocNode":
"""Add an entry-point memory allocation node (no dependencies).

See :meth:`GraphNode.allocate` for full documentation.
"""
return self._entry.allocate(size, options)
return self._entry.allocate(size, device=device, memory_type=memory_type, peer_access=peer_access)

def deallocate(self, dptr) -> "FreeNode":
"""Add an entry-point memory free node (no dependencies).
Expand Down
59 changes: 42 additions & 17 deletions cuda_core/cuda/core/graph/_graph_node.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import weakref

from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy
from cuda.core._utils.cuda_utils import driver
from cuda.core.typing import GraphMemoryType

__all__ = ['GraphNode']

Expand Down Expand Up @@ -218,23 +219,48 @@ cdef class GraphNode:
"""
return GN_join(self, nodes)

def allocate(self, size_t size, options=None) -> AllocNode:
def allocate(self, size_t size, *, device: "Device" | int | None = None,
memory_type: GraphMemoryType = GraphMemoryType.DEVICE,
peer_access: list["Device" | int] | None = None) -> AllocNode:
"""Add a memory allocation node depending on this node.

Parameters
----------
size : int
Number of bytes to allocate.
options : GraphAllocOptions, optional
Allocation options. If None, allocates on the current device.
device : int or Device, optional
The device on which to allocate memory. If None (default),
uses the current CUDA context's device.
memory_type : GraphMemoryType or str, optional
Type of memory to allocate. One of:

- ``GraphMemoryType.DEVICE`` (default): Pinned device memory,
optimal for GPU kernels.
- ``GraphMemoryType.HOST``: Pinned host memory, accessible from
both host and device. Useful for graphs containing host
callback nodes. Note: may not be supported on all
systems/drivers.
- ``GraphMemoryType.MANAGED``: Managed/unified memory that
automatically migrates between host and device. Useful for
mixed host/device access patterns.

peer_access : list of int or Device, optional
List of devices that should have read-write access to the
allocated memory. If None (default), only the allocating
device has access.

Returns
-------
AllocNode
A new AllocNode representing the allocation. Access the allocated
device pointer via the dptr property.

Notes
-----
IPC (inter-process communication) is not supported for graph
memory allocation nodes per CUDA documentation.
"""
return GN_alloc(self, size, options)
return GN_alloc(self, size, device, memory_type, peer_access)

def deallocate(self, dptr: int) -> FreeNode:
"""Add a memory free node depending on this node.
Expand Down Expand Up @@ -658,16 +684,17 @@ cdef inline EmptyNode GN_join(GraphNode self, tuple nodes):
return _registered(EmptyNode._create_impl(create_graph_node_handle(new_node, h_graph)))


cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object device,
object memory_type, object peer_access):
cdef int device_id
cdef cydriver.CUdevice dev

if options is None or options.device is None:
if device is None:
with nogil:
HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
device_id = <int>dev
else:
device_id = getattr(options.device, 'device_id', options.device)
device_id = getattr(device, 'device_id', device)

cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params
cdef cydriver.CUgraphNode new_node = NULL
Expand All @@ -684,8 +711,8 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cdef int peer_id
cdef list peer_ids = []

if options is not None and options.peer_access is not None:
for peer_dev in options.peer_access:
if peer_access is not None:
for peer_dev in peer_access:
peer_id = getattr(peer_dev, 'device_id', peer_dev)
peer_ids.append(peer_id)
access_descs.push_back(cydriver.CUmemAccessDesc_st(
Expand All @@ -696,31 +723,29 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
))

cdef str memory_type = "device"
if options is not None and options.memory_type is not None:
memory_type = str(options.memory_type)
cdef str memory_type_str = "device" if memory_type is None else str(memory_type)

c_memset(&alloc_params, 0, sizeof(alloc_params))
alloc_params.poolProps.handleTypes = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
alloc_params.bytesize = size

if memory_type == "device":
if memory_type_str == "device":
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
alloc_params.poolProps.location.id = device_id
elif memory_type == "host":
elif memory_type_str == "host":
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
alloc_params.poolProps.location.id = 0
elif memory_type == "managed":
elif memory_type_str == "managed":
IF CUDA_CORE_BUILD_MAJOR >= 13:
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
alloc_params.poolProps.location.id = device_id
ELSE:
raise ValueError("memory_type='managed' requires CUDA 13.0 or later")
else:
raise ValueError(f"Invalid memory_type: {memory_type!r}. "
raise ValueError(f"Invalid memory_type: {memory_type_str!r}. "
"Must be 'device', 'host', or 'managed'.")

if access_descs.size() > 0:
Expand All @@ -733,7 +758,7 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):

return _registered(AllocNode._create_with_params(
create_graph_node_handle(new_node, h_graph), alloc_params.dptr, size,
device_id, memory_type, tuple(peer_ids)))
device_id, memory_type_str, tuple(peer_ids)))


cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr):
Expand Down
12 changes: 0 additions & 12 deletions cuda_core/cuda/core/graph/_subclasses.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@ cdef class AllocNode(GraphNode):
The type of memory allocated.
peer_access : tuple of int
Device IDs that have read-write access to this allocation.
options : GraphAllocOptions
A GraphAllocOptions reconstructed from this node's parameters.
"""

@staticmethod
Expand Down Expand Up @@ -253,16 +251,6 @@ cdef class AllocNode(GraphNode):
"""Device IDs with read-write access to this allocation."""
return self._peer_access

@property
def options(self):
"""A GraphAllocOptions reconstructed from this node's parameters."""
from cuda.core.graph._graph_definition import GraphAllocOptions
return GraphAllocOptions(
device=self._device_id,
memory_type=self._memory_type,
peer_access=list(self._peer_access) if self._peer_access else None,
)


cdef class FreeNode(GraphNode):
"""A memory deallocation node.
Expand Down
1 change: 0 additions & 1 deletion cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ launched on a :class:`Stream`.

:template: dataclass.rst

graph.GraphAllocOptions
graph.GraphCompleteOptions
graph.GraphDebugPrintOptions

Expand Down
2 changes: 1 addition & 1 deletion cuda_core/docs/source/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs
- :class:`Device` class for GPU device operations and context management.
- :class:`Buffer` and :class:`MemoryResource` classes for memory allocation and management.
- :class:`Program` for JIT compilation of CUDA kernels.
- :class:`GraphBuilder` for building and executing CUDA graphs.
- :class:`graph.GraphBuilder` for building and executing CUDA graphs.
- :class:`Stream` and :class:`Event` for asynchronous execution and timing.

Example: Compiling and Launching a CUDA kernel
Expand Down
13 changes: 13 additions & 0 deletions cuda_core/docs/source/release/1.0.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ New features
Breaking changes
----------------

- Graph types are no longer re-exported from the top-level ``cuda.core``
namespace; they must be imported from :mod:`cuda.core.graph`. The affected
symbols are :class:`~graph.Graph`, :class:`~graph.GraphBuilder`,
:class:`~graph.GraphCompleteOptions`, :class:`~graph.GraphCondition`,
:class:`~graph.GraphDebugPrintOptions`, and :class:`~graph.GraphDefinition`.
Update ``from cuda.core import GraphBuilder`` to
``from cuda.core.graph import GraphBuilder`` (and similarly for the other
symbols). The same symbols are also no longer forwarded through the
deprecated ``cuda.core.experimental`` namespace.
- Removed the ``GraphAllocOptions`` dataclass and the
  ``AllocNode.options`` property. The dataclass's fields are now
  keyword-only parameters on :meth:`graph.GraphDefinition.allocate` and
  :meth:`graph.GraphNode.allocate`.
- Renamed :class:`~graph.GraphDef` to :class:`~graph.GraphDefinition` for
consistency with the rest of the API, which spells words out (e.g.
``TensorMapDescriptor``, not ``TensorMapDesc``).
Expand Down
2 changes: 1 addition & 1 deletion cuda_core/tests/graph/test_device_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from cuda.core import (
Device,
GraphCompleteOptions,
LaunchConfig,
LegacyPinnedMemoryResource,
Linker,
Expand All @@ -19,6 +18,7 @@
ProgramOptions,
launch,
)
from cuda.core.graph import GraphCompleteOptions


def _get_device_arch():
Expand Down
5 changes: 3 additions & 2 deletions cuda_core/tests/graph/test_graph_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GraphBuilder stream capture tests."""
Expand All @@ -8,7 +8,8 @@
from helpers.graph_kernels import compile_common_kernels
from helpers.marks import requires_module

from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core.graph import GraphBuilder


def test_graph_is_building(init_cuda):
Expand Down
5 changes: 3 additions & 2 deletions cuda_core/tests/graph/test_graph_builder_conditional.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tests for GraphBuilder conditional node capture (if, if-else, switch, while)."""
Expand All @@ -10,7 +10,8 @@
from helpers.graph_kernels import compile_conditional_kernels
from helpers.marks import requires_module

from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core.graph import GraphBuilder


@pytest.mark.parametrize(
Expand Down
Loading
Loading