diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 56027d41fe3..825b29e6caf 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -66,12 +66,10 @@ def _import_versioned_module(): StreamOptions, ) from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions -from cuda.core.graph import ( - Graph, - GraphAllocOptions, - GraphBuilder, - GraphCompleteOptions, - GraphCondition, - GraphDebugPrintOptions, - GraphDefinition, -) + +# isort: split +# Must come after the cuda.core._* extension imports above: loading graph +# earlier interacts badly with the merged-wheel __path__ rewrite and leaves +# Graph/GraphBuilder/GraphCompleteOptions/GraphDebugPrintOptions missing from +# cuda.core.graph. +import cuda.core.graph diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index f65e7852a9a..08c3e33ce18 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -67,9 +67,3 @@ def _warn_deprecated(): from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import Stream, StreamOptions -from cuda.core.graph import ( - Graph, - GraphBuilder, - GraphCompleteOptions, - GraphDebugPrintOptions, -) diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 5e4fa60d055..413a17368d8 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -23,13 +23,11 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from dataclasses import dataclass - from cuda.core._utils.cuda_utils import driver from cuda.core.typing import GraphMemoryType -__all__ = ['GraphCondition', 'GraphAllocOptions', 'GraphDefinition'] +__all__ = ['GraphCondition', 'GraphDefinition'] cdef class 
GraphCondition: @@ -71,42 +69,6 @@ cdef class GraphCondition: return self._c_handle -@dataclass -class GraphAllocOptions: - """Options for graph memory allocation nodes. - - Attributes - ---------- - device : int or Device, optional - The device on which to allocate memory. If None (default), - uses the current CUDA context's device. - memory_type : GraphMemoryType | str, optional - Type of memory to allocate. One of: - - - ``"device"`` (default): Pinned device memory, optimal for GPU kernels. - - ``"host"``: Pinned host memory, accessible from both host and device. - Useful for graphs containing host callback nodes. Note: may not be - supported on all systems/drivers. - - ``"managed"``: Managed/unified memory that automatically migrates - between host and device. Useful for mixed host/device access patterns. - - peer_access : list of int or Device, optional - List of devices that should have read-write access to the - allocated memory. If None (default), only the allocating - device has access. - - Notes - ----- - - IPC (inter-process communication) is not supported for graph - memory allocation nodes per CUDA documentation. - - The allocation uses the device's default memory pool. - """ - - device: int | "Device" | None = None - memory_type: GraphMemoryType = GraphMemoryType.DEVICE - peer_access: list | None = None - - cdef class GraphDefinition: """A graph definition. @@ -147,12 +109,14 @@ cdef class GraphDefinition: n._h_node = create_graph_node_handle(NULL, self._h_graph) return n - def allocate(self, size_t size, options: GraphAllocOptions | None = None) -> "AllocNode": + def allocate(self, size_t size, *, device: "Device" | int | None = None, + memory_type: GraphMemoryType = GraphMemoryType.DEVICE, + peer_access: list["Device" | int] | None = None) -> "AllocNode": """Add an entry-point memory allocation node (no dependencies). See :meth:`GraphNode.allocate` for full documentation. 
""" - return self._entry.allocate(size, options) + return self._entry.allocate(size, device=device, memory_type=memory_type, peer_access=peer_access) def deallocate(self, dptr) -> "FreeNode": """Add an entry-point memory free node (no dependencies). diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index e4e00d5c5f5..a5577d134de 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -63,6 +63,7 @@ import weakref from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy from cuda.core._utils.cuda_utils import driver +from cuda.core.typing import GraphMemoryType __all__ = ['GraphNode'] @@ -218,23 +219,48 @@ cdef class GraphNode: """ return GN_join(self, nodes) - def allocate(self, size_t size, options=None) -> AllocNode: + def allocate(self, size_t size, *, device: "Device" | int | None = None, + memory_type: GraphMemoryType = GraphMemoryType.DEVICE, + peer_access: list["Device" | int] | None = None) -> AllocNode: """Add a memory allocation node depending on this node. Parameters ---------- size : int Number of bytes to allocate. - options : GraphAllocOptions, optional - Allocation options. If None, allocates on the current device. + device : int or Device, optional + The device on which to allocate memory. If None (default), + uses the current CUDA context's device. + memory_type : GraphMemoryType or str, optional + Type of memory to allocate. One of: + + - ``GraphMemoryType.DEVICE`` (default): Pinned device memory, + optimal for GPU kernels. + - ``GraphMemoryType.HOST``: Pinned host memory, accessible from + both host and device. Useful for graphs containing host + callback nodes. Note: may not be supported on all + systems/drivers. + - ``GraphMemoryType.MANAGED``: Managed/unified memory that + automatically migrates between host and device. Useful for + mixed host/device access patterns. 
+ + peer_access : list of int or Device, optional + List of devices that should have read-write access to the + allocated memory. If None (default), only the allocating + device has access. Returns ------- AllocNode A new AllocNode representing the allocation. Access the allocated device pointer via the dptr property. + + Notes + ----- + IPC (inter-process communication) is not supported for graph + memory allocation nodes per CUDA documentation. """ - return GN_alloc(self, size, options) + return GN_alloc(self, size, device, memory_type, peer_access) def deallocate(self, dptr: int) -> FreeNode: """Add a memory free node depending on this node. @@ -658,16 +684,17 @@ cdef inline EmptyNode GN_join(GraphNode self, tuple nodes): return _registered(EmptyNode._create_impl(create_graph_node_handle(new_node, h_graph))) -cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options): +cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object device, + object memory_type, object peer_access): cdef int device_id cdef cydriver.CUdevice dev - if options is None or options.device is None: + if device is None: with nogil: HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) device_id = dev else: - device_id = getattr(options.device, 'device_id', options.device) + device_id = getattr(device, 'device_id', device) cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params cdef cydriver.CUgraphNode new_node = NULL @@ -684,8 +711,8 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options): cdef int peer_id cdef list peer_ids = [] - if options is not None and options.peer_access is not None: - for peer_dev in options.peer_access: + if peer_access is not None: + for peer_dev in peer_access: peer_id = getattr(peer_dev, 'device_id', peer_dev) peer_ids.append(peer_id) access_descs.push_back(cydriver.CUmemAccessDesc_st( @@ -696,23 +723,21 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options): 
cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE )) - cdef str memory_type = "device" - if options is not None and options.memory_type is not None: - memory_type = str(options.memory_type) + cdef str memory_type_str = "device" if memory_type is None else str(memory_type) c_memset(&alloc_params, 0, sizeof(alloc_params)) alloc_params.poolProps.handleTypes = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE alloc_params.bytesize = size - if memory_type == "device": + if memory_type_str == "device": alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE alloc_params.poolProps.location.id = device_id - elif memory_type == "host": + elif memory_type_str == "host": alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST alloc_params.poolProps.location.id = 0 - elif memory_type == "managed": + elif memory_type_str == "managed": IF CUDA_CORE_BUILD_MAJOR >= 13: alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE @@ -720,7 +745,7 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options): ELSE: raise ValueError("memory_type='managed' requires CUDA 13.0 or later") else: - raise ValueError(f"Invalid memory_type: {memory_type!r}. " + raise ValueError(f"Invalid memory_type: {memory_type_str!r}. 
" "Must be 'device', 'host', or 'managed'.") if access_descs.size() > 0: @@ -733,7 +758,7 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options): return _registered(AllocNode._create_with_params( create_graph_node_handle(new_node, h_graph), alloc_params.dptr, size, - device_id, memory_type, tuple(peer_ids))) + device_id, memory_type_str, tuple(peer_ids))) cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr): diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 7c6f3c2b002..6d15ebc3ff7 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -174,8 +174,6 @@ cdef class AllocNode(GraphNode): The type of memory allocated. peer_access : tuple of int Device IDs that have read-write access to this allocation. - options : GraphAllocOptions - A GraphAllocOptions reconstructed from this node's parameters. """ @staticmethod @@ -253,16 +251,6 @@ cdef class AllocNode(GraphNode): """Device IDs with read-write access to this allocation.""" return self._peer_access - @property - def options(self): - """A GraphAllocOptions reconstructed from this node's parameters.""" - from cuda.core.graph._graph_definition import GraphAllocOptions - return GraphAllocOptions( - device=self._device_id, - memory_type=self._memory_type, - peer_access=list(self._peer_access) if self._peer_access else None, - ) - cdef class FreeNode(GraphNode): """A memory deallocation node. diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 238bf69568e..6c0019279cf 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -102,7 +102,6 @@ launched on a :class:`Stream`. 
:template: dataclass.rst - graph.GraphAllocOptions graph.GraphCompleteOptions graph.GraphDebugPrintOptions diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 7ded390b65c..ebe97df8347 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -25,7 +25,7 @@ Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs - :class:`Device` class for GPU device operations and context management. - :class:`Buffer` and :class:`MemoryResource` classes for memory allocation and management. - :class:`Program` for JIT compilation of CUDA kernels. -- :class:`GraphBuilder` for building and executing CUDA graphs. +- :class:`graph.GraphBuilder` for building and executing CUDA graphs. - :class:`Stream` and :class:`Event` for asynchronous execution and timing. Example: Compiling and Launching a CUDA kernel diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 95fdc4aff5c..1a9a67c8614 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -25,6 +25,19 @@ New features Breaking changes ---------------- +- Graph types are no longer re-exported from the top-level ``cuda.core`` + namespace; they must be imported from :mod:`cuda.core.graph`. The affected + symbols are :class:`~graph.Graph`, :class:`~graph.GraphBuilder`, + :class:`~graph.GraphCompleteOptions`, :class:`~graph.GraphCondition`, + :class:`~graph.GraphDebugPrintOptions`, and :class:`~graph.GraphDefinition`. + Update ``from cuda.core import GraphBuilder`` to + ``from cuda.core.graph import GraphBuilder`` (and similarly for the other + symbols). The same symbols are also no longer forwarded through the + deprecated ``cuda.core.experimental`` namespace. +- Removed the ``GraphAllocOptions`` dataclass and the + ``AllocNode.options`` property. 
The ``GraphAllocOptions`` fields (``device``, ``memory_type``, ``peer_access``) are now keyword-only + parameters on :meth:`graph.GraphDefinition.allocate` and + :meth:`graph.GraphNode.allocate`. - Renamed :class:`~graph.GraphDef` to :class:`~graph.GraphDefinition` for consistency with the rest of the API, which spells words out (e.g. ``TensorMapDescriptor``, not ``TensorMapDesc``). diff --git a/cuda_core/tests/graph/test_device_launch.py b/cuda_core/tests/graph/test_device_launch.py index cb143a17328..0e9367077da 100644 --- a/cuda_core/tests/graph/test_device_launch.py +++ b/cuda_core/tests/graph/test_device_launch.py @@ -9,7 +9,6 @@ from cuda.core import ( Device, - GraphCompleteOptions, LaunchConfig, LegacyPinnedMemoryResource, Linker, @@ -19,6 +18,7 @@ ProgramOptions, launch, ) +from cuda.core.graph import GraphCompleteOptions def _get_device_arch(): diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index c0299df5661..e0e3fd9a51c 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 """GraphBuilder stream capture tests.""" @@ -8,7 +8,8 @@ from helpers.graph_kernels import compile_common_kernels from helpers.marks import requires_module -from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch +from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch +from cuda.core.graph import GraphBuilder def test_graph_is_building(init_cuda): diff --git a/cuda_core/tests/graph/test_graph_builder_conditional.py b/cuda_core/tests/graph/test_graph_builder_conditional.py index de65848c1a0..69956cf0f21 100644 --- a/cuda_core/tests/graph/test_graph_builder_conditional.py +++ b/cuda_core/tests/graph/test_graph_builder_conditional.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """Tests for GraphBuilder conditional node capture (if, if-else, switch, while).""" @@ -10,7 +10,8 @@ from helpers.graph_kernels import compile_conditional_kernels from helpers.marks import requires_module -from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch +from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch +from cuda.core.graph import GraphBuilder @pytest.mark.parametrize( diff --git a/cuda_core/tests/graph/test_graph_definition.py b/cuda_core/tests/graph/test_graph_definition.py index 82791223d17..7f70c74aa34 100644 --- a/cuda_core/tests/graph/test_graph_definition.py +++ b/cuda_core/tests/graph/test_graph_definition.py @@ -19,7 +19,6 @@ EventRecordNode, EventWaitNode, FreeNode, - GraphAllocOptions, GraphCompleteOptions, GraphDebugPrintOptions, GraphDefinition, @@ -269,23 +268,20 @@ def _build_alloc_node(g): "device_id": device_id, "memory_type": "device", "peer_access": (), - "options": 
GraphAllocOptions(device=device_id, memory_type="device"), } def _build_alloc_managed_node(g): _skip_if_no_managed_mempool() device_id = Device().device_id - options = GraphAllocOptions(memory_type=GraphMemoryType.MANAGED) entry = g.allocate(ALLOC_SIZE) - node = entry.allocate(ALLOC_SIZE, options) + node = entry.allocate(ALLOC_SIZE, memory_type=GraphMemoryType.MANAGED) return node, { "dptr": lambda v: v != 0, "bytesize": ALLOC_SIZE, "device_id": device_id, "memory_type": "managed", "peer_access": (), - "options": GraphAllocOptions(device=device_id, memory_type="managed"), } @@ -831,9 +827,8 @@ def test_alloc_free_chain(sample_graphdef): def test_alloc_memory_type_invalid(sample_graphdef): """Invalid memory type raises ValueError.""" - options = GraphAllocOptions(memory_type="invalid") with pytest.raises(ValueError, match="Invalid memory_type"): - sample_graphdef.allocate(ALLOC_SIZE, options) + sample_graphdef.allocate(ALLOC_SIZE, memory_type="invalid") @pytest.mark.parametrize( @@ -847,8 +842,7 @@ def test_alloc_device_option(sample_graphdef, device_spec): """Device can be specified as int or Device object.""" _skip_if_no_mempool() device = Device() - options = GraphAllocOptions(device=device_spec(device)) - node = sample_graphdef.allocate(ALLOC_SIZE, options) + node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device)) assert node.dptr != 0 @@ -856,8 +850,7 @@ def test_alloc_peer_access(mempool_device_x2): """AllocNode.peer_access reflects requested peers.""" d0, d1 = mempool_device_x2 g = GraphDefinition() - options = GraphAllocOptions(device=d0.device_id, peer_access=[d1.device_id]) - node = g.allocate(ALLOC_SIZE, options) + node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id]) assert d1.device_id in node.peer_access diff --git a/cuda_core/tests/graph/test_graph_memory_resource.py b/cuda_core/tests/graph/test_graph_memory_resource.py index 7f71fc95852..cdf694e3230 100644 --- 
a/cuda_core/tests/graph/test_graph_memory_resource.py +++ b/cuda_core/tests/graph/test_graph_memory_resource.py @@ -11,7 +11,6 @@ from cuda.core import ( Device, DeviceMemoryResource, - GraphCompleteOptions, GraphMemoryResource, LaunchConfig, Program, @@ -19,6 +18,7 @@ launch, ) from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.graph import GraphCompleteOptions def _common_kernels_alloc(): diff --git a/cuda_core/tests/graph/test_options.py b/cuda_core/tests/graph/test_options.py index 2002c1b7006..b6034eb6799 100644 --- a/cuda_core/tests/graph/test_options.py +++ b/cuda_core/tests/graph/test_options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """Graph options and build mode tests.""" @@ -6,7 +6,8 @@ import pytest from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels -from cuda.core import Device, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, LaunchConfig, launch +from cuda.core import Device, LaunchConfig, launch +from cuda.core.graph import GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions def test_graph_dot_print_options(init_cuda, tmp_path): diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index c3215b056ac..98af4a9557a 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 @@ -67,8 +67,6 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Program is cuda.core.Program assert cuda.core.experimental.Kernel is cuda.core.Kernel assert cuda.core.experimental.ObjectCode is cuda.core.ObjectCode - assert cuda.core.experimental.Graph is cuda.core.Graph - assert cuda.core.experimental.GraphBuilder is cuda.core.GraphBuilder assert cuda.core.experimental.Event is cuda.core.Event assert cuda.core.experimental.Linker is cuda.core.Linker @@ -95,8 +93,6 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "LaunchConfig") assert hasattr(cuda.core.experimental, "ProgramOptions") assert hasattr(cuda.core.experimental, "LinkerOptions") - assert hasattr(cuda.core.experimental, "GraphCompleteOptions") - assert hasattr(cuda.core.experimental, "GraphDebugPrintOptions") assert hasattr(cuda.core.experimental, "DeviceMemoryResourceOptions") assert hasattr(cuda.core.experimental, "VirtualMemoryResourceOptions")