Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,10 @@ def _import_versioned_module():
StreamOptions,
)
from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions
from cuda.core.graph import (
Graph,
GraphAllocOptions,
GraphBuilder,
GraphCompleteOptions,
GraphCondition,
GraphDebugPrintOptions,
GraphDefinition,
)

# isort: split
# Must come after the cuda.core._* extension imports above: loading graph
# earlier interacts badly with the merged-wheel __path__ rewrite and leaves
# Graph/GraphBuilder/GraphCompleteOptions/GraphDebugPrintOptions missing from
# cuda.core.graph.
import cuda.core.graph
6 changes: 0 additions & 6 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,3 @@ def _warn_deprecated():
from cuda.core._module import Kernel, ObjectCode
from cuda.core._program import Program, ProgramOptions
from cuda.core._stream import Stream, StreamOptions
from cuda.core.graph import (
Graph,
GraphBuilder,
GraphCompleteOptions,
GraphDebugPrintOptions,
)
46 changes: 5 additions & 41 deletions cuda_core/cuda/core/graph/_graph_definition.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,11 @@ from cuda.core._resource_handles cimport (
)
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

from dataclasses import dataclass

from cuda.core._utils.cuda_utils import driver

from cuda.core.typing import GraphMemoryType

__all__ = ['GraphCondition', 'GraphAllocOptions', 'GraphDefinition']
__all__ = ['GraphCondition', 'GraphDefinition']


cdef class GraphCondition:
Expand Down Expand Up @@ -71,42 +69,6 @@ cdef class GraphCondition:
return <unsigned long long>self._c_handle


@dataclass
class GraphAllocOptions:
"""Options for graph memory allocation nodes.

Attributes
----------
device : int or Device, optional
The device on which to allocate memory. If None (default),
uses the current CUDA context's device.
memory_type : GraphMemoryType | str, optional
Type of memory to allocate. One of:

- ``"device"`` (default): Pinned device memory, optimal for GPU kernels.
- ``"host"``: Pinned host memory, accessible from both host and device.
Useful for graphs containing host callback nodes. Note: may not be
supported on all systems/drivers.
- ``"managed"``: Managed/unified memory that automatically migrates
between host and device. Useful for mixed host/device access patterns.

peer_access : list of int or Device, optional
List of devices that should have read-write access to the
allocated memory. If None (default), only the allocating
device has access.

Notes
-----
- IPC (inter-process communication) is not supported for graph
memory allocation nodes per CUDA documentation.
- The allocation uses the device's default memory pool.
"""

device: int | "Device" | None = None
memory_type: GraphMemoryType = GraphMemoryType.DEVICE
peer_access: list | None = None


cdef class GraphDefinition:
"""A graph definition.

Expand Down Expand Up @@ -147,12 +109,14 @@ cdef class GraphDefinition:
n._h_node = create_graph_node_handle(<cydriver.CUgraphNode>NULL, self._h_graph)
return n

def allocate(self, size_t size, options: GraphAllocOptions | None = None) -> "AllocNode":
def allocate(self, size_t size, *, device: "Device" | int | None = None,
memory_type: GraphMemoryType = GraphMemoryType.DEVICE,
peer_access: list["Device" | int] | None = None) -> "AllocNode":
"""Add an entry-point memory allocation node (no dependencies).

See :meth:`GraphNode.allocate` for full documentation.
"""
return self._entry.allocate(size, options)
return self._entry.allocate(size, device=device, memory_type=memory_type, peer_access=peer_access)

def deallocate(self, dptr) -> "FreeNode":
"""Add an entry-point memory free node (no dependencies).
Expand Down
59 changes: 42 additions & 17 deletions cuda_core/cuda/core/graph/_graph_node.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import weakref

from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy
from cuda.core._utils.cuda_utils import driver
from cuda.core.typing import GraphMemoryType

__all__ = ['GraphNode']

Expand Down Expand Up @@ -218,23 +219,48 @@ cdef class GraphNode:
"""
return GN_join(self, nodes)

def allocate(self, size_t size, options=None) -> AllocNode:
def allocate(self, size_t size, *, device: "Device" | int | None = None,
memory_type: GraphMemoryType = GraphMemoryType.DEVICE,
peer_access: list["Device" | int] | None = None) -> AllocNode:
"""Add a memory allocation node depending on this node.

Parameters
----------
size : int
Number of bytes to allocate.
options : GraphAllocOptions, optional
Allocation options. If None, allocates on the current device.
device : int or Device, optional
The device on which to allocate memory. If None (default),
uses the current CUDA context's device.
memory_type : GraphMemoryType or str, optional
Type of memory to allocate. One of:

- ``GraphMemoryType.DEVICE`` (default): Pinned device memory,
optimal for GPU kernels.
- ``GraphMemoryType.HOST``: Pinned host memory, accessible from
both host and device. Useful for graphs containing host
callback nodes. Note: may not be supported on all
systems/drivers.
- ``GraphMemoryType.MANAGED``: Managed/unified memory that
automatically migrates between host and device. Useful for
mixed host/device access patterns.

peer_access : list of int or Device, optional
List of devices that should have read-write access to the
allocated memory. If None (default), only the allocating
device has access.

Returns
-------
AllocNode
A new AllocNode representing the allocation. Access the allocated
device pointer via the dptr property.

Notes
-----
IPC (inter-process communication) is not supported for graph
memory allocation nodes per CUDA documentation.
"""
return GN_alloc(self, size, options)
return GN_alloc(self, size, device, memory_type, peer_access)

def deallocate(self, dptr: int) -> FreeNode:
"""Add a memory free node depending on this node.
Expand Down Expand Up @@ -658,16 +684,17 @@ cdef inline EmptyNode GN_join(GraphNode self, tuple nodes):
return _registered(EmptyNode._create_impl(create_graph_node_handle(new_node, h_graph)))


cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object device,
object memory_type, object peer_access):
cdef int device_id
cdef cydriver.CUdevice dev

if options is None or options.device is None:
if device is None:
with nogil:
HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
device_id = <int>dev
else:
device_id = getattr(options.device, 'device_id', options.device)
device_id = getattr(device, 'device_id', device)

cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params
cdef cydriver.CUgraphNode new_node = NULL
Expand All @@ -684,8 +711,8 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cdef int peer_id
cdef list peer_ids = []

if options is not None and options.peer_access is not None:
for peer_dev in options.peer_access:
if peer_access is not None:
for peer_dev in peer_access:
peer_id = getattr(peer_dev, 'device_id', peer_dev)
peer_ids.append(peer_id)
access_descs.push_back(cydriver.CUmemAccessDesc_st(
Expand All @@ -696,31 +723,29 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):
cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
))

cdef str memory_type = "device"
if options is not None and options.memory_type is not None:
memory_type = str(options.memory_type)
cdef str memory_type_str = "device" if memory_type is None else str(memory_type)

c_memset(&alloc_params, 0, sizeof(alloc_params))
alloc_params.poolProps.handleTypes = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
alloc_params.bytesize = size

if memory_type == "device":
if memory_type_str == "device":
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
alloc_params.poolProps.location.id = device_id
elif memory_type == "host":
elif memory_type_str == "host":
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
alloc_params.poolProps.location.id = 0
elif memory_type == "managed":
elif memory_type_str == "managed":
IF CUDA_CORE_BUILD_MAJOR >= 13:
alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
alloc_params.poolProps.location.id = device_id
ELSE:
raise ValueError("memory_type='managed' requires CUDA 13.0 or later")
else:
raise ValueError(f"Invalid memory_type: {memory_type!r}. "
raise ValueError(f"Invalid memory_type: {memory_type_str!r}. "
"Must be 'device', 'host', or 'managed'.")

if access_descs.size() > 0:
Expand All @@ -733,7 +758,7 @@ cdef inline AllocNode GN_alloc(GraphNode self, size_t size, object options):

return _registered(AllocNode._create_with_params(
create_graph_node_handle(new_node, h_graph), alloc_params.dptr, size,
device_id, memory_type, tuple(peer_ids)))
device_id, memory_type_str, tuple(peer_ids)))


cdef inline FreeNode GN_free(GraphNode self, cydriver.CUdeviceptr c_dptr):
Expand Down
12 changes: 0 additions & 12 deletions cuda_core/cuda/core/graph/_subclasses.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@ cdef class AllocNode(GraphNode):
The type of memory allocated.
peer_access : tuple of int
Device IDs that have read-write access to this allocation.
options : GraphAllocOptions
A GraphAllocOptions reconstructed from this node's parameters.
"""

@staticmethod
Expand Down Expand Up @@ -253,16 +251,6 @@ cdef class AllocNode(GraphNode):
"""Device IDs with read-write access to this allocation."""
return self._peer_access

@property
def options(self):
"""A GraphAllocOptions reconstructed from this node's parameters."""
from cuda.core.graph._graph_definition import GraphAllocOptions
return GraphAllocOptions(
device=self._device_id,
memory_type=self._memory_type,
peer_access=list(self._peer_access) if self._peer_access else None,
)


cdef class FreeNode(GraphNode):
"""A memory deallocation node.
Expand Down
1 change: 0 additions & 1 deletion cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ launched on a :class:`Stream`.

:template: dataclass.rst

graph.GraphAllocOptions
graph.GraphCompleteOptions
graph.GraphDebugPrintOptions

Expand Down
2 changes: 1 addition & 1 deletion cuda_core/docs/source/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs
- :class:`Device` class for GPU device operations and context management.
- :class:`Buffer` and :class:`MemoryResource` classes for memory allocation and management.
- :class:`Program` for JIT compilation of CUDA kernels.
- :class:`GraphBuilder` for building and executing CUDA graphs.
- :class:`graph.GraphBuilder` for building and executing CUDA graphs.
- :class:`Stream` and :class:`Event` for asynchronous execution and timing.

Example: Compiling and Launching a CUDA kernel
Expand Down
13 changes: 13 additions & 0 deletions cuda_core/docs/source/release/1.0.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ New features
Breaking changes
----------------

- Graph types are no longer re-exported from the top-level ``cuda.core``
namespace; they must be imported from :mod:`cuda.core.graph`. The affected
symbols are :class:`~graph.Graph`, :class:`~graph.GraphBuilder`,
:class:`~graph.GraphCompleteOptions`, :class:`~graph.GraphCondition`,
:class:`~graph.GraphDebugPrintOptions`, and :class:`~graph.GraphDefinition`.
Update ``from cuda.core import GraphBuilder`` to
``from cuda.core.graph import GraphBuilder`` (and similarly for the other
symbols). The same symbols are also no longer forwarded through the
deprecated ``cuda.core.experimental`` namespace.
- Removed the ``GraphAllocOptions`` dataclass and the
  ``AllocNode.options`` property. The dataclass's fields are now
  keyword-only parameters on :meth:`graph.GraphDefinition.allocate` and
  :meth:`graph.GraphNode.allocate`.
- Renamed :class:`~graph.GraphDef` to :class:`~graph.GraphDefinition` for
consistency with the rest of the API, which spells words out (e.g.
``TensorMapDescriptor``, not ``TensorMapDesc``).
Expand Down
2 changes: 1 addition & 1 deletion cuda_core/tests/graph/test_device_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from cuda.core import (
Device,
GraphCompleteOptions,
LaunchConfig,
LegacyPinnedMemoryResource,
Linker,
Expand All @@ -19,6 +18,7 @@
ProgramOptions,
launch,
)
from cuda.core.graph import GraphCompleteOptions


def _get_device_arch():
Expand Down
5 changes: 3 additions & 2 deletions cuda_core/tests/graph/test_graph_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GraphBuilder stream capture tests."""
Expand All @@ -8,7 +8,8 @@
from helpers.graph_kernels import compile_common_kernels
from helpers.marks import requires_module

from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core.graph import GraphBuilder


def test_graph_is_building(init_cuda):
Expand Down
5 changes: 3 additions & 2 deletions cuda_core/tests/graph/test_graph_builder_conditional.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tests for GraphBuilder conditional node capture (if, if-else, switch, while)."""
Expand All @@ -10,7 +10,8 @@
from helpers.graph_kernels import compile_conditional_kernels
from helpers.marks import requires_module

from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
from cuda.core.graph import GraphBuilder


@pytest.mark.parametrize(
Expand Down
Loading
Loading