diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 5c30defada7..634623bd244 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -82,8 +82,46 @@ program = edge_program.to_executorch( ) ``` +> **Note:** Custom pool passes that pre-assign `mem_id` are not yet compatible +> with `enable_non_cpu_memory_planning=True`. When per-device planning is +> enabled, device buffers are appended after the CPU buffers in the global +> `bufsizes` array. If a custom pass has already set `mem_id` values (e.g. +> `mem_id=2` or `mem_id=3`), those slots may collide with the device-buffer +> slots, leading to incorrect memory layout. If both features are enabled +> simultaneously, `apply_algo` will raise a `NotImplementedError`. + Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). +## Device-Aware Memory Planning + +When `enable_non_cpu_memory_planning=True` is set on `ExecutorchBackendConfig`, +the memory planning pass partitions tensor specs by their device type and runs +the planning algorithm independently for each device. This produces separate +memory buffers for each device (e.g. CPU vs. CUDA), ensuring that device memory +and host memory are never mixed. 
+ +```python +program = edge_program.to_executorch( + exir.ExecutorchBackendConfig( + enable_non_cpu_memory_planning=True, + ) + ) +``` + +The resulting `bufsizes` array layout depends on which devices are present: + +| Scenario | bufsizes | Description | +|---|---|---| +| CPU only | `[0, cpu_size]` | Same as legacy behavior | +| CUDA only | `[0, cuda_size]` | Buffer 1 is CUDA, no wasted CPU slot | +| CPU + CUDA | `[0, cpu_size, cuda_size]` | Buffer 1 is CPU, buffer 2 is CUDA | + +**Current limitations:** +- Not compatible with custom pool passes that pre-assign `spec.mem_id` (see note above). +- Submodule buffer sizes (from control-flow submodules like `cond`/`while`/`map`) + are applied only to the CPU partition. This is safe today because on-device + tensors only appear as delegate blob I/O, never inside control-flow submodules. + ## Debugging Tool Please refer to [Memory Planning Inspection](memory-planning-inspection.md) for a tool to inspect the result of memory planning. diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 3fbc8ae7ef3..2d6290bdd0b 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -117,3 +117,9 @@ class ExecutorchBackendConfig: # Experimental: If set to true, we run a pass to reinplace ops in the graph. run_reinplace_pass: bool = False + + # When True, memory planning partitions specs by device and runs the + # algorithm independently per device, producing separate buffers for CPU + # vs. accelerator memory. Default False preserves the legacy behavior + # where all tensors are planned into CPU memory regardless of device. 
+ enable_non_cpu_memory_planning: bool = False diff --git a/exir/memory_planning.py b/exir/memory_planning.py index c5d3441bcde..863425b7022 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -31,7 +31,7 @@ from executorch.exir.delegate import executorch_call_delegate from executorch.exir.error import internal_assert, InternalError from executorch.exir.operator.convert import is_inplace_variant, is_out_variant -from executorch.exir.schema import TensorShapeDynamism +from executorch.exir.schema import DeviceType, NonConstBufferDevice, TensorShapeDynamism from executorch.exir.tensor import TensorSpec from torch import fx from torch.export.exported_program import ( @@ -1211,10 +1211,19 @@ def apply_algo( alloc_graph_input: bool = True, alloc_graph_output: bool = True, alloc_mutable_buffers: bool = True, + enable_non_cpu_memory_planning: bool = False, ) -> list[int]: """ Recursively apply algo to graph_module and its submodules for control flow. + Partitions specs by device type and device idx, and runs the memory planning + algorithm independently per device, then merges results into separate buffers. + This ensures device memory and CPU memory are never mixed. + + When enable_non_cpu_memory_planning is False (default), all specs are planned + into a single CPU memory pool regardless of their device attribute. This + preserves the legacy behavior. Set to True to enable per-device partitioning. + Algo implementation should handle one of two meta entries for submodules: 1. input_mem_buffer_sizes: List of int offset bytes. Memory allocated by `algo` should start at the offset specified by this list; @@ -1229,49 +1238,144 @@ def apply_algo( `operand` arg. The memory for operands is unused. """ # Extract the nodes and their lifespans from the graph_module - # Difficult to just filter the list of specs returned by this due to - # how we flag trainable weights. 
_ = update_all_tensors_lifetime(graph_module, graph_signature) - # Filter specs based on alloc_graph_input and alloc_graph_output - specs = collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - do_assertion=False, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ignore_mutable_buffers=not alloc_mutable_buffers, + # Collect and materialize specs into a set so we can iterate multiple + # times and partition by device. + all_specs: set[TensorSpec] = set( + collect_specs_from_nodes( + graph_module.graph.nodes, + graph_signature, + do_assertion=False, + ignore_graph_input=not alloc_graph_input, + ignore_graph_output=not alloc_graph_output, + ignore_mutable_buffers=not alloc_mutable_buffers, + ) ) # Get temporary specs for submodules to set aside space during execution # of submodules. + # NOTE: submodule_bufsizes are currently applied only to the CPU partition. + # This assumes all control-flow submodule tensors (cond/while/map) live in + # CPU memory. Today this is safe because on-device tensors only appear as + # delegate blob I/O, which never lives inside control-flow submodules. + # If device tensors ever appear in submodules, _apply_algo_to_submodules + # will need per-device partitioning as well. submodule_bufsizes = _apply_algo_to_submodules( algo, graph_module, alignment, graph_signature ) - # Update `input_mem_buffer_sizes` in graph_module. This will allow existing - # algos to work using `input_mem_buffer_sizes` or use - # `non_const_buffer_sizes` directly. - # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`. - graph_module.input_mem_buffer_sizes = submodule_bufsizes - # Get extra padding for XNNPACK if needed extra_padding = 0 if _contains_xnnpack_delegate(graph_module): extra_padding = 64 - # Pass the filtered specs to the algorithm - bufsizes: list[int] = algo( - alignment, - specs, - graph_module, - graph_signature, - extra_padding, + # 1. 
Partition specs by (device_type, device_index). + # Different device indices on the same device type (e.g. CUDA:0 vs CUDA:1) + # get separate memory buffers. + _CPU_KEY: tuple[DeviceType, int] = (DeviceType.CPU, 0) + specs_by_device: dict[tuple[DeviceType, int], set[TensorSpec]] = defaultdict(set) + if enable_non_cpu_memory_planning: + has_non_cpu_specs = False + has_pre_assigned_mem_id = False + for spec in all_specs: + device_key = (spec.device, spec.device_index) + specs_by_device[device_key].add(spec) + if spec.device != DeviceType.CPU: + has_non_cpu_specs = True + if spec.mem_id is not None: + has_pre_assigned_mem_id = True + + # Custom pool passes pre-assign mem_ids (e.g. mem_id=2, 3, …) to place + # tensors into specific memory arenas. Per-device partitioning appends + # device buffers after the CPU buffers, and the remap formula + # global_mem_id = (local_mem_id - 1) + base_mem_id + # assumes the algo-local numbering starts at 1. If a custom pass has + # already set mem_ids > 1 on the CPU side, the device-buffer slots may + # collide with those custom pool slots. + # TODO(gasoonjia): support custom pools + per-device planning by reserving + # device slots after the highest custom pool id. + if has_non_cpu_specs and has_pre_assigned_mem_id: + raise NotImplementedError( + "enable_non_cpu_memory_planning is not yet compatible with " + "custom memory pool passes that pre-assign spec.mem_id. " + "The per-device buffer slots may collide with custom pool " + "mem_ids. Please disable enable_non_cpu_memory_planning or " + "remove the custom mem_id assignments." + ) + else: + # Legacy behavior: all specs planned into CPU memory regardless of device + specs_by_device[_CPU_KEY] = all_specs + + # 2. 
Plan each device independently + global_bufsizes: list[int] = [0] # index 0 reserved for constants + # Track (device_type, device_index) for each buffer slot + buffer_devices: list[tuple[DeviceType, int]] = [_CPU_KEY] + + # Process CPU:0 first (if present), then other devices sorted by + # (type.value, index) so the ordering is deterministic. + device_order = sorted( + specs_by_device.keys(), + key=lambda dk: (dk != _CPU_KEY, dk[0].value, dk[1]), ) - # pyre-ignore[6]: Incompatible parameter type [6] - # In call `insert_calls_to_free`, for 2nd positional argument, expected `Set[TensorSpec]` but got `Iterable[TensorSpec]` - insert_calls_to_free(graph_module, specs) + for device_key in device_order: + device_specs = specs_by_device[device_key] - graph_module.meta.update({"non_const_buffer_sizes": bufsizes}) - return bufsizes + # Only apply submodule pre-allocation for CPU specs; device buffers + # do not share memory space with CPU submodule arenas. + # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`. + graph_module.input_mem_buffer_sizes = ( + submodule_bufsizes if device_key == _CPU_KEY else [] + ) + + # Run algorithm independently on this device's specs + device_bufsizes = algo( + alignment, device_specs, graph_module, graph_signature, extra_padding + ) + + # Calculate base mem_id in global space + base_mem_id = len(global_bufsizes) + + # Append buffer sizes (skip index 0 which is constants placeholder) + global_bufsizes.extend(device_bufsizes[1:]) + + # Track device key for each new buffer slot + for _ in device_bufsizes[1:]: + buffer_devices.append(device_key) + + # Remap spec mem_ids from algo-local to global. + # At this point spec.mem_id has been set by MemoryPlanningAlgorithmSuite: + # the suite runs each algorithm (e.g. greedy), picks the best result, + # and writes the winning mem_id/mem_offset/mem_obj_id back onto each + # spec. 
For specs with no pre-assigned mem_id the algorithm defaults + # to mem_id=1; custom-pool passes may pre-assign other values (e.g. 3). + # We remap from the algo-local numbering (1-based) to the global + # position: global_mem_id = (local_mem_id - 1) + base_mem_id. + for spec in device_specs: + if spec.mem_id is not None: + spec.mem_id = (spec.mem_id - 1) + base_mem_id + + # Ensure backward compatibility: at least [0, 0] when no specs exist + if len(global_bufsizes) < 2: + global_bufsizes.append(0) + buffer_devices.append(_CPU_KEY) + + # 3. Insert free calls and build device buffer mapping + insert_calls_to_free(graph_module, all_specs) + + # Only record non-CPU buffer entries. CPU buffers are the default and + # do not need explicit device metadata in the serialized program. + non_const_buffer_device: Optional[list[NonConstBufferDevice]] = None + has_device_buffers = any(dk[0] != DeviceType.CPU for dk in buffer_devices) + if has_device_buffers: + non_const_buffer_device = [ + NonConstBufferDevice(buffer_idx=i, device_type=dt, device_index=di) + for i, (dt, di) in enumerate(buffer_devices) + if (dt, di) != _CPU_KEY + ] + + graph_module.meta["non_const_buffer_sizes"] = global_bufsizes + if non_const_buffer_device is not None: + graph_module.meta["non_const_buffer_device"] = non_const_buffer_device + return global_bufsizes diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py index f3970f13b56..32c343a4607 100644 --- a/exir/passes/memory_planning_pass.py +++ b/exir/passes/memory_planning_pass.py @@ -153,6 +153,7 @@ def __init__( alloc_mutable_buffers: bool = True, share_mutable_buffers: bool = False, alignment: int = ALIGNMENT, + enable_non_cpu_memory_planning: bool = False, ) -> None: r""" alloc_graph_input/alloc_graph_output will have 4 different combinations @@ -173,6 +174,7 @@ def __init__( self.alloc_mutable_buffers = alloc_mutable_buffers self.share_mutable_buffers = share_mutable_buffers self.alignment = alignment + 
self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning self.state = _MemoryPlanningState() def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None: @@ -250,6 +252,7 @@ def run( # If mutable buffers are shared, then do not allocate them in the # main memory planning algo; they are allocated in run_multimethod. self.alloc_mutable_buffers and not self.share_mutable_buffers, + self.enable_non_cpu_memory_planning, ) if self.share_mutable_buffers and graph_signature is not None: diff --git a/exir/program/_program.py b/exir/program/_program.py index c68d0eed945..8f0b983bd04 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1788,6 +1788,12 @@ def to_executorch( # noqa (FLAKE8) C901 ) else: memory_planning_pass = config.memory_planning_pass + # Propagate enable_non_cpu_memory_planning from the top-level config + # to the pass instance so that device-aware partitioning is applied. + if hasattr(memory_planning_pass, "enable_non_cpu_memory_planning"): + memory_planning_pass.enable_non_cpu_memory_planning = ( + config.enable_non_cpu_memory_planning + ) # TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work if hasattr(memory_planning_pass, "run"): new_gm_res = memory_planning_pass.run(new_gm, new_signature) diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index f364541d900..e0df7a713e6 100644 --- a/exir/tests/test_memory_planning.py +++ b/exir/tests/test_memory_planning.py @@ -29,6 +29,8 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.memory_planning import ( _do_user_inputs_exist, + apply_algo, + collect_specs_from_nodes, filter_nodes, get_node_tensor_specs, greedy, @@ -45,6 +47,7 @@ ToOutVarPass, ) from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass +from executorch.exir.schema import DeviceType from executorch.exir.tensor import TensorSpec from 
functorch.experimental.control_flow import map as torch_map from parameterized import parameterized @@ -1259,3 +1262,231 @@ def reset(self, k_zeros: torch.Tensor, v_zeros: torch.Tensor) -> None: self.assertEqual(v_cache[0].val.allocation_info.memory_id, 2) self.assertEqual(v_cache[0].val.allocation_info.memory_offset_low, 256) self.assertEqual(v_cache[0].val.allocation_info.memory_offset_high, 0) + + +class TestDeviceAwareMemoryPlanning(unittest.TestCase): + """Tests for per-device memory planning (separate buffers per device type).""" + + def _prepare_model( + self, + ) -> Tuple[GraphModule, ExportGraphSignature]: + """Prepare ToyModelForMemPlanning through SpecPropPass + ToOutVarPass.""" + model = ToyModelForMemPlanning() + inputs = model.get_random_inputs() + edge = to_edge(export(model, inputs, strict=True)) + gm = edge.exported_program().graph_module + gs = edge.exported_program().graph_signature + gm = PassManager(passes=[SpecPropPass(), ToOutVarPass()])(gm).graph_module + return gm, gs + + def _get_planned_specs( + self, + gm: GraphModule, + gs: ExportGraphSignature, + ) -> list[TensorSpec]: + """Get the unique set of specs that apply_algo would plan.""" + return list( + collect_specs_from_nodes( + gm.graph.nodes, + gs, + do_assertion=False, + ignore_graph_input=False, + ignore_graph_output=False, + ignore_mutable_buffers=False, + ) + ) + + def test_cpu_only_unchanged(self) -> None: + """CPU-only specs produce bufsizes = [0, X] with no device metadata.""" + gm, gs = self._prepare_model() + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # All activations land in the single CPU buffer; no device metadata + self.assertEqual(bufsizes[0], 0) # constants + self.assertGreater(bufsizes[1], 0) # CPU activations + self.assertNotIn("non_const_buffer_device", gm.meta) + + def test_custom_pool_with_device_planning_raises(self) -> None: + """Pre-assigned mem_ids + enable_non_cpu_memory_planning 
raises.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + + # Pre-assign a custom mem_id AND set a non-CPU device + specs[0].mem_id = 3 + specs[-1].device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + with self.assertRaises(NotImplementedError): + apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + def test_all_cuda_no_wasted_slots(self) -> None: + """CUDA-only specs produce [0, X] with CUDA at buffer index 1.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + for spec in specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # [0, cuda_size] — no wasted CPU buffer slot + self.assertEqual(len(bufsizes), 2) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + # Device mapping should only contain non-CPU entries + self.assertIn("non_const_buffer_device", gm.meta) + device_map = gm.meta["non_const_buffer_device"] + self.assertEqual(len(device_map), 1) + self.assertEqual(device_map[0].buffer_idx, 1) + self.assertEqual(device_map[0].device_type, DeviceType.CUDA) + self.assertEqual(device_map[0].device_index, 0) + + def test_mixed_cpu_cuda_separate_buffers(self) -> None: + """CPU specs at mem_id=1, CUDA specs at mem_id=2, separate sizes.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + + # Set second half of specs to CUDA + mid = len(specs) // 2 + self.assertGreater(mid, 0) + cpu_specs = specs[:mid] + cuda_specs = specs[mid:] + for spec in cuda_specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # [constants, cpu_activations, cuda_activations] + self.assertEqual(len(bufsizes), 3) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + 
self.assertGreater(bufsizes[2], 0) + + # CPU specs should have mem_id=1, CUDA specs should have mem_id=2 + for spec in cpu_specs: + self.assertEqual( + spec.mem_id, 1, f"CPU spec has wrong mem_id: {spec.mem_id}" + ) + for spec in cuda_specs: + self.assertEqual( + spec.mem_id, 2, f"CUDA spec has wrong mem_id: {spec.mem_id}" + ) + + def test_mem_offset_correct_after_remap(self) -> None: + """After remapping, mem_offset is relative to its own buffer.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + + # Set the last spec to CUDA (sole CUDA tensor) + cuda_spec = specs[-1] + cuda_spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # The CUDA spec is the only tensor in its buffer, so offset should be 0 + self.assertEqual(cuda_spec.mem_offset, 0) + # The CUDA buffer should fit exactly this tensor + cuda_mem_id = cuda_spec.mem_id + self.assertIsNotNone(cuda_mem_id) + assert cuda_mem_id is not None + self.assertGreaterEqual(bufsizes[cuda_mem_id], cuda_spec.allocated_memory) + + def test_no_cross_device_memory_sharing(self) -> None: + """Specs on different devices never share buffers, regardless of lifetime.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + self.assertGreaterEqual(len(specs), 2) + + # Assign alternating specs to CUDA to ensure some pairs have + # non-overlapping lifetimes (which greedy would normally share). 
+ for i, spec in enumerate(specs): + if i % 2 == 0: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # Verify CPU and CUDA specs have disjoint mem_ids + cpu_mem_ids: set[int] = set() + cuda_mem_ids: set[int] = set() + for i, spec in enumerate(specs): + if spec.mem_id is not None: + if i % 2 == 0: + cuda_mem_ids.add(spec.mem_id) + else: + cpu_mem_ids.add(spec.mem_id) + + self.assertTrue( + cpu_mem_ids.isdisjoint(cuda_mem_ids), + f"CPU {cpu_mem_ids} and CUDA {cuda_mem_ids} should not share buffers", + ) + + def test_different_device_indices_separate_buffers(self) -> None: + """CUDA:0 and CUDA:1 specs get separate buffers.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + self.assertGreaterEqual(len(specs), 3) + + # specs[0] → CUDA:0, specs[1] → CUDA:1, rest → CPU + specs[0].device = DeviceType.CUDA + specs[0].device_index = 0 + specs[1].device = DeviceType.CUDA + specs[1].device_index = 1 + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # [constants, cpu, cuda:0, cuda:1] + self.assertEqual(len(bufsizes), 4) + + # CUDA:0 and CUDA:1 should have different mem_ids + self.assertNotEqual(specs[0].mem_id, specs[1].mem_id) + # Both should differ from the CPU spec + self.assertNotEqual(specs[0].mem_id, specs[2].mem_id) + self.assertNotEqual(specs[1].mem_id, specs[2].mem_id) + + # Device mapping should only contain non-CPU entries with correct indices + device_map = gm.meta["non_const_buffer_device"] + for entry in device_map: + self.assertEqual(entry.device_type, DeviceType.CUDA) + cuda_indices = sorted(e.device_index for e in device_map) + self.assertEqual(cuda_indices, [0, 1]) + + def test_device_index_propagated(self) -> None: + """NonConstBufferDevice entries carry the actual device_index, not 0.""" + gm, gs = self._prepare_model() + 
specs = self._get_planned_specs(gm, gs) + + # Set the first spec to CUDA device index 3 + specs[0].device = DeviceType.CUDA + specs[0].device_index = 3 + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + device_map = gm.meta["non_const_buffer_device"] + self.assertEqual(len(device_map), 1) + self.assertEqual(device_map[0].device_type, DeviceType.CUDA) + self.assertEqual(device_map[0].device_index, 3) + + def test_disabled_falls_back_to_cpu(self) -> None: + """With enable_non_cpu_memory_planning=False (default), CUDA specs are + planned into CPU memory — no device-specific buffers are created.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + for spec in specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + # Default: enable_non_cpu_memory_planning=False + bufsizes = apply_algo(algo, gm, 16, gs) + + # All specs planned into a single CPU pool — same as CPU-only + self.assertEqual(len(bufsizes), 2) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + self.assertNotIn("non_const_buffer_device", gm.meta)