pytorch · Gasoonjia · Mar 24, 2026
@@ -1,3 +1,3 @@
 /*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
@@ -13,6 +13,7 @@
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
 #include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/platform/runtime.h>
 
 namespace executorch {
@@ -314,6 +315,45 @@
   return planned;
 }
 
+std::unique_ptr<Module::PlannedMemory>
+Module::make_planned_memory_with_devices(
+    const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
+  // Creates one arena per memory-planned buffer described by `method_meta`.
+  // Buffers reported as CPU are backed by a host byte vector; every other
+  // buffer is backed by a DeviceMemoryBuffer kept in `device_buffers`. A
+  // failed metadata query or device allocation trips ET_CHECK_MSG.
+  auto result = std::make_unique<PlannedMemory>();
+  auto& host_buffers = result->planned_buffers;
+  auto& spans = result->planned_spans;
+
+  const size_t buffer_count = method_meta.num_memory_planned_buffers();
+  host_buffers.reserve(buffer_count);
+  spans.reserve(buffer_count);
+
+  for (size_t idx = 0; idx < buffer_count; ++idx) {
+    auto size = method_meta.memory_planned_buffer_size(idx);
+    ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", idx);
+    auto device = method_meta.memory_planned_buffer_device(idx);
+    ET_CHECK_MSG(
+        device.ok(), "Failed to get buffer device for index %zu", idx);
+
+    if (device->is_cpu()) {
+      // Host arena: the vector owns the bytes and the span views them.
+      host_buffers.emplace_back(size.get());
+      spans.emplace_back(host_buffers.back().data(), size.get());
+      continue;
+    }
+
+    // Non-CPU arena: push an empty host vector so planned_buffers keeps one
+    // entry per buffer index, then take the span from the device buffer.
+    host_buffers.emplace_back();
+    auto dmb = runtime::DeviceMemoryBuffer::create(
+        size.get(), device->type(), device->index());
+    ET_CHECK_MSG(
+        dmb.ok(),
+        "Failed to allocate device memory for buffer %zu (device_type=%d)",
+        idx,
+        static_cast<int>(device->type()));
+    spans.emplace_back(dmb->as_span());
+    // Ownership of the device buffer moves into the PlannedMemory.
+    result->device_buffers.push_back(std::move(dmb.get()));
+  }
+
+  // The allocator is handed a view over all per-buffer spans.
+  auto arena_spans = runtime::Span(spans.data(), spans.size());
+  result->planned_memory =
+      std::make_unique<runtime::HierarchicalAllocator>(arena_spans);
+  return result;
+}
+
 runtime::Result<std::vector<size_t>> Module::get_mem_planned_buffer_sizes(
     const std::string& method_name) {
   auto meta_res = program_->method_meta(method_name.c_str());
@@ -365,10 +405,54 @@
     MethodHolder method_holder;
 
     if (!planned_memory) {
-      if (!share_memory_arenas_) {
+      // Check if any buffers need device memory allocation.
+      auto meta_res = program_->method_meta(method_name.c_str());
+      ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
+      auto& meta = meta_res.get();
+
+      bool has_device_buffers = false;
+      for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+        auto dev = meta.memory_planned_buffer_device(i);
+        if (dev.ok() && !dev->is_cpu()) {
+          has_device_buffers = true;
+          break;
+        }
+      }
+
+      if (has_device_buffers) {
+        // Device memory with shared arenas is not yet supported.
+        ET_CHECK_OR_RETURN_ERROR(
+            !share_memory_arenas_,
+            NotSupported,
+            "Device memory buffers are not yet compatible with "
+            "share_memory_arenas. Please disable share_memory_arenas "
+            "when using models with device-planned memory.");
+
+        // Device-aware path: allocate CPU and device buffers, build metadata.
+        method_holder.planned_memory =
+            make_planned_memory_with_devices(meta);
+
+        // Build per-buffer device type array for MemoryManager metadata.
+        for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+          auto dev = meta.memory_planned_buffer_device(i);
+          method_holder.buffer_devices.push_back(
+              dev.ok() ? dev->type()
+                       : runtime::etensor::DeviceType::CPU);
+        }
+        planned_memory = method_holder.planned_memory->planned_memory.get();
+
+        method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
+            memory_allocator_.get(),
+            planned_memory,
+            temp_allocator_.get(),
+            runtime::Span<const runtime::etensor::DeviceType>(
+                method_holder.buffer_devices.data(),
+                method_holder.buffer_devices.size()));
+      } else if (!share_memory_arenas_) {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
         method_holder.planned_memory = make_planned_memory(sizes_res.get());
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       } else {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -385,12 +469,14 @@
         }
         method_holder.planned_memory =
             make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       }
-      planned_memory = method_holder.planned_memory->planned_memory.get();
     }
 
-    method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
-        memory_allocator_.get(), planned_memory, temp_allocator_.get());
+    if (!method_holder.memory_manager) {
+      method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
+          memory_allocator_.get(), planned_memory, temp_allocator_.get());
+    }
     auto res_method = program_->load_method(
         method_name.c_str(),
         method_holder.memory_manager.get(),

@@ -16,6 +16,8 @@
 
 #include <executorch/runtime/executor/program.h>
 
+#include <executorch/runtime/core/device_memory_buffer.h>
+
 #ifdef USE_ATEN_LIB
 #define ET_MODULE_NAMESPACE module::aten
 #else // !USE_ATEN_LIB
@@ -682,12 +684,15 @@ class Module {
     std::vector<std::vector<uint8_t>> planned_buffers;
     std::vector<runtime::Span<uint8_t>> planned_spans;
     std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
+    std::vector<runtime::DeviceMemoryBuffer> device_buffers;
   };
   std::unique_ptr<PlannedMemory> make_planned_memory(
       const std::vector<size_t>& buffer_sizes);
   std::unique_ptr<PlannedMemory> make_planned_memory_with_shared_arenas(
       const std::vector<size_t>& buffer_sizes,
       std::vector<std::vector<uint8_t>>& shared_arenas);
+  /**
+   * Builds the planned memory for a method from its metadata, creating one
+   * arena per memory-planned buffer. Buffers whose device is CPU get host
+   * storage; all others are allocated through runtime::DeviceMemoryBuffer and
+   * kept in PlannedMemory::device_buffers.
+   *
+   * @param[in] method_meta Metadata supplying each buffer's size and device.
+   * @returns The populated PlannedMemory.
+   */
+  std::unique_ptr<PlannedMemory> make_planned_memory_with_devices(
+      const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
   runtime::Result<std::vector<size_t>> get_mem_planned_buffer_sizes(
       const std::string& method_name);
   runtime::Result<std::vector<size_t>> get_max_mem_planned_buffer_sizes();
@@ -696,6 +701,7 @@ class Module {
     std::unique_ptr<PlannedMemory> planned_memory;
     std::unique_ptr<runtime::MemoryManager> memory_manager;
     std::unique_ptr<Method> method;
+    std::vector<runtime::etensor::DeviceType> buffer_devices;
   };
 
   std::string file_path_;

@@ -28,6 +28,7 @@ def define_common_targets():
             ],
             exported_deps = [
                 "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+                "//executorch/runtime/core:device_memory_buffer",
             ],
         )
 

@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests that Module's device-aware memory allocation path works correctly.
+ *
+ * Uses ModuleAddWithDevice.pte which has:
+ *   non_const_buffer_sizes: [0, 48]  (1 buffer, index 0 reserved)
+ *   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+ *
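+ * Since we don't have a real CUDA backend, a mock CUDA allocator is
+ * registered and we test that:
+ * 1. CPU-only models load through Module without invoking device allocator
+ * 2. DeviceMemoryBuffer::create allocates through the mock and releases the
+ *    memory when the buffer goes out of scope
+ * 3. MethodMeta reports the CUDA buffer of the device-annotated model
+ * 4. Device-annotated models are rejected when share_memory_arenas is enabled
+ * 5. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
+ */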
+
+#include <executorch/extension/module/module.h>
+
+#include <memory>
+#include <unordered_map>
+#include <utility>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::extension::Module;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+/**
+ * Test double that reports itself as the CUDA DeviceAllocator but hands out
+ * host memory. It records how often allocate()/deallocate() were called and
+ * the arguments of the most recent allocate() so tests can assert on them.
+ *
+ * Every live allocation is tracked by its pointer, so several buffers can be
+ * outstanding at once and deallocate() releases exactly the buffer it is
+ * given.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  /// Allocates `nbytes` of host memory and records the request.
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    last_allocate_index_ = index;
+    auto storage = std::make_unique<uint8_t[]>(nbytes);
+    uint8_t* raw = storage.get();
+    // Keep the storage alive until the matching deallocate() call.
+    live_buffers_.emplace(static_cast<void*>(raw), std::move(storage));
+    return static_cast<void*>(raw);
+  }
+
+  /// Counts the call and frees the buffer identified by `ptr`, if tracked.
+  void deallocate(void* ptr, DeviceIndex /*index*/) override {
+    deallocate_count_++;
+    live_buffers_.erase(ptr);
+  }
+
+  /// No-op copy; always reports success.
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  /// No-op copy; always reports success.
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  // Call counters and last-call arguments, read and reset by the tests.
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  DeviceIndex last_allocate_index_ = -1;
+
+ private:
+  // Backing storage for every allocation that has not been deallocated yet.
+  std::unordered_map<void*, std::unique_ptr<uint8_t[]>> live_buffers_;
+};
+
+} // namespace
+
+// Single mock allocator shared by all tests in this file. SetUpTestSuite()
+// registers it for DeviceType::CUDA and SetUp() zeroes its counters.
+static MockCudaAllocator g_mock_cuda;
+
+// Fixture for the device-memory tests: initializes the runtime, installs the
+// mock CUDA allocator once per suite, and clears the mock's bookkeeping before
+// each test.
+class ModuleDeviceMemoryTest : public ::testing::Test {
+ protected:
+  // Runs once before the first test of the suite.
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
+  // Runs before every test so counts from earlier tests do not leak in.
+  void SetUp() override {
+    auto& mock = g_mock_cuda;
+    mock.last_allocate_index_ = -1;
+    mock.last_allocate_size_ = 0;
+    mock.deallocate_count_ = 0;
+    mock.allocate_count_ = 0;
+  }
+};
+
+// Loading "forward" from the model at ET_MODULE_ADD_PATH must succeed without
+// a single call into the registered mock CUDA allocator.
+TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
+  // The model location is supplied through the environment; fail fast if it
+  // is missing.
+  const char* path = std::getenv("ET_MODULE_ADD_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set";
+
+  Module module(path);
+  auto err = module.load_method("forward");
+  ASSERT_EQ(err, Error::Ok);
+
+  // The fixture zeroed the counter in SetUp(), so any non-zero value here
+  // comes from this load.
+  EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
+      << "CPU-only model should not allocate device memory";
+}
+
+// Exercises DeviceMemoryBuffer on its own (no Module involved): creation must
+// reach the mock allocator and destruction must hand the memory back.
+TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
+  // Directly test DeviceMemoryBuffer::create with the registered mock.
+  // This verifies the RAII allocation/deallocation path that Module uses.
+  {
+    // Request 48 bytes on CUDA device index 0.
+    auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+    auto buf = std::move(result.get());
+
+    // The mock must have seen exactly this one request.
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 48);
+
+    // as_span() wraps the device pointer for HierarchicalAllocator.
+    auto span = buf.as_span();
+    EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+    EXPECT_EQ(span.size(), 48);
+
+    // Nothing is released while `buf` is still in scope.
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  // RAII deallocation on scope exit.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+}
+
+// Checks the metadata side only: the program is loaded, but the "forward"
+// method itself is never loaded.
+TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
+  // Verify MethodMeta reports the correct device for buffers in the
+  // device-annotated model, without needing to load the full method.
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  Module module(path);
+  auto err = module.load();
+  ASSERT_EQ(err, Error::Ok);
+
+  auto meta = module.method_meta("forward");
+  ASSERT_TRUE(meta.ok());
+
+  // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA.
+  ASSERT_EQ(meta->num_memory_planned_buffers(), 1);
+
+  // NOTE(review): MethodMeta index 0 presumably maps to the serialized
+  // buffer_idx=1, since the file header says index 0 is reserved -- confirm.
+  auto size = meta->memory_planned_buffer_size(0);
+  ASSERT_TRUE(size.ok());
+  EXPECT_EQ(size.get(), 48);
+
+  auto device = meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+}
+
+// A device-annotated model combined with share_memory_arenas must be rejected
+// by load_method() with Error::NotSupported.
+TEST_F(
+    ModuleDeviceMemoryTest,
+    DeviceModelWithSharedArenasReturnsNotSupported) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  // share_memory_arenas = true with a device-annotated model should fail.
+  // The other optional constructor arguments are passed as nullptr.
+  Module module(
+      path,
+      Module::LoadMode::File,
+      /*event_tracer=*/nullptr,
+      /*memory_allocator=*/nullptr,
+      /*temp_allocator=*/nullptr,
+      /*share_memory_arenas=*/true);
+
+  auto err = module.load_method("forward");
+  EXPECT_EQ(err, Error::NotSupported);
+}
+
+// End-to-end check through Module: loading the device-annotated model must
+// allocate its CUDA buffer via the mock exactly once, and that buffer must be
+// released no later than Module destruction. The test accepts both a
+// successful and a failed load_method().
+TEST_F(
+    ModuleDeviceMemoryTest,
+    LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  // Inner scope bounds the Module's lifetime so its destruction can be
+  // observed below.
+  {
+    Module module(path);
+    auto err = module.load_method("forward");
+
+    // Regardless of whether load_method succeeds or fails (e.g. due to
+    // backend init issues), the device-aware memory allocation path
+    // (make_planned_memory_with_devices) runs BEFORE backend init.
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
+        << "Expected 1 device allocation for the CUDA buffer"
+        << " (actual: " << g_mock_cuda.allocate_count_ << ")"
+        << ", deallocate_count=" << g_mock_cuda.deallocate_count_
+        << ", load_method returned error=" << static_cast<int>(err);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
+        << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
+        << "Expected device_index=0 (cuda:0)";
+
+    // When the buffer is released depends on the load outcome.
+    if (err == Error::Ok) {
+      // Success path: MethodHolder moved into methods_ map.
+      // DeviceMemoryBuffer is alive as long as Module is alive.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
+          << "No deallocation while method is loaded";
+    } else {
+      // Error path: local MethodHolder destroyed on return from load_method.
+      // RAII deallocation already happened.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+          << "RAII deallocation on error path";
+    }
+  }
+
+  // After Module destroyed, all device memory must be freed.
+  // Exactly one deallocation is expected on either path.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+      << "Expected deallocation after Module destroyed";
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,6 +28,7 @@ def define_common_targets(): @@
                 ],
                 exported_deps = [
                     "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+                    "//executorch/runtime/core:device_memory_buffer",
                 ],
             )
@@ Expand Down @@