diff --git a/extension/module/module.cpp b/extension/module/module.cpp index ec7236276f5..21842938db9 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace executorch { @@ -314,6 +315,45 @@ Module::make_planned_memory_with_shared_arenas( return planned; } +std::unique_ptr +Module::make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) { + auto planned = std::make_unique(); + const size_t num_buffers = method_meta.num_memory_planned_buffers(); + planned->planned_buffers.reserve(num_buffers); + planned->planned_spans.reserve(num_buffers); + + for (size_t i = 0; i < num_buffers; ++i) { + auto size = method_meta.memory_planned_buffer_size(i); + ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i); + auto device = method_meta.memory_planned_buffer_device(i); + ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i); + + if (device->is_cpu()) { + planned->planned_buffers.emplace_back(size.get()); + planned->planned_spans.emplace_back( + planned->planned_buffers.back().data(), size.get()); + } else { + // Allocate device memory via DeviceAllocator and store the RAII buffer. + planned->planned_buffers.emplace_back(); // empty CPU placeholder + auto dmb = runtime::DeviceMemoryBuffer::create( + size.get(), device->type(), device->index()); + ET_CHECK_MSG( + dmb.ok(), + "Failed to allocate device memory for buffer %zu (device_type=%d)", + i, + static_cast(device->type())); + planned->planned_spans.emplace_back(dmb->as_span()); + planned->device_buffers.push_back(std::move(dmb.get())); + } + } + + planned->planned_memory = + std::make_unique(runtime::Span( + planned->planned_spans.data(), planned->planned_spans.size())); + return planned; +} + runtime::Result> Module::get_mem_planned_buffer_sizes( const std::string& method_name) { auto meta_res = program_->method_meta(method_name.c_str()); @@ -365,10 +405,54 @@ runtime::Error Module::load_method( MethodHolder method_holder; if (!planned_memory) { - if (!share_memory_arenas_) { + // Check if any buffers need device memory allocation. + auto meta_res = program_->method_meta(method_name.c_str()); + ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error()); + auto& meta = meta_res.get(); + + bool has_device_buffers = false; + for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) { + auto dev = meta.memory_planned_buffer_device(i); + if (dev.ok() && !dev->is_cpu()) { + has_device_buffers = true; + break; + } + } + + if (has_device_buffers) { + // Device memory with shared arenas is not yet supported. + ET_CHECK_OR_RETURN_ERROR( + !share_memory_arenas_, + NotSupported, + "Device memory buffers are not yet compatible with " + "share_memory_arenas. Please disable share_memory_arenas " + "when using models with device-planned memory."); + + // Device-aware path: allocate CPU and device buffers, build metadata. + method_holder.planned_memory = + make_planned_memory_with_devices(meta); + + // Build per-buffer device type array for MemoryManager metadata. + for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) { + auto dev = meta.memory_planned_buffer_device(i); + method_holder.buffer_devices.push_back( + dev.ok() ? dev->type() + : runtime::etensor::DeviceType::CPU); + } + planned_memory = method_holder.planned_memory->planned_memory.get(); + + method_holder.memory_manager = std::make_unique( + memory_allocator_.get(), + planned_memory, + temp_allocator_.get(), + runtime::Span( + method_holder.buffer_devices.data(), + method_holder.buffer_devices.size())); + } else if (!share_memory_arenas_) { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); method_holder.planned_memory = make_planned_memory(sizes_res.get()); + planned_memory = method_holder.planned_memory->planned_memory.get(); } else { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); @@ -385,12 +469,14 @@ runtime::Error Module::load_method( } method_holder.planned_memory = make_planned_memory_with_shared_arenas(sizes, shared_arenas_); + planned_memory = method_holder.planned_memory->planned_memory.get(); } - planned_memory = method_holder.planned_memory->planned_memory.get(); } - method_holder.memory_manager = std::make_unique( - memory_allocator_.get(), planned_memory, temp_allocator_.get()); + if (!method_holder.memory_manager) { + method_holder.memory_manager = std::make_unique( + memory_allocator_.get(), planned_memory, temp_allocator_.get()); + } auto res_method = program_->load_method( method_name.c_str(), method_holder.memory_manager.get(), diff --git a/extension/module/module.h b/extension/module/module.h index 08a68b2676b..4ae494eff0b 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -16,6 +16,8 @@ #include +#include + #ifdef USE_ATEN_LIB #define ET_MODULE_NAMESPACE module::aten #else // !USE_ATEN_LIB @@ -682,12 +684,15 @@ class Module { std::vector> planned_buffers; std::vector> planned_spans; std::unique_ptr planned_memory; + std::vector device_buffers; }; std::unique_ptr make_planned_memory( const std::vector& buffer_sizes); std::unique_ptr make_planned_memory_with_shared_arenas( const std::vector& buffer_sizes, std::vector>& shared_arenas); + std::unique_ptr make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta); runtime::Result> get_mem_planned_buffer_sizes( const std::string& method_name); runtime::Result> get_max_mem_planned_buffer_sizes(); @@ -696,6 +701,7 @@ class Module { std::unique_ptr planned_memory; std::unique_ptr memory_manager; std::unique_ptr method; + std::vector buffer_devices; }; std::string file_path_; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 6d60429bc51..03c50498bbc 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -28,6 +28,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix, + "//executorch/runtime/core:device_memory_buffer", ], ) diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp new file mode 100644 index 00000000000..39d0e2ab2ed --- /dev/null +++ b/extension/module/test/module_device_memory_test.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests that Module's device-aware memory allocation path works correctly. + * + * Uses ModuleAddWithDevice.pte which has: + * non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved) + * non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}] + * + * Since we don't have a real CUDA backend, we test that: + * 1. CPU-only models load through Module without invoking device allocator + * 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock + */ + +#include + +#include + +#include +#include +#include + +using executorch::extension::Module; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; +using executorch::runtime::Error; +using executorch::runtime::Result; +using executorch::runtime::register_device_allocator; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +class MockCudaAllocator : public DeviceAllocator { + public: + Result allocate(size_t nbytes, DeviceIndex index) override { + allocate_count_++; + last_allocate_size_ = nbytes; + last_allocate_index_ = index; + buffer_ = std::make_unique(nbytes); + return static_cast(buffer_.get()); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + buffer_.reset(); + } + + Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + size_t last_allocate_size_ = 0; + DeviceIndex last_allocate_index_ = -1; + + private: + std::unique_ptr buffer_; +}; + +} // namespace + +static MockCudaAllocator g_mock_cuda; + +class ModuleDeviceMemoryTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(DeviceType::CUDA, &g_mock_cuda); + } + + void SetUp() override { + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; + g_mock_cuda.last_allocate_size_ = 0; + g_mock_cuda.last_allocate_index_ = -1; + } +}; + +TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) { + const char* path = std::getenv("ET_MODULE_ADD_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set"; + + Module module(path); + auto err = module.load_method("forward"); + ASSERT_EQ(err, Error::Ok); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 0) + << "CPU-only model should not allocate device memory"; +} + +TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) { + // Directly test DeviceMemoryBuffer::create with the registered mock. + // This verifies the RAII allocation/deallocation path that Module uses. + { + auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto buf = std::move(result.get()); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 1); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48); + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0); + EXPECT_NE(buf.data(), nullptr); + EXPECT_EQ(buf.size(), 48); + + // as_span() wraps the device pointer for HierarchicalAllocator. + auto span = buf.as_span(); + EXPECT_EQ(span.data(), static_cast(buf.data())); + EXPECT_EQ(span.size(), 48); + + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); + } + // RAII deallocation on scope exit. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1); +} + +TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) { + // Verify MethodMeta reports the correct device for buffers in the + // device-annotated model, without needing to load the full method. + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + Module module(path); + auto err = module.load(); + ASSERT_EQ(err, Error::Ok); + + auto meta = module.method_meta("forward"); + ASSERT_TRUE(meta.ok()); + + // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA. + ASSERT_EQ(meta->num_memory_planned_buffers(), 1); + + auto size = meta->memory_planned_buffer_size(0); + ASSERT_TRUE(size.ok()); + EXPECT_EQ(size.get(), 48); + + auto device = meta->memory_planned_buffer_device(0); + ASSERT_TRUE(device.ok()); + EXPECT_EQ(device->type(), DeviceType::CUDA); + EXPECT_EQ(device->index(), 0); +} + +TEST_F( + ModuleDeviceMemoryTest, + DeviceModelWithSharedArenasReturnsNotSupported) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + // share_memory_arenas = true with a device-annotated model should fail. + Module module( + path, + Module::LoadMode::File, + /*event_tracer=*/nullptr, + /*memory_allocator=*/nullptr, + /*temp_allocator=*/nullptr, + /*share_memory_arenas=*/true); + + auto err = module.load_method("forward"); + EXPECT_EQ(err, Error::NotSupported); +} + +TEST_F( + ModuleDeviceMemoryTest, + LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + { + Module module(path); + auto err = module.load_method("forward"); + + // Regardless of whether load_method succeeds or fails (e.g. due to + // backend init issues), the device-aware memory allocation path + // (make_planned_memory_with_devices) runs BEFORE backend init. + EXPECT_EQ(g_mock_cuda.allocate_count_, 1) + << "Expected 1 device allocation for the CUDA buffer" + << " (actual: " << g_mock_cuda.allocate_count_ << ")" + << ", deallocate_count=" << g_mock_cuda.deallocate_count_ + << ", load_method returned error=" << static_cast(err); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48) + << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)"; + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0) + << "Expected device_index=0 (cuda:0)"; + + if (err == Error::Ok) { + // Success path: MethodHolder moved into methods_ map. + // DeviceMemoryBuffer is alive as long as Module is alive. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0) + << "No deallocation while method is loaded"; + } else { + // Error path: local MethodHolder destroyed on return from load_method. + // RAII deallocation already happened. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "RAII deallocation on error path"; + } + } + + // After Module destroyed, all device memory must be freed. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "Expected deallocation after Module destroyed"; +} diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl index f0d7e449efd..4dc3fb537f3 100644 --- a/extension/module/test/targets.bzl +++ b/extension/module/test/targets.bzl @@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_test( - name = "test" + aten_suffix, + name = "module_test" + aten_suffix, srcs = [ "module_test.cpp", ], @@ -68,6 +68,26 @@ def define_common_targets(is_fbcode=False): ], ) + runtime.cxx_test( + name = "module_device_memory_test" + aten_suffix, + srcs = [ + "module_device_memory_test.cpp", + ], + deps = [ + "//executorch/kernels/portable:generated_lib" + aten_suffix, + "//executorch/extension/module:module" + aten_suffix, + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core:device_memory_buffer", + ], + env = { + "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])", + "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", + }, + compiler_flags = [ + "-Wno-error=deprecated-declarations", + ], + ) + runtime.filegroup( name = "resources", srcs = native.glob([ diff --git a/test/models/targets.bzl b/test/models/targets.bzl index c9fb67b7d31..a80244b1383 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -226,6 +226,7 @@ def define_common_targets(): default_outs = ["."], visibility = [ "//executorch/runtime/executor/test/...", + "//executorch/extension/module/test/...", ], )