Thread method-scoped kernel registry through Program and Method

JacobSzwejbka · facebook-github-bot · commit c4e71f6ce42e · 2026-05-13T10:34:40.000-07:00
Summary:
Certain kernels might make optimizations that are broadly optimal but suboptimal for a specific model. In those scenarios it is useful to expose a backdoor that lets the exceptional method defer to a different implementation, without forcing the root implementation to handle all possible dispatches.

This is just a proposal implementation: things still get a little weird because ET today tends to auto-register kernel implementations. We might need follow-ups to allow generating boxed kernels separately from registering them into ET's generic kernel registry.

Differential Revision: D98080033
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
@@ -802,8 +802,19 @@ Error Method::resolve_operator(
   }
 
   // Find a kernel with the matching name and tensor meta.
-  Result<OpFunction> op_function =
-      get_op_function_from_registry(operator_name, {meta, count});
+  // Try method-scoped registry first (if provided), then fall back to global.
+  auto resolve_op_function = [&]() -> Result<OpFunction> {
+    if (!kernel_registry_.empty()) {
+      Result<OpFunction> method_scoped_op_function =
+          get_op_function_from_registry(
+              operator_name, {meta, count}, kernel_registry_);
+      if (method_scoped_op_function.ok()) {
+        return method_scoped_op_function;
+      }
+    }
+    return get_op_function_from_registry(operator_name, {meta, count});
+  };
+  Result<OpFunction> op_function = resolve_op_function();
   if (!op_function.ok()) {
     ET_LOG(
         Error,
@@ -831,7 +842,8 @@ Result<Method> Method::load(
     MemoryManager* memory_manager,
     EventTracer* event_tracer,
     const NamedDataMap* external_data_map,
-    const LoadBackendOptionsMap* backend_options) {
+    const LoadBackendOptionsMap* backend_options,
+    Span<const Kernel> kernel_registry) {
   MemoryAllocator* temp_allocator = memory_manager->temp_allocator();
   if (temp_allocator == nullptr) {
     PlatformMemoryAllocator* platform_allocator =
@@ -844,7 +856,8 @@ Result<Method> Method::load(
     new (platform_allocator) PlatformMemoryAllocator();
     temp_allocator = platform_allocator;
   }
-  Method method(program, memory_manager, event_tracer, temp_allocator);
+  Method method(
+      program, memory_manager, event_tracer, temp_allocator, kernel_registry);
   ET_LOG(Debug, "Loading method: %s.", s_plan->name()->c_str());
   Error err = method.init(s_plan, external_data_map, backend_options);
   if (err != Error::Ok) {
diff --git a/runtime/executor/method.h b/runtime/executor/method.h
@@ -21,6 +21,7 @@
 #include <executorch/runtime/core/named_data_map.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/executor/memory_manager.h>
 #include <executorch/runtime/executor/merged_data_map.h>
 #include <executorch/runtime/executor/method_meta.h>
+#include <executorch/runtime/kernel/operator_registry.h>
 #include <executorch/runtime/platform/compiler.h>
@@ -82,6 +83,7 @@ class Method final {
         merged_data_map_(std::move(rhs.merged_data_map_)),
         external_constants_(rhs.external_constants_),
         n_external_constants_(rhs.n_external_constants_),
+        kernel_registry_(rhs.kernel_registry_),
         init_state_(rhs.init_state_) {
     // Required: clear out fields that the dtor looks at, so that we don't free
     // anything twice.
@@ -331,7 +333,8 @@ class Method final {
       const Program* program,
       MemoryManager* memory_manager,
       EventTracer* event_tracer,
-      MemoryAllocator* temp_allocator)
+      MemoryAllocator* temp_allocator,
+      Span<const Kernel> kernel_registry = {})
       : step_state_(),
         program_(program),
         memory_manager_(memory_manager),
@@ -348,6 +351,7 @@ class Method final {
         merged_data_map_(nullptr),
         external_constants_(nullptr),
         n_external_constants_(0),
+        kernel_registry_(kernel_registry),
         init_state_(InitializationState::Uninitialized) {}
 
   /// Static factory used by Program.
@@ -357,7 +361,8 @@ class Method final {
       MemoryManager* memory_manager,
       EventTracer* event_tracer,
       const NamedDataMap* named_data_map,
-      const LoadBackendOptionsMap* backend_options = nullptr);
+      const LoadBackendOptionsMap* backend_options = nullptr,
+      Span<const Kernel> kernel_registry = {});
 
   /**
    * Initialize the method from its serialized representation.
@@ -403,6 +408,8 @@ class Method final {
   NamedData* external_constants_;
   size_t n_external_constants_ = 0;
 
+  Span<const Kernel> kernel_registry_;
+
   InitializationState init_state_;
 
   /**
diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp
@@ -355,7 +355,8 @@ Result<Method> Program::load_method(
     MemoryManager* memory_manager,
     EventTracer* event_tracer,
     const NamedDataMap* named_data_map,
-    const LoadBackendOptionsMap* backend_options) const {
+    const LoadBackendOptionsMap* backend_options,
+    Span<const Kernel> kernel_registry) const {
   EXECUTORCH_SCOPE_PROF("Program::load_method");
   internal::event_tracer_create_event_block(event_tracer, "Default");
   internal::EventTracerProfileMethodScope event_tracer_scope =
@@ -378,7 +379,8 @@ Result<Method> Program::load_method(
       memory_manager,
       event_tracer,
       named_data_map,
-      backend_options);
+      backend_options,
+      kernel_registry);
 }
 
 Result<MethodMeta> Program::method_meta(const char* method_name) const {
diff --git a/runtime/executor/program.h b/runtime/executor/program.h
@@ -21,6 +21,7 @@
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/executor/method_meta.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <executorch/runtime/kernel/operator_registry.h>
 #include <executorch/runtime/platform/compiler.h>
 
 // Forward declare flatbuffer types. This is a public header and must not
@@ -151,7 +152,8 @@ class Program final {
       MemoryManager* memory_manager,
       EventTracer* event_tracer = nullptr,
       const NamedDataMap* named_data_map = nullptr,
-      const LoadBackendOptionsMap* backend_options = nullptr) const;
+      const LoadBackendOptionsMap* backend_options = nullptr,
+      Span<const Kernel> kernel_registry = {}) const;
 
   /**
    * Gathers metadata for the named method.
diff --git a/runtime/executor/test/kernel_registry_test.cpp b/runtime/executor/test/kernel_registry_test.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstring>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/kernel/operator_registry.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gtest/gtest.h>
+
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Kernel;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::Method;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::testing::ManagedMemoryManager;
+using torch::executor::util::FileDataLoader;
+
+constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
+constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U;
+
+namespace {
+
+// Test stand-in for aten::add.out. Boxed args: [self, other, alpha, out, out].
+// Ignores `other` and `alpha`; writes 2 * self element-wise into the last arg,
+// treating both tensors as float.
+void multiply_by_two(KernelRuntimeContext& /*context*/, Span<EValue*> args) {
+  auto& src = args[0]->toTensor();
+  auto& dst = args[args.size() - 1]->toTensor();
+  for (ssize_t i = 0; i < src.numel(); ++i) {
+    dst.mutable_data_ptr<float>()[i] = src.const_data_ptr<float>()[i] * 2.0f;
+  }
+}
+
+// Test stand-in for aten::add.out: ignores every operand except args[0] and
+// writes 3 * args[0] element-wise into the last arg, treating both as float.
+void multiply_by_three(KernelRuntimeContext& /*context*/, Span<EValue*> args) {
+  auto& src = args[0]->toTensor();
+  auto& dst = args[args.size() - 1]->toTensor();
+  for (ssize_t i = 0; i < src.numel(); ++i) {
+    dst.mutable_data_ptr<float>()[i] = src.const_data_ptr<float>()[i] * 3.0f;
+  }
+}
+
+} // namespace
+
+class KernelRegistryTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Per-test setup: init the runtime, then load the Program the tests use.
+    executorch::runtime::runtime_init();
+    // The program file path comes from the environment; stop here if unset.
+    const char* path = std::getenv("ET_MODULE_ADD_PATH");
+    ASSERT_NE(path, nullptr)
+        << "ET_MODULE_ADD_PATH environment variable must be set";
+    // Move loader and program into members so they outlive SetUp().
+    Result<FileDataLoader> loader = FileDataLoader::from(path);
+    ASSERT_EQ(loader.error(), Error::Ok);
+    loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    Result<Program> program = Program::load(loader_.get());
+    ASSERT_EQ(program.error(), Error::Ok);
+    program_ = std::make_unique<Program>(std::move(program.get()));
+  }
+
+  std::unique_ptr<FileDataLoader> loader_;
+  std::unique_ptr<Program> program_;
+};
+
+TEST_F(KernelRegistryTest, MethodScopedKernelOverridesGlobal) {
+  // Two fallback kernels for aten::add.out with different behavior. Declared
+  // before the methods so they outlive the non-owning Spans the methods keep.
+  Kernel kernel_x2("aten::add.out", multiply_by_two);
+  Kernel kernel_x3("aten::add.out", multiply_by_three);
+
+  Span<const Kernel> registry_x2(&kernel_x2, 1);
+  Span<const Kernel> registry_x3(&kernel_x3, 1);
+
+  // Load two methods with different kernel registries.
+  ManagedMemoryManager mmm_a(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
+  Result<Method> method_a = program_->load_method(
+      "forward",
+      &mmm_a.get(),
+      /*event_tracer=*/nullptr,
+      /*named_data_map=*/nullptr,
+      /*backend_options=*/nullptr,
+      registry_x2);
+  ASSERT_EQ(method_a.error(), Error::Ok);
+
+  ManagedMemoryManager mmm_b(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
+  Result<Method> method_b = program_->load_method(
+      "forward",
+      &mmm_b.get(),
+      /*event_tracer=*/nullptr,
+      /*named_data_map=*/nullptr,
+      /*backend_options=*/nullptr,
+      registry_x3);
+  ASSERT_EQ(method_b.error(), Error::Ok);
+
+  // Prepare inputs: tensor inputs + alpha scalar (input index 2).
+  auto inputs_a = torch::executor::util::prepare_input_tensors(method_a.get());
+  ASSERT_EQ(inputs_a.error(), Error::Ok);
+  ASSERT_EQ(method_a->set_input(EValue(1.0), 2), Error::Ok);
+
+  auto inputs_b = torch::executor::util::prepare_input_tensors(method_b.get());
+  ASSERT_EQ(inputs_b.error(), Error::Ok);
+  ASSERT_EQ(method_b->set_input(EValue(1.0), 2), Error::Ok);
+
+  // Execute both methods.
+  ASSERT_EQ(method_a->execute(), Error::Ok);
+  ASSERT_EQ(method_b->execute(), Error::Ok);
+
+  // Check outputs: method_a should have 2.0, method_b should have 3.0.
+  const auto& out_a = method_a->get_output(0).toTensor();
+  const auto& out_b = method_b->get_output(0).toTensor();
+
+  ASSERT_GT(out_a.numel(), 0);
+  ASSERT_GT(out_b.numel(), 0);
+  for (ssize_t i = 0; i < out_a.numel(); ++i) {
+    EXPECT_FLOAT_EQ(out_a.const_data_ptr<float>()[i], 2.0f)
+        << "method_a output[" << i << "] should be 2.0";
+  }
+  for (ssize_t i = 0; i < out_b.numel(); ++i) {
+    EXPECT_FLOAT_EQ(out_b.const_data_ptr<float>()[i], 3.0f)
+        << "method_b output[" << i << "] should be 3.0";
+  }
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
@@ -223,6 +223,23 @@ def define_common_targets(is_fbcode = False):
             env = modules_env,
         )
 
+        runtime.cxx_test(
+            name = "kernel_registry_test",
+            srcs = [
+                "kernel_registry_test.cpp",
+            ],
+            deps = [
+                ":managed_memory_manager",
+                "//executorch/extension/data_loader:file_data_loader",
+                "//executorch/extension/runner_util:inputs",
+                "//executorch/runtime/executor:program",
+                "//executorch/runtime/kernel:kernel_runtime_context",
+                "//executorch/runtime/kernel:operator_registry",
+                "//executorch/runtime/platform:platform",
+            ],
+            env = modules_env,
+        )
+
         runtime.cxx_test(
             name = "kernel_integration_test",
             srcs = [
diff --git a/runtime/kernel/test/operator_registry_test.cpp b/runtime/kernel/test/operator_registry_test.cpp
@@ -438,6 +438,67 @@ TEST_F(OperatorRegistryTest, GetOpFunctionUsesProvidedKernelList) {
   EXPECT_EQ(run_kernel(*fallback_func), 50);
 }
 
+TEST_F(OperatorRegistryTest, ProvidedKernelListMissCanFallBackToGlobal) {
+  // A lookup against a caller-provided kernel list consults only that list: a
+  // miss reports OperatorMissing, and a separate global lookup still succeeds.
+  std::array<char, kKernelKeyBufSize> buf;
+  Error err = make_kernel_key(
+      {{ScalarType::Long, {0, 1, 2, 3}}}, buf.data(), buf.size());
+  ASSERT_EQ(err, Error::Ok);
+  KernelKey long_key = KernelKey(buf.data());
+
+  Kernel global_kernel = Kernel(
+      "test::provided_kernel_list_global_fallback",
+      KernelKey{},
+      [](KernelRuntimeContext& context, Span<EValue*> stack) {
+        (void)context;
+        *(stack[0]) = Scalar(50);
+      });
+  err = register_kernels({&global_kernel, 1});
+  ASSERT_EQ(err, Error::Ok);
+
+  Kernel scoped_kernel = Kernel(
+      "test::provided_kernel_list_global_fallback",
+      long_key,
+      [](KernelRuntimeContext& context, Span<EValue*> stack) {
+        (void)context;
+        *(stack[0]) = Scalar(100);
+      });
+  Span<const Kernel> scoped_registry(&scoped_kernel, 1);
+
+  Tensor::DimOrderType dims[] = {0, 1, 2, 3};
+  auto dim_order_type = Span<Tensor::DimOrderType>(dims, 4);
+  TensorMeta long_meta[] = {TensorMeta(ScalarType::Long, dim_order_type)};
+  Span<const TensorMeta> long_kernel_key(long_meta);
+  TensorMeta float_meta[] = {TensorMeta(ScalarType::Float, dim_order_type)};
+  Span<const TensorMeta> float_kernel_key(float_meta);
+
+  auto run_kernel = [](OpFunction func) {
+    EValue value = Scalar(0);
+    EValue* stack[] = {&value};
+    KernelRuntimeContext context{};
+    func(context, Span<EValue*>(stack));
+    return value.toScalar().to<int64_t>();
+  };
+  // Hit: Long tensor metadata matches the kernel in the provided list.
+  Result<OpFunction> scoped_func = get_op_function_from_registry(
+      "test::provided_kernel_list_global_fallback",
+      long_kernel_key,
+      scoped_registry);
+  ASSERT_EQ(scoped_func.error(), Error::Ok);
+  EXPECT_EQ(run_kernel(*scoped_func), 100);
+  // Miss: nothing in the provided list matches Float; the global lookup does.
+  Result<OpFunction> scoped_miss = get_op_function_from_registry(
+      "test::provided_kernel_list_global_fallback",
+      float_kernel_key,
+      scoped_registry);
+  ASSERT_EQ(scoped_miss.error(), Error::OperatorMissing);
+  Result<OpFunction> global_func = get_op_function_from_registry(
+      "test::provided_kernel_list_global_fallback", float_kernel_key);
+  ASSERT_EQ(global_func.error(), Error::Ok);
+  EXPECT_EQ(run_kernel(*global_func), 50);
+}
+
 TEST_F(OperatorRegistryTest, DoubleRegisterKernelsDies) {
   std::array<char, kKernelKeyBufSize> buf_long_contiguous;
   Error err = make_kernel_key(