From 4efdd1a5e405087b2924620629e9cef1053d64e1 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 15:55:29 +0000
Subject: [PATCH 01/84] Add channelwise conv

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 103 ++++++++++++++++
 .../migraphx/kernels/channelwise_conv.hpp     |  61 ++++++++++
 src/targets/gpu/prefuse_ops.cpp               | 110 ++++++++++++++++++
 test/verify/test_channelwise_conv.cpp         | 104 +++++++++++++++++
 4 files changed, 378 insertions(+)
 create mode 100644 src/targets/gpu/jit/channelwise_conv.cpp
 create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
 create mode 100644 test/verify/test_channelwise_conv.cpp
diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
new file mode 100644
index 00000000000..88218c2a03d
--- /dev/null
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -0,0 +1,103 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// NOLINTNEXTLINE
+static const char* const channelwise_conv_kernel = R"__migraphx__(
+#include <migraphx/kernels/channelwise_conv.hpp>
+#include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/generic_constant.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+
+MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
+{
+    transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
+        channelwise_conv<${algo}>(index_ints<${kernel}>{}, output, x, w);
+    });
+}
+
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
+{
+    std::vector<std::string> names() const
+    {
+        return {"gpu::channelwise_conv", "channelwise_conv"};
+    }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        auto num_spatial   = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s    = inputs.at(0);
+        const auto& out_s  = inputs.back();
+        options.inputs     = inputs;
+        options.output     = out_s;
+        options.kernel_name = "channelwise_conv_kernel";
+        options.virtual_inputs = inputs;
+
+        auto x_lens = x_s.lens();
+        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2,
+                                              x_lens.begin() + 2 + num_spatial);
+        std::size_t kernel_total = 1;
+        for(auto k : kernel_sizes)
+            kernel_total *= k;
+
+        std::string algo       = "reduce::lane";
+        std::size_t block_size = 256;
+
+        options.set_launch_params(
+            v, compute_global_for(ctx, out_s.elements(), 256), block_size);
+
+        auto src = interpolate_string(channelwise_conv_kernel,
+                                      {{"algo", algo},
+                                       {"kernel", to_string_range(kernel_sizes)}});
+
+        return compile_hip_code_object(ctx, src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
new file mode 100644
index 00000000000..5f3a454546a
--- /dev/null
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -0,0 +1,61 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
+#define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
+
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/reduce.hpp>
+#include <migraphx/kernels/pooling.hpp>
+
+namespace migraphx {
+
+template <class Algo, class KernelLens, class Output, class Input1, class Input2>
+__device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x, Input2 w)
+{
+    constexpr index_int NS           = array_size(KernelLens{});
+    constexpr index_int NDIM         = 2 + 2 * NS;
+    constexpr index_int kernel_total = KernelLens{}.product();
+
+    pooling_reduce<Algo, 1>(output, [&](auto out_idx, auto r) {
+        auto result = r.reduce(op::sum{}, 0, [&](auto ki) {
+            auto kmulti    = kernel_lens.multi(ki);
+            auto bcast_idx = generate_array<index_int>(
+                _c<NDIM>, [&](auto d) -> index_int {
+                    if constexpr(d < 2)
+                        return out_idx[d];
+                    else if constexpr(d < 2 + NS)
+                        return kmulti[d - _c<2>];
+                    else
+                        return out_idx[d - _c<NS>] + kmulti[d - _c<2 + NS>];
+                });
+            return x[bcast_idx] * w[bcast_idx];
+        })(reduce::make_indices(_c<kernel_total>));
+        return result;
+    });
+}
+
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 7d3cd43b8db..6153d0a3811 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -27,9 +27,11 @@
 #include <migraphx/gpu/gemm_softmax_gemm.hpp>
 #include <migraphx/match/layernorm.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/make_op.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
+#include <numeric>
 #ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
 #include <migraphx/gpu/ck.hpp>
 #endif
@@ -237,6 +239,113 @@ struct find_gemm_softmax_gemm
     }
 };
 
+struct channelwise_conv
+{
+    std::size_t num_spatial = 2;
+
+    std::string name() const { return "gpu::channelwise_conv"; }
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.num_spatial, "num_spatial"));
+    }
+
+    // Inputs are laid out as [N, C, k_0.., s_0..]; output is [N, C, s_d - k_d + 1 ...].
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        // Reject mismatched or wrong-rank inputs before indexing into lens below.
+        check_shapes{inputs, *this}.has(2).same_dims().only_dims(2 + 2 * num_spatial);
+        auto lens = inputs.front().lens();
+        std::vector<std::size_t> out_lens{lens[0], lens[1]};
+        for(std::size_t d = 0; d < num_spatial; ++d)
+        {
+            auto kernel_size  = lens[2 + d];
+            auto spatial_size = lens[2 + num_spatial + d];
+            out_lens.push_back(spatial_size - kernel_size + 1);
+        }
+        return {inputs.front().type(), out_lens};
+    }
+};
+MIGRAPHX_REGISTER_OP(channelwise_conv);
+
+// Matches unit-stride, unpadded, undilated convolutions with one input channel per filter.
+MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins)
+{
+    if(ins->name() != "convolution")
+        return false;
+    auto v = ins->get_operator().to_value();
+    if(not all_of(v.at("stride"), [](const value& x) { return x.to<std::size_t>() == 1; }))
+        return false;
+    if(not all_of(v.at("padding"), [](const value& x) { return x.to<std::size_t>() == 0; }))
+        return false;
+    if(not all_of(v.at("dilation"), [](const value& x) { return x.to<std::size_t>() == 1; }))
+        return false;
+    auto w_lens = ins->inputs().back()->get_shape().lens();
+    if(w_lens[1] != 1)
+        return false;
+    auto c_in  = ins->inputs().front()->get_shape().lens()[1];
+    auto group = v.at("group").to<int>();
+    // The rewrite broadcasts the input channel axis to C_out: reject channel multipliers.
+    return (group == 1 or group == static_cast<int>(c_in)) and (c_in == 1 or w_lens[0] == c_in);
+}
+
+struct find_channelwise_convolution
+{
+    auto matcher() const { return conv_channelwise(); }
+    // Replace the convolution with channelwise_conv on inputs broadcast to a common shape.
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins = r.result;
+
+        auto input   = ins->inputs().front();
+        auto weights = ins->inputs().back();
+
+        auto w_lens      = weights->get_shape().lens();
+        auto x_lens      = input->get_shape().lens();
+        auto ndim        = ins->get_shape().ndim();
+        auto num_spatial = ndim - 2;
+
+        // Build product shape: [N, C_out, k_0, ..., k_{ns-1}, s_0, ..., s_{ns-1}]
+        std::vector<std::size_t> prod_lens;
+        prod_lens.push_back(x_lens[0]);
+        prod_lens.push_back(w_lens[0]);
+        for(std::size_t d = 2; d < ndim; ++d)
+            prod_lens.push_back(w_lens[d]);
+        for(std::size_t d = 2; d < ndim; ++d)
+            prod_lens.push_back(x_lens[d]);
+
+        // Unsqueeze input: [N, C_in, H, W] -> [N, C_in, 1, ..., 1, H, W]
+        std::vector<int64_t> input_unsq_axes(num_spatial);
+        std::iota(input_unsq_axes.begin(), input_unsq_axes.end(), 2);
+        auto unsq_input =
+            m.insert_instruction(ins, make_op("unsqueeze", {{"axes", input_unsq_axes}}), input);
+
+        // Broadcast input to product shape (assumes C_in is 1 or equals C_out -- TODO confirm)
+        auto bcast_input = m.insert_instruction(
+            ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_input);
+
+        // Squeeze weight axis 1: [C_out, 1, k_0, ...] -> [C_out, k_0, ...]
+        auto sq_weights = m.insert_instruction(ins, make_op("squeeze", {{"axes", {1}}}), weights);
+
+        // Unsqueeze weight: [C_out, k_0, ...] -> [1, C_out, k_0, ..., 1, ..., 1]
+        std::vector<int64_t> w_unsq_axes;
+        w_unsq_axes.push_back(0);
+        for(std::size_t d = 0; d < num_spatial; ++d)
+            w_unsq_axes.push_back(static_cast<int64_t>(2 + num_spatial + d));
+        auto unsq_weights =
+            m.insert_instruction(ins, make_op("unsqueeze", {{"axes", w_unsq_axes}}), sq_weights);
+
+        // Broadcast weight to product shape
+        auto bcast_weights = m.insert_instruction(
+            ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights);
+
+        m.replace_instruction(
+            ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights);
+    }
+};
+
 void inline_group_sub_module(module_pass_manager& mpm)
 {
     auto& m = mpm.get_module();
@@ -262,6 +371,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
         match::find_matches(mpm.get_module(), find_add_layernorm{});
     }
     match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention});
+    match::find_matches(mpm.get_module(), find_channelwise_convolution{});
 
     if(enabled(MIGRAPHX_DISABLE_MLIR{}))
     {
diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp
new file mode 100644
index 00000000000..1367fd85076
--- /dev/null
+++ b/test/verify/test_channelwise_conv.cpp
@@ -0,0 +1,104 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_depthwise
+    : verify_program<test_channelwise_conv_depthwise<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_depthwise<migraphx::shape::float_type>;
+template struct test_channelwise_conv_depthwise<migraphx::shape::half_type>;
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_single_channel
+    : verify_program<test_channelwise_conv_single_channel<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution"), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_single_channel<migraphx::shape::float_type>;
+template struct test_channelwise_conv_single_channel<migraphx::shape::half_type>;
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_depthwise_5x5
+    : verify_program<test_channelwise_conv_depthwise_5x5<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::float_type>;
+template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::half_type>;
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_1d
+    : verify_program<test_channelwise_conv_1d<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}});
+        mm->add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}),
+            input,
+            weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_1d<migraphx::shape::float_type>;
+template struct test_channelwise_conv_1d<migraphx::shape::half_type>;

From a0c6b07963d5686b6f4a81a0dddd0ef613001091 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 15:55:32 +0000
Subject: [PATCH 02/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 26 +++++++------------
 .../migraphx/kernels/channelwise_conv.hpp     | 17 ++++++------
 src/targets/gpu/prefuse_ops.cpp               |  3 +--
 test/verify/test_channelwise_conv.cpp         |  6 ++---
 4 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 88218c2a03d..e0101186e5b 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,25 +56,21 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const
-    {
-        return {"gpu::channelwise_conv", "channelwise_conv"};
-    }
+    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial   = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s    = inputs.at(0);
-        const auto& out_s  = inputs.back();
-        options.inputs     = inputs;
-        options.output     = out_s;
-        options.kernel_name = "channelwise_conv_kernel";
+        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s        = inputs.at(0);
+        const auto& out_s      = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens = x_s.lens();
-        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2,
-                                              x_lens.begin() + 2 + num_spatial);
+        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
         std::size_t kernel_total = 1;
         for(auto k : kernel_sizes)
             kernel_total *= k;
@@ -82,12 +78,10 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         std::string algo       = "reduce::lane";
         std::size_t block_size = 256;
 
-        options.set_launch_params(
-            v, compute_global_for(ctx, out_s.elements(), 256), block_size);
+        options.set_launch_params(v, compute_global_for(ctx, out_s.elements(), 256), block_size);
 
         auto src = interpolate_string(channelwise_conv_kernel,
-                                      {{"algo", algo},
-                                       {"kernel", to_string_range(kernel_sizes)}});
+                                      {{"algo", algo}, {"kernel", to_string_range(kernel_sizes)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 5f3a454546a..0ade178111a 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -42,15 +42,14 @@ __device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x
     pooling_reduce<Algo, 1>(output, [&](auto out_idx, auto r) {
         auto result = r.reduce(op::sum{}, 0, [&](auto ki) {
             auto kmulti    = kernel_lens.multi(ki);
-            auto bcast_idx = generate_array<index_int>(
-                _c<NDIM>, [&](auto d) -> index_int {
-                    if constexpr(d < 2)
-                        return out_idx[d];
-                    else if constexpr(d < 2 + NS)
-                        return kmulti[d - _c<2>];
-                    else
-                        return out_idx[d - _c<NS>] + kmulti[d - _c<2 + NS>];
-                });
+            auto bcast_idx = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
+                if constexpr(d < 2)
+                    return out_idx[d];
+                else if constexpr(d < 2 + NS)
+                    return kmulti[d - _c<2>];
+                else
+                    return out_idx[d - _c<NS>] + kmulti[d - _c<2 + NS>];
+            });
             return x[bcast_idx] * w[bcast_idx];
         })(reduce::make_indices(_c<kernel_total>));
         return result;
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 6153d0a3811..e3a39d05fb7 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -341,8 +341,7 @@ struct find_channelwise_convolution
         auto bcast_weights = m.insert_instruction(
             ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights);
 
-        m.replace_instruction(
-            ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights);
+        m.replace_instruction(ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights);
     }
 };
 
diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp
index 1367fd85076..e3483480d8e 100644
--- a/test/verify/test_channelwise_conv.cpp
+++ b/test/verify/test_channelwise_conv.cpp
@@ -28,8 +28,7 @@
 #include <migraphx/make_op.hpp>
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_depthwise
-    : verify_program<test_channelwise_conv_depthwise<DType>>
+struct test_channelwise_conv_depthwise : verify_program<test_channelwise_conv_depthwise<DType>>
 {
     migraphx::program create_program() const
     {
@@ -82,8 +81,7 @@ template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::float_type>
 template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::half_type>;
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_1d
-    : verify_program<test_channelwise_conv_1d<DType>>
+struct test_channelwise_conv_1d : verify_program<test_channelwise_conv_1d<DType>>
 {
     migraphx::program create_program() const
     {

From efeafca404d5b07be2dbcf5ec7face4b6831ad65 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:20:50 +0000
Subject: [PATCH 03/84] Use shared memory

---
 src/targets/gpu/jit/channelwise_conv.cpp      |  33 +++---
 .../migraphx/kernels/channelwise_conv.hpp     | 112 ++++++++++++++----
 2 files changed, 108 insertions(+), 37 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index e0101186e5b..7cbe07938ae 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -44,7 +44,7 @@ extern "C" {
 MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 {
     transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
-        channelwise_conv<${algo}>(index_ints<${kernel}>{}, output, x, w);
+        channelwise_conv(index_ints<${kernel}>{}, index_ints<${spatial}>{}, output, x, w);
     });
 }
 
@@ -56,32 +56,35 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
+    std::vector<std::string> names() const
+    {
+        return {"gpu::channelwise_conv", "channelwise_conv"};
+    }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s        = inputs.at(0);
-        const auto& out_s      = inputs.back();
-        options.inputs         = inputs;
-        options.output         = out_s;
-        options.kernel_name    = "channelwise_conv_kernel";
+        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s     = inputs.at(0);
+        const auto& out_s   = inputs.back();
+        options.inputs      = inputs;
+        options.output      = out_s;
+        options.kernel_name = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens = x_s.lens();
-        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
-        std::size_t kernel_total = 1;
-        for(auto k : kernel_sizes)
-            kernel_total *= k;
+        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2,
+                                              x_lens.begin() + 2 + num_spatial);
+        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end());
 
-        std::string algo       = "reduce::lane";
+        auto num_channels  = out_s.lens()[0] * out_s.lens()[1];
         std::size_t block_size = 256;
 
-        options.set_launch_params(v, compute_global_for(ctx, out_s.elements(), 256), block_size);
+        options.set_launch_params(v, num_channels * block_size, block_size);
 
         auto src = interpolate_string(channelwise_conv_kernel,
-                                      {{"algo", algo}, {"kernel", to_string_range(kernel_sizes)}});
+                                      {{"kernel", to_string_range(kernel_sizes)},
+                                       {"spatial", to_string_range(spatial_sizes)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 0ade178111a..b72056ba15c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -26,34 +26,102 @@
 #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
 
 #include <migraphx/kernels/index.hpp>
-#include <migraphx/kernels/ops.hpp>
-#include <migraphx/kernels/reduce.hpp>
-#include <migraphx/kernels/pooling.hpp>
+#include <migraphx/kernels/array.hpp>
 
 namespace migraphx {
 
-template <class Algo, class KernelLens, class Output, class Input1, class Input2>
-__device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x, Input2 w)
+template <class KernelLens, class SpatialLens, class Output, class Input1, class Input2>
+__device__ void channelwise_conv(KernelLens kernel_lens,
+                                 SpatialLens,
+                                 Output output,
+                                 Input1 x,
+                                 Input2 w)
 {
-    constexpr index_int NS           = array_size(KernelLens{});
-    constexpr index_int NDIM         = 2 + 2 * NS;
-    constexpr index_int kernel_total = KernelLens{}.product();
-
-    pooling_reduce<Algo, 1>(output, [&](auto out_idx, auto r) {
-        auto result = r.reduce(op::sum{}, 0, [&](auto ki) {
-            auto kmulti    = kernel_lens.multi(ki);
-            auto bcast_idx = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
-                if constexpr(d < 2)
-                    return out_idx[d];
-                else if constexpr(d < 2 + NS)
-                    return kmulti[d - _c<2>];
-                else
-                    return out_idx[d - _c<NS>] + kmulti[d - _c<2 + NS>];
-            });
-            return x[bcast_idx] * w[bcast_idx];
-        })(reduce::make_indices(_c<kernel_total>));
+    constexpr index_int NS            = array_size(KernelLens{});
+    constexpr index_int kernel_total  = KernelLens{}.product();
+    constexpr index_int spatial_total = SpatialLens{}.product();
+    constexpr index_int product_total = kernel_total * spatial_total;
+
+    constexpr auto out_spatial_lens = return_array_c([] {
+        constexpr auto kl          = KernelLens{};
+        constexpr auto sl          = SpatialLens{};
+        constexpr index_int ns     = array_size(KernelLens{});
+        array<index_int, ns> result;
+        for(index_int i = 0; i < ns; i++)
+            result[i] = sl[i] - kl[i] + 1;
         return result;
     });
+    constexpr index_int out_spatial_total = out_spatial_lens.product();
+
+    constexpr auto prod_lens = return_array_c([] {
+        constexpr auto kl              = KernelLens{};
+        constexpr auto sl              = SpatialLens{};
+        constexpr index_int ns         = array_size(KernelLens{});
+        array<index_int, 2 * ns> result;
+        for(index_int i = 0; i < ns; i++)
+            result[i] = kl[i];
+        for(index_int i = 0; i < ns; i++)
+            result[ns + i] = sl[i];
+        return result;
+    });
+    constexpr auto smem_shape = make_shape(prod_lens);
+
+    using T = typename Output::type;
+    __shared__ T smem[product_total];
+
+    auto idx = make_index();
+
+    index_int C = output.get_shape().lens[1];
+    auto n      = idx.group / C;
+    auto c      = idx.group % C;
+
+    // Phase 1: elementwise multiply into shared memory
+    for(index_int i = idx.local; i < product_total; i += idx.nlocal())
+    {
+        auto prod_multi = prod_lens.multi(i);
+        auto bcast_idx =
+            generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
+                if constexpr(d == 0)
+                    return n;
+                else if constexpr(d == 1)
+                    return c;
+                else
+                    return prod_multi[d - _c<2>];
+            });
+        smem[i] = x[bcast_idx] * w[bcast_idx];
+    }
+
+    __syncthreads();
+
+    auto smem_view = make_tensor_view(&smem[0], smem_shape);
+
+    // Phase 2: sliding window reduce from shared memory
+    for(index_int j = idx.local; j < out_spatial_total; j += idx.nlocal())
+    {
+        auto out_spatial = out_spatial_lens.multi(j);
+        T acc            = 0;
+        for(index_int ki = 0; ki < kernel_total; ki++)
+        {
+            auto k_multi  = kernel_lens.multi(ki);
+            auto smem_idx = generate_array<index_int>(_c<2 * NS>, [&](auto d) -> index_int {
+                if constexpr(d < NS)
+                    return k_multi[d];
+                else
+                    return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
+            });
+            acc += smem_view[smem_idx];
+        }
+
+        auto out_idx = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
+            if constexpr(d == 0)
+                return n;
+            else if constexpr(d == 1)
+                return c;
+            else
+                return out_spatial[d - _c<2>];
+        });
+        output[out_idx] = acc;
+    }
 }
 
 } // namespace migraphx

From 44989349ef9677d45fe22281f219b9afd0f9a487 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:20:56 +0000
Subject: [PATCH 04/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 22 ++++------
 .../migraphx/kernels/channelwise_conv.hpp     | 44 +++++++++----------
 2 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 7cbe07938ae..ad535485512 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,28 +56,24 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const
-    {
-        return {"gpu::channelwise_conv", "channelwise_conv"};
-    }
+    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s     = inputs.at(0);
-        const auto& out_s   = inputs.back();
-        options.inputs      = inputs;
-        options.output      = out_s;
-        options.kernel_name = "channelwise_conv_kernel";
+        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s        = inputs.at(0);
+        const auto& out_s      = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens = x_s.lens();
-        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2,
-                                              x_lens.begin() + 2 + num_spatial);
+        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
         std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end());
 
-        auto num_channels  = out_s.lens()[0] * out_s.lens()[1];
+        auto num_channels      = out_s.lens()[0] * out_s.lens()[1];
         std::size_t block_size = 256;
 
         options.set_launch_params(v, num_channels * block_size, block_size);
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index b72056ba15c..fef75d5573e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -31,21 +31,18 @@
 namespace migraphx {
 
 template <class KernelLens, class SpatialLens, class Output, class Input1, class Input2>
-__device__ void channelwise_conv(KernelLens kernel_lens,
-                                 SpatialLens,
-                                 Output output,
-                                 Input1 x,
-                                 Input2 w)
+__device__ void
+channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, Input2 w)
 {
     constexpr index_int NS            = array_size(KernelLens{});
     constexpr index_int kernel_total  = KernelLens{}.product();
     constexpr index_int spatial_total = SpatialLens{}.product();
     constexpr index_int product_total = kernel_total * spatial_total;
 
-    constexpr auto out_spatial_lens = return_array_c([] {
-        constexpr auto kl          = KernelLens{};
-        constexpr auto sl          = SpatialLens{};
-        constexpr index_int ns     = array_size(KernelLens{});
+    constexpr auto out_spatial_lens       = return_array_c([] {
+        constexpr auto kl      = KernelLens{};
+        constexpr auto sl      = SpatialLens{};
+        constexpr index_int ns = array_size(KernelLens{});
         array<index_int, ns> result;
         for(index_int i = 0; i < ns; i++)
             result[i] = sl[i] - kl[i] + 1;
@@ -53,10 +50,10 @@ __device__ void channelwise_conv(KernelLens kernel_lens,
     });
     constexpr index_int out_spatial_total = out_spatial_lens.product();
 
-    constexpr auto prod_lens = return_array_c([] {
-        constexpr auto kl              = KernelLens{};
-        constexpr auto sl              = SpatialLens{};
-        constexpr index_int ns         = array_size(KernelLens{});
+    constexpr auto prod_lens  = return_array_c([] {
+        constexpr auto kl      = KernelLens{};
+        constexpr auto sl      = SpatialLens{};
+        constexpr index_int ns = array_size(KernelLens{});
         array<index_int, 2 * ns> result;
         for(index_int i = 0; i < ns; i++)
             result[i] = kl[i];
@@ -79,16 +76,15 @@ __device__ void channelwise_conv(KernelLens kernel_lens,
     for(index_int i = idx.local; i < product_total; i += idx.nlocal())
     {
         auto prod_multi = prod_lens.multi(i);
-        auto bcast_idx =
-            generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
-                if constexpr(d == 0)
-                    return n;
-                else if constexpr(d == 1)
-                    return c;
-                else
-                    return prod_multi[d - _c<2>];
-            });
-        smem[i] = x[bcast_idx] * w[bcast_idx];
+        auto bcast_idx  = generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
+            if constexpr(d == 0)
+                return n;
+            else if constexpr(d == 1)
+                return c;
+            else
+                return prod_multi[d - _c<2>];
+        });
+        smem[i]         = x[bcast_idx] * w[bcast_idx];
     }
 
     __syncthreads();
@@ -112,7 +108,7 @@ __device__ void channelwise_conv(KernelLens kernel_lens,
             acc += smem_view[smem_idx];
         }
 
-        auto out_idx = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
+        auto out_idx    = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
             if constexpr(d == 0)
                 return n;
             else if constexpr(d == 1)

From 1792edbb745ae0465e9e0024fb6a8eb996c16334 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:23:08 +0000
Subject: [PATCH 05/84] Update slice functions

---
 .../include/migraphx/kernels/slice.hpp        | 261 +++++++++---------
 1 file changed, 135 insertions(+), 126 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
index 90c8b6a7dd6..00db73aebd0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
@@ -21,129 +21,138 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP
-#define MIGRAPHX_GUARD_KERNELS_SLICE_HPP
-
-#include <migraphx/kernels/shape.hpp>
-#include <migraphx/kernels/tensor_view.hpp>
-#include <migraphx/kernels/index.hpp>
-
-namespace migraphx {
-
-template <class Shape, class Size>
-constexpr auto slice_make_multi_lens(Shape, Size)
-{
-    return return_array_c([] {
-        auto n     = Size{} - _c<1>;
-        auto i     = Shape{}.multi(n);
-        using type = typename decltype(i)::value_type;
-        return i + type{1};
-    });
-}
-
-template <class Shape, class T, T... Xs>
-constexpr auto slice_make_multi_lens(Shape, integral_const_array<T, Xs...> x)
-{
-    return x;
-}
-
-template <class Shape, class Select>
-constexpr auto make_slice(Shape, Select select)
-{
-    auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int {
-        if(select(x, ii, Shape{}.lens.size()))
-            return x;
-        return 1;
-    });
-    return make_shape(inner_lens, Shape{}.strides);
-}
-
-template <class Shape, class Select, class Size>
-constexpr auto make_slice(Shape input, Select select, Size size)
-{
-    auto as   = make_slice(input, select);
-    auto lens = slice_make_multi_lens(as, size);
-    return make_shape(lens, Shape{}.strides);
-}
-
-template <class F>
-struct slice_size_transform
-{
-    F f;
-
-    template <class... Ts>
-    constexpr auto operator()(Ts... xs) const
-    {
-        return f(xs...);
-    }
-};
-MIGRAPHX_AUTO_DEDUCE(slice_size_transform);
-
-template <class Shape, class Select, class F>
-constexpr auto make_slice(Shape input, Select select, slice_size_transform<F> t)
-{
-    auto as   = make_slice(input, select);
-    auto lens = slice_make_multi_lens(as, decltype(t(input, as)){});
-    return make_shape(lens, Shape{}.strides);
-}
-
-template <class Shape, class... Ss>
-constexpr auto nslices(Shape input, Ss... ss)
-{
-    auto as = make_slice(input, ss...);
-    return input.elements() / as.elements();
-}
-
-template <index_int N>
-constexpr auto slice_group()
-{
-    return slice_size_transform{[](auto input, auto s) {
-        auto r = return_array_c([] {
-            auto lens = decltype(s){}.lens.base();
-            lens.back() *= N;
-            lens -= 1;
-            return decltype(input){}.lens.carry(lens) + index_int{1};
-        });
-        return r;
-    }};
-}
-
-template <index_int N>
-constexpr auto slice_split()
-{
-    return slice_size_transform{[](auto, auto s) { return s.elements() / _c<N>; }};
-}
-
-template <diff_int... Axes>
-constexpr auto slice_axes()
-{
-    return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); };
-}
-
-template <class Input, class T, class... Ss>
-constexpr auto slice_tensor(Input input, T start, Ss... ss)
-{
-    constexpr auto inner_shape = make_slice(get_shape_c<Input>{}, ss...);
-    auto outer_lens            = transform(get_shape_c<Input>{}.lens,
-                                inner_shape.lens,
-                                [=](auto x, auto inner) { return 1 + x - inner; });
-    auto outer_shape           = make_shape(outer_lens, get_shape_c<Input>{}.strides);
-    auto offset                = outer_shape.index(start);
-    MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c<Input>{}.element_space());
-    return make_tensor_view(input.data() + offset, inner_shape);
-}
-
-template <class Schedule, class... Ss>
-constexpr auto slice_schedule(index idx, Ss... ss)
-{
-    return [=](auto... xs) {
-        return [=](auto f) {
-            // TODO: Assert nslices is the same for all xs
-            constexpr auto n = nslices(get_shape_c<decltype(arg_c<0>()(xs...))>{}, ss...);
-            Schedule{idx}.group_stride(n, [&](auto i) { f(slice_tensor(xs, i, ss...)...); });
-        };
-    };
-}
-
-} // namespace migraphx
-#endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+ #ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+ #define MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+ 
+ #include <migraphx/kernels/shape.hpp>
+ #include <migraphx/kernels/tensor_view.hpp>
+ #include <migraphx/kernels/index.hpp>
+ 
+ namespace migraphx {
+ 
+ template <class Shape, class Size>
+ constexpr auto slice_make_multi_lens(Shape, Size)
+ {
+     return return_array_c([] {
+         auto n     = Size{} - _c<1>;
+         auto i     = Shape{}.multi(n);
+         using type = typename decltype(i)::value_type;
+         return i + type{1};
+     });
+ }
+ 
+ template <class Shape, class T, T... Xs>
+ constexpr auto slice_make_multi_lens(Shape, integral_const_array<T, Xs...> x)
+ {
+     return x;
+ }
+ 
+ template <class Shape, class Select>
+ constexpr auto make_slice(Shape, Select select)
+ {
+     auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int {
+         if(select(x, ii, Shape{}.lens.size()))
+             return x;
+         return 1;
+     });
+     return make_shape(inner_lens, Shape{}.strides);
+ }
+ 
+ template <class Shape, class Select, class Size>
+ constexpr auto make_slice(Shape input, Select select, Size size)
+ {
+     auto as   = make_slice(input, select);
+     auto lens = slice_make_multi_lens(as, size);
+     return make_shape(lens, Shape{}.strides);
+ }
+ 
+ template <class F>
+ struct slice_size_transform
+ {
+     F f;
+ 
+     template <class... Ts>
+     constexpr auto operator()(Ts... xs) const
+     {
+         return f(xs...);
+     }
+ };
+ MIGRAPHX_AUTO_DEDUCE(slice_size_transform);
+ 
+ template <class Shape, class Select, class F>
+ constexpr auto make_slice(Shape input, Select select, slice_size_transform<F> t)
+ {
+     auto as   = make_slice(input, select);
+     auto lens = slice_make_multi_lens(as, decltype(t(input, as)){});
+     return make_shape(lens, Shape{}.strides);
+ }
+ 
+ template <class Shape, class... Ss>
+ constexpr auto nslices(Shape input, Ss... ss)
+ {
+     auto as = make_slice(input, ss...);
+     return input.elements() / as.elements();
+ }
+ 
+ template <index_int N>
+ constexpr auto slice_group()
+ {
+     return slice_size_transform{[](auto input, auto s) {
+         auto r = return_array_c([] {
+             auto lens = decltype(s){}.lens.base();
+             lens.back() *= N;
+             lens -= 1;
+             return decltype(input){}.lens.carry(lens) + index_int{1};
+         });
+         return r;
+     }};
+ }
+ 
+ template <index_int N>
+ constexpr auto slice_split()
+ {
+     return slice_size_transform{[](auto, auto s) { return s.elements() / _c<N>; }};
+ }
+ 
+ template <diff_int... Axes>
+ constexpr auto slice_axes()
+ {
+     return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); };
+ }
+ 
+ template <class Input, class T, class... Ss>
+ constexpr auto slice_tensor(Input input, T start, Ss... ss)
+ {
+     constexpr auto inner_shape = make_slice(get_shape_c<Input>{}, ss...);
+     auto outer_lens            = transform(
+         get_shape_c<Input>{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; });
+     // TODO: Handle non-divisble dimensions
+     auto outer_shape = make_shape(outer_lens, get_shape_c<Input>{}.strides * inner_shape.lens);
+     auto offset                = outer_shape.index(start);
+     MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() ==
+                     input.get_shape().elements());
+     MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c<Input>{}.element_space());
+     return make_tensor_view(input.data() + offset, inner_shape);
+ }
+ 
+ template <class Schedule, class... Ss>
+ constexpr auto slice_schedule(index idx, Ss... ss)
+ {
+     return [=](auto... xs) {
+         return [=](auto f) {
+             constexpr auto first = get_shape_c<decltype(arg_c<0>()(xs...))>{};
+             constexpr auto n     = nslices(first, ss...);
+             MIGRAPHX_ASSERT(((n == nslices(get_shape_c<decltype(xs)>{}, ss...)) and ...));
+             Schedule{idx}.group_stride(n, [&](auto i) {
+                 MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n ==
+                                   xs.get_shape().elements()) and
+                                  ...));
+                 f(slice_tensor(xs, i, ss...)...);
+             });
+         };
+     };
+ }
+ 
+ } // namespace migraphx
+ #endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+ 
\ No newline at end of file

From 030497288acaa717b113619159a78b809b3f61c2 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:23:11 +0000
Subject: [PATCH 06/84] Format

---
 .../include/migraphx/kernels/slice.hpp        | 269 +++++++++---------
 1 file changed, 134 insertions(+), 135 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
index 00db73aebd0..4bc4d6354b6 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
@@ -21,138 +21,137 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
- #ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP
- #define MIGRAPHX_GUARD_KERNELS_SLICE_HPP
- 
- #include <migraphx/kernels/shape.hpp>
- #include <migraphx/kernels/tensor_view.hpp>
- #include <migraphx/kernels/index.hpp>
- 
- namespace migraphx {
- 
- template <class Shape, class Size>
- constexpr auto slice_make_multi_lens(Shape, Size)
- {
-     return return_array_c([] {
-         auto n     = Size{} - _c<1>;
-         auto i     = Shape{}.multi(n);
-         using type = typename decltype(i)::value_type;
-         return i + type{1};
-     });
- }
- 
- template <class Shape, class T, T... Xs>
- constexpr auto slice_make_multi_lens(Shape, integral_const_array<T, Xs...> x)
- {
-     return x;
- }
- 
- template <class Shape, class Select>
- constexpr auto make_slice(Shape, Select select)
- {
-     auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int {
-         if(select(x, ii, Shape{}.lens.size()))
-             return x;
-         return 1;
-     });
-     return make_shape(inner_lens, Shape{}.strides);
- }
- 
- template <class Shape, class Select, class Size>
- constexpr auto make_slice(Shape input, Select select, Size size)
- {
-     auto as   = make_slice(input, select);
-     auto lens = slice_make_multi_lens(as, size);
-     return make_shape(lens, Shape{}.strides);
- }
- 
- template <class F>
- struct slice_size_transform
- {
-     F f;
- 
-     template <class... Ts>
-     constexpr auto operator()(Ts... xs) const
-     {
-         return f(xs...);
-     }
- };
- MIGRAPHX_AUTO_DEDUCE(slice_size_transform);
- 
- template <class Shape, class Select, class F>
- constexpr auto make_slice(Shape input, Select select, slice_size_transform<F> t)
- {
-     auto as   = make_slice(input, select);
-     auto lens = slice_make_multi_lens(as, decltype(t(input, as)){});
-     return make_shape(lens, Shape{}.strides);
- }
- 
- template <class Shape, class... Ss>
- constexpr auto nslices(Shape input, Ss... ss)
- {
-     auto as = make_slice(input, ss...);
-     return input.elements() / as.elements();
- }
- 
- template <index_int N>
- constexpr auto slice_group()
- {
-     return slice_size_transform{[](auto input, auto s) {
-         auto r = return_array_c([] {
-             auto lens = decltype(s){}.lens.base();
-             lens.back() *= N;
-             lens -= 1;
-             return decltype(input){}.lens.carry(lens) + index_int{1};
-         });
-         return r;
-     }};
- }
- 
- template <index_int N>
- constexpr auto slice_split()
- {
-     return slice_size_transform{[](auto, auto s) { return s.elements() / _c<N>; }};
- }
- 
- template <diff_int... Axes>
- constexpr auto slice_axes()
- {
-     return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); };
- }
- 
- template <class Input, class T, class... Ss>
- constexpr auto slice_tensor(Input input, T start, Ss... ss)
- {
-     constexpr auto inner_shape = make_slice(get_shape_c<Input>{}, ss...);
-     auto outer_lens            = transform(
-         get_shape_c<Input>{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; });
-     // TODO: Handle non-divisble dimensions
-     auto outer_shape = make_shape(outer_lens, get_shape_c<Input>{}.strides * inner_shape.lens);
-     auto offset                = outer_shape.index(start);
-     MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() ==
-                     input.get_shape().elements());
-     MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c<Input>{}.element_space());
-     return make_tensor_view(input.data() + offset, inner_shape);
- }
- 
- template <class Schedule, class... Ss>
- constexpr auto slice_schedule(index idx, Ss... ss)
- {
-     return [=](auto... xs) {
-         return [=](auto f) {
-             constexpr auto first = get_shape_c<decltype(arg_c<0>()(xs...))>{};
-             constexpr auto n     = nslices(first, ss...);
-             MIGRAPHX_ASSERT(((n == nslices(get_shape_c<decltype(xs)>{}, ss...)) and ...));
-             Schedule{idx}.group_stride(n, [&](auto i) {
-                 MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n ==
-                                   xs.get_shape().elements()) and
-                                  ...));
-                 f(slice_tensor(xs, i, ss...)...);
-             });
-         };
-     };
- }
- 
- } // namespace migraphx
- #endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP
- 
\ No newline at end of file
+#ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+#define MIGRAPHX_GUARD_KERNELS_SLICE_HPP
+
+#include <migraphx/kernels/shape.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/index.hpp>
+
+namespace migraphx {
+
+template <class Shape, class Size>
+constexpr auto slice_make_multi_lens(Shape, Size)
+{
+    return return_array_c([] {
+        auto n     = Size{} - _c<1>;
+        auto i     = Shape{}.multi(n);
+        using type = typename decltype(i)::value_type;
+        return i + type{1};
+    });
+}
+
+template <class Shape, class T, T... Xs>
+constexpr auto slice_make_multi_lens(Shape, integral_const_array<T, Xs...> x)
+{
+    return x;
+}
+
+template <class Shape, class Select>
+constexpr auto make_slice(Shape, Select select)
+{
+    auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int {
+        if(select(x, ii, Shape{}.lens.size()))
+            return x;
+        return 1;
+    });
+    return make_shape(inner_lens, Shape{}.strides);
+}
+
+template <class Shape, class Select, class Size>
+constexpr auto make_slice(Shape input, Select select, Size size)
+{
+    auto as   = make_slice(input, select);
+    auto lens = slice_make_multi_lens(as, size);
+    return make_shape(lens, Shape{}.strides);
+}
+
+template <class F>
+struct slice_size_transform
+{
+    F f;
+
+    template <class... Ts>
+    constexpr auto operator()(Ts... xs) const
+    {
+        return f(xs...);
+    }
+};
+MIGRAPHX_AUTO_DEDUCE(slice_size_transform);
+
+template <class Shape, class Select, class F>
+constexpr auto make_slice(Shape input, Select select, slice_size_transform<F> t)
+{
+    auto as   = make_slice(input, select);
+    auto lens = slice_make_multi_lens(as, decltype(t(input, as)){});
+    return make_shape(lens, Shape{}.strides);
+}
+
+template <class Shape, class... Ss>
+constexpr auto nslices(Shape input, Ss... ss)
+{
+    auto as = make_slice(input, ss...);
+    return input.elements() / as.elements();
+}
+
+template <index_int N>
+constexpr auto slice_group()
+{
+    return slice_size_transform{[](auto input, auto s) {
+        auto r = return_array_c([] {
+            auto lens = decltype(s){}.lens.base();
+            lens.back() *= N;
+            lens -= 1;
+            return decltype(input){}.lens.carry(lens) + index_int{1};
+        });
+        return r;
+    }};
+}
+
+template <index_int N>
+constexpr auto slice_split()
+{
+    return slice_size_transform{[](auto, auto s) { return s.elements() / _c<N>; }};
+}
+
+template <diff_int... Axes>
+constexpr auto slice_axes()
+{
+    return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); };
+}
+
+template <class Input, class T, class... Ss>
+constexpr auto slice_tensor(Input input, T start, Ss... ss)
+{
+    constexpr auto inner_shape = make_slice(get_shape_c<Input>{}, ss...);
+    auto outer_lens            = transform(
+        get_shape_c<Input>{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; });
+    // TODO: Handle non-divisible dimensions
+    auto outer_shape = make_shape(outer_lens, get_shape_c<Input>{}.strides * inner_shape.lens);
+    auto offset      = outer_shape.index(start);
+    MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() ==
+                    input.get_shape().elements());
+    MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c<Input>{}.element_space());
+    return make_tensor_view(input.data() + offset, inner_shape);
+}
+
+template <class Schedule, class... Ss>
+constexpr auto slice_schedule(index idx, Ss... ss)
+{
+    return [=](auto... xs) {
+        return [=](auto f) {
+            constexpr auto first = get_shape_c<decltype(arg_c<0>()(xs...))>{};
+            constexpr auto n     = nslices(first, ss...);
+            MIGRAPHX_ASSERT(((n == nslices(get_shape_c<decltype(xs)>{}, ss...)) and ...));
+            Schedule{idx}.group_stride(n, [&](auto i) {
+                MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n ==
+                                  xs.get_shape().elements()) and
+                                 ...));
+                f(slice_tensor(xs, i, ss...)...);
+            });
+        };
+    };
+}
+
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP

From 1389ae5182e13f83d2363cc5a2f99ffa1158bb51 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:36:27 +0000
Subject: [PATCH 07/84] Update to use slices instead

---
 .../migraphx/kernels/channelwise_conv.hpp     | 97 +++++++++----------
 1 file changed, 47 insertions(+), 50 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index fef75d5573e..8aac289c2d7 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -27,6 +27,8 @@
 
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/array.hpp>
+#include <migraphx/kernels/slice.hpp>
+#include <migraphx/kernels/uninitialized_buffer.hpp>
 
 namespace migraphx {
 
@@ -39,7 +41,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
     constexpr index_int spatial_total = SpatialLens{}.product();
     constexpr index_int product_total = kernel_total * spatial_total;
 
-    constexpr auto out_spatial_lens       = return_array_c([] {
+    constexpr auto out_spatial_lens = return_array_c([] {
         constexpr auto kl      = KernelLens{};
         constexpr auto sl      = SpatialLens{};
         constexpr index_int ns = array_size(KernelLens{});
@@ -50,7 +52,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
     });
     constexpr index_int out_spatial_total = out_spatial_lens.product();
 
-    constexpr auto prod_lens  = return_array_c([] {
+    constexpr auto prod_lens = return_array_c([] {
         constexpr auto kl      = KernelLens{};
         constexpr auto sl      = SpatialLens{};
         constexpr index_int ns = array_size(KernelLens{});
@@ -61,63 +63,58 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
             result[ns + i] = sl[i];
         return result;
     });
-    constexpr auto smem_shape = make_shape(prod_lens);
+    constexpr auto prod_strides = calculate_strides(prod_lens);
 
     using T = typename Output::type;
-    __shared__ T smem[product_total];
+    __shared__ uninitialized_buffer<T, product_total> smem;
 
-    auto idx = make_index();
+    auto idx            = make_index();
+    auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; };
 
-    index_int C = output.get_shape().lens[1];
-    auto n      = idx.group / C;
-    auto c      = idx.group % C;
-
-    // Phase 1: elementwise multiply into shared memory
-    for(index_int i = idx.local; i < product_total; i += idx.nlocal())
-    {
-        auto prod_multi = prod_lens.multi(i);
-        auto bcast_idx  = generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
-            if constexpr(d == 0)
-                return n;
-            else if constexpr(d == 1)
-                return c;
-            else
-                return prod_multi[d - _c<2>];
-        });
-        smem[i]         = x[bcast_idx] * w[bcast_idx];
-    }
+    slice_schedule<per_block>(idx, keep_non_batch)(x, w, output)(
+        [&](auto x_ch, auto w_ch, auto out_ch) {
+            // Phase 1: elementwise multiply into shared memory
+            idx.local_stride(_c<product_total>, [&](auto i) {
+                auto prod_multi = prod_lens.multi(i);
+                auto ch_idx =
+                    generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
+                        if constexpr(d < 2)
+                            return 0;
+                        else
+                            return prod_multi[d - _c<2>];
+                    });
+                smem[i] = x_ch[ch_idx] * w_ch[ch_idx];
+            });
 
-    __syncthreads();
+            __syncthreads();
 
-    auto smem_view = make_tensor_view(&smem[0], smem_shape);
+            // Phase 2: sliding window reduce from shared memory
+            idx.local_stride(_c<out_spatial_total>, [&](auto j) {
+                auto out_spatial = out_spatial_lens.multi(j);
+                T acc            = 0;
+                for(index_int ki = 0; ki < kernel_total; ki++)
+                {
+                    auto k_multi  = kernel_lens.multi(ki);
+                    auto smem_idx = generate_array<index_int>(
+                        _c<2 * NS>, [&](auto d) -> index_int {
+                            if constexpr(d < NS)
+                                return k_multi[d];
+                            else
+                                return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
+                        });
+                    acc += smem[smem_idx.dot(prod_strides)];
+                }
 
-    // Phase 2: sliding window reduce from shared memory
-    for(index_int j = idx.local; j < out_spatial_total; j += idx.nlocal())
-    {
-        auto out_spatial = out_spatial_lens.multi(j);
-        T acc            = 0;
-        for(index_int ki = 0; ki < kernel_total; ki++)
-        {
-            auto k_multi  = kernel_lens.multi(ki);
-            auto smem_idx = generate_array<index_int>(_c<2 * NS>, [&](auto d) -> index_int {
-                if constexpr(d < NS)
-                    return k_multi[d];
-                else
-                    return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
+                auto out_idx =
+                    generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
+                        if constexpr(d < 2)
+                            return 0;
+                        else
+                            return out_spatial[d - _c<2>];
+                    });
+                out_ch[out_idx] = acc;
             });
-            acc += smem_view[smem_idx];
-        }
-
-        auto out_idx    = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
-            if constexpr(d == 0)
-                return n;
-            else if constexpr(d == 1)
-                return c;
-            else
-                return out_spatial[d - _c<2>];
         });
-        output[out_idx] = acc;
-    }
 }
 
 } // namespace migraphx

From 9c9b9a54ca93ffe4cb84e41979e24118c74cce02 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:36:30 +0000
Subject: [PATCH 08/84] Format

---
 .../migraphx/kernels/channelwise_conv.hpp     | 77 +++++++++----------
 1 file changed, 37 insertions(+), 40 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 8aac289c2d7..ac50e18aeaa 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -41,7 +41,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
     constexpr index_int spatial_total = SpatialLens{}.product();
     constexpr index_int product_total = kernel_total * spatial_total;
 
-    constexpr auto out_spatial_lens = return_array_c([] {
+    constexpr auto out_spatial_lens       = return_array_c([] {
         constexpr auto kl      = KernelLens{};
         constexpr auto sl      = SpatialLens{};
         constexpr index_int ns = array_size(KernelLens{});
@@ -52,7 +52,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
     });
     constexpr index_int out_spatial_total = out_spatial_lens.product();
 
-    constexpr auto prod_lens = return_array_c([] {
+    constexpr auto prod_lens    = return_array_c([] {
         constexpr auto kl      = KernelLens{};
         constexpr auto sl      = SpatialLens{};
         constexpr index_int ns = array_size(KernelLens{});
@@ -71,50 +71,47 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I
     auto idx            = make_index();
     auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; };
 
-    slice_schedule<per_block>(idx, keep_non_batch)(x, w, output)(
-        [&](auto x_ch, auto w_ch, auto out_ch) {
-            // Phase 1: elementwise multiply into shared memory
-            idx.local_stride(_c<product_total>, [&](auto i) {
-                auto prod_multi = prod_lens.multi(i);
-                auto ch_idx =
-                    generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
-                        if constexpr(d < 2)
-                            return 0;
-                        else
-                            return prod_multi[d - _c<2>];
-                    });
-                smem[i] = x_ch[ch_idx] * w_ch[ch_idx];
+    slice_schedule<per_block>(idx,
+                              keep_non_batch)(x, w, output)([&](auto x_ch, auto w_ch, auto out_ch) {
+        // Phase 1: elementwise multiply into shared memory
+        idx.local_stride(_c<product_total>, [&](auto i) {
+            auto prod_multi = prod_lens.multi(i);
+            auto ch_idx     = generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
+                if constexpr(d < 2)
+                    return 0;
+                else
+                    return prod_multi[d - _c<2>];
             });
+            smem[i]         = x_ch[ch_idx] * w_ch[ch_idx];
+        });
 
-            __syncthreads();
+        __syncthreads();
 
-            // Phase 2: sliding window reduce from shared memory
-            idx.local_stride(_c<out_spatial_total>, [&](auto j) {
-                auto out_spatial = out_spatial_lens.multi(j);
-                T acc            = 0;
-                for(index_int ki = 0; ki < kernel_total; ki++)
-                {
-                    auto k_multi  = kernel_lens.multi(ki);
-                    auto smem_idx = generate_array<index_int>(
-                        _c<2 * NS>, [&](auto d) -> index_int {
-                            if constexpr(d < NS)
-                                return k_multi[d];
-                            else
-                                return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
-                        });
-                    acc += smem[smem_idx.dot(prod_strides)];
-                }
+        // Phase 2: sliding window reduce from shared memory
+        idx.local_stride(_c<out_spatial_total>, [&](auto j) {
+            auto out_spatial = out_spatial_lens.multi(j);
+            T acc            = 0;
+            for(index_int ki = 0; ki < kernel_total; ki++)
+            {
+                auto k_multi  = kernel_lens.multi(ki);
+                auto smem_idx = generate_array<index_int>(_c<2 * NS>, [&](auto d) -> index_int {
+                    if constexpr(d < NS)
+                        return k_multi[d];
+                    else
+                        return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
+                });
+                acc += smem[smem_idx.dot(prod_strides)];
+            }
 
-                auto out_idx =
-                    generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
-                        if constexpr(d < 2)
-                            return 0;
-                        else
-                            return out_spatial[d - _c<2>];
-                    });
-                out_ch[out_idx] = acc;
+            auto out_idx    = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
+                if constexpr(d < 2)
+                    return 0;
+                else
+                    return out_spatial[d - _c<2>];
             });
+            out_ch[out_idx] = acc;
         });
+    });
 }
 
 } // namespace migraphx

From 207e5d6d85d87fd673ac38040983e05ab5c550c6 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 11:53:43 -0600
Subject: [PATCH 09/84] Add reduce_schedule for outer batches

---
 src/targets/gpu/jit/channelwise_conv.cpp      |  23 ++--
 .../migraphx/kernels/channelwise_conv.hpp     | 128 ++++++++----------
 .../include/migraphx/kernels/index.hpp        |  29 ++++
 .../include/migraphx/kernels/reduce.hpp       |  20 +--
 src/targets/gpu/prefuse_ops.cpp               |  64 ++-------
 5 files changed, 123 insertions(+), 141 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index ad535485512..2d7ce51a853 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,22 +56,27 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
+    std::vector<std::string> names() const
+    {
+        return {"gpu::channelwise_conv", "channelwise_conv"};
+    }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s        = inputs.at(0);
-        const auto& out_s      = inputs.back();
-        options.inputs         = inputs;
-        options.output         = out_s;
-        options.kernel_name    = "channelwise_conv_kernel";
+        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s     = inputs.at(0);
+        const auto& w_s     = inputs.at(1);
+        const auto& out_s   = inputs.back();
+        options.inputs      = inputs;
+        options.output      = out_s;
+        options.kernel_name = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens = x_s.lens();
-        std::vector<std::size_t> kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
-        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end());
+        auto w_lens = w_s.lens();
+        std::vector<std::size_t> kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial);
+        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
 
         auto num_channels      = out_s.lens()[0] * out_s.lens()[1];
         std::size_t block_size = 256;
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index ac50e18aeaa..2a9795682a5 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -26,90 +26,78 @@
 #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
 
 #include <migraphx/kernels/index.hpp>
-#include <migraphx/kernels/array.hpp>
+#include <migraphx/kernels/algorithm.hpp>
 #include <migraphx/kernels/slice.hpp>
+#include <migraphx/kernels/copy.hpp>
+#include <migraphx/kernels/reduce.hpp>
 #include <migraphx/kernels/uninitialized_buffer.hpp>
 
 namespace migraphx {
 
-template <class KernelLens, class SpatialLens, class Output, class Input1, class Input2>
+template <class Output, class F>
+__device__ void per_block_pooling_reduce(index idx, Output output, F f)
+{
+    constexpr auto nelements = get_shape_c<Output>{}.elements();
+    idx.local_stride(nelements, [&](auto i) {
+        auto out_idx = get_shape_c<Output>{}.multi(i);
+        auto slicer  = [](auto input) { return reduce_slice<decltype(output)>(input, 0); };
+        auto r       = reduce::lane::make(idx, slicer);
+        r.outer([&] { output[out_idx] = f(out_idx, r); });
+    });
+}
+
+template <class KernelLens, class SpatialLens, class Output, class Input, class Weights>
 __device__ void
-channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, Input2 w)
+channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
 {
-    constexpr index_int NS            = array_size(KernelLens{});
     constexpr index_int kernel_total  = KernelLens{}.product();
     constexpr index_int spatial_total = SpatialLens{}.product();
-    constexpr index_int product_total = kernel_total * spatial_total;
-
-    constexpr auto out_spatial_lens       = return_array_c([] {
-        constexpr auto kl      = KernelLens{};
-        constexpr auto sl      = SpatialLens{};
-        constexpr index_int ns = array_size(KernelLens{});
-        array<index_int, ns> result;
-        for(index_int i = 0; i < ns; i++)
-            result[i] = sl[i] - kl[i] + 1;
-        return result;
-    });
-    constexpr index_int out_spatial_total = out_spatial_lens.product();
-
-    constexpr auto prod_lens    = return_array_c([] {
-        constexpr auto kl      = KernelLens{};
-        constexpr auto sl      = SpatialLens{};
-        constexpr index_int ns = array_size(KernelLens{});
-        array<index_int, 2 * ns> result;
-        for(index_int i = 0; i < ns; i++)
-            result[i] = kl[i];
-        for(index_int i = 0; i < ns; i++)
-            result[ns + i] = sl[i];
-        return result;
-    });
-    constexpr auto prod_strides = calculate_strides(prod_lens);
+
+    constexpr index_int N     = get_shape_c<Output>{}.lens[0];
+    constexpr index_int C_out = get_shape_c<Output>{}.lens[1];
+    constexpr index_int C_in  = get_shape_c<Input>{}.lens[1];
+
+    constexpr auto smem_shape  = make_packed_shape(make_slice(get_shape_c<Input>{},
+        [](auto, auto i, auto) { return i >= 2; }));
+    constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c<Weights>{},
+        [](auto, auto i, auto) { return i >= 2; }));
+
+    constexpr auto out_nc = make_shape(index_ints<N, C_out>{});
+    constexpr auto co_cin = make_shape(index_ints<C_out / C_in, C_in>{});
+    constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
 
     using T = typename Output::type;
-    __shared__ uninitialized_buffer<T, product_total> smem;
-
-    auto idx            = make_index();
-    auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; };
-
-    slice_schedule<per_block>(idx,
-                              keep_non_batch)(x, w, output)([&](auto x_ch, auto w_ch, auto out_ch) {
-        // Phase 1: elementwise multiply into shared memory
-        idx.local_stride(_c<product_total>, [&](auto i) {
-            auto prod_multi = prod_lens.multi(i);
-            auto ch_idx     = generate_array<index_int>(_c<2 + 2 * NS>, [&](auto d) -> index_int {
-                if constexpr(d < 2)
-                    return 0;
-                else
-                    return prod_multi[d - _c<2>];
-            });
-            smem[i]         = x_ch[ch_idx] * w_ch[ch_idx];
-        });
+    __shared__ uninitialized_buffer<T, spatial_total> smem;
+
+    auto idx          = make_index();
+    auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
+
+    slice_schedule<per_block>(idx, keep_spatial)(output)([&](auto out_ch) {
+        auto nc_multi = out_nc.multi(idx.group);
+        auto n        = nc_multi[0];
+        auto co       = nc_multi[1];
+        auto c_in     = co_cin.multi(co)[1];
+
+        auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial);
+        auto w_ch = slice_tensor(w, co, keep_spatial);
+
+        // Phase 1: copy input channel into shared memory
+        auto smem_input = make_tensor_view(smem.data(), smem_shape);
+        local_tensor_copy(idx, x_ch, smem_input);
+
+        // Phase 2: copy weights into registers
+        array<T, kernel_total> wregs_arr;
+        auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape);
+        copy(w_ch.begin(), w_ch.end(), wregs.begin());
 
         __syncthreads();
 
-        // Phase 2: sliding window reduce from shared memory
-        idx.local_stride(_c<out_spatial_total>, [&](auto j) {
-            auto out_spatial = out_spatial_lens.multi(j);
-            T acc            = 0;
-            for(index_int ki = 0; ki < kernel_total; ki++)
-            {
-                auto k_multi  = kernel_lens.multi(ki);
-                auto smem_idx = generate_array<index_int>(_c<2 * NS>, [&](auto d) -> index_int {
-                    if constexpr(d < NS)
-                        return k_multi[d];
-                    else
-                        return out_spatial[d - _c<NS>] + k_multi[d - _c<NS>];
-                });
-                acc += smem[smem_idx.dot(prod_strides)];
-            }
-
-            auto out_idx    = generate_array<index_int>(_c<2 + NS>, [&](auto d) -> index_int {
-                if constexpr(d < 2)
-                    return 0;
-                else
-                    return out_spatial[d - _c<2>];
-            });
-            out_ch[out_idx] = acc;
+        // Phase 3: sliding window multiply-reduce
+        per_block_pooling_reduce(idx, out_ch, [&](auto out_idx, auto r) {
+            return r.reduce(op::sum{}, T{0}, [&](auto ki) {
+                auto k_multi = wregs_shape.multi(ki);
+                return smem_input[out_idx + k_multi] * wregs[k_multi];
+            })(reduce::make_indices(_c<kernel_total>));
         });
     });
 }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index 77da7283190..4b9de7ae7ce 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -267,6 +267,12 @@ struct index
         }
     }
 
+    template <class F, class N>
+    __device__ void device_stride(N n, F f) const
+    {
+        for_stride<false>(_c<0>, n, _c<1>, f);
+    }
+
     template <class F, class N>
     __device__ void global_stride(N n, F f) const
     {
@@ -333,5 +339,28 @@ struct per_block
     }
 };
 
+struct per_device
+{
+    index idx;
+
+    constexpr auto local() const { return idx.global; }
+
+    constexpr auto nlocal() const { return idx.nglobal(); }
+
+    constexpr auto size() const { return _c<1>; }
+
+    template <class N, class F>
+    constexpr void group_stride(N n, F f) const
+    {
+        return idx.device_stride(n, f);
+    }
+
+    template <class N, class F>
+    constexpr void local_stride(N n, F f) const
+    {
+        return idx.global_stride(n, f);
+    }
+};
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_INDEX_HPP
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
index 59bf17b3eda..ac29a3174c1 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -516,12 +516,13 @@ struct block
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class F>
+    template <class Output, class Schedule=per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
+        auto schedule = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
+        schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -570,12 +571,13 @@ struct block_large
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class F>
+    template <class Output, class Schedule=per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
+        auto schedule = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
+        schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -648,12 +650,13 @@ struct subwave
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class F>
+    template <class Output, class Schedule=per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
+        auto schedule = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        idx.global_stride(nelements * idx.nlocal_subwave<SubWaveSize>(), [&](auto i) {
+        schedule.local_stride(nelements * idx.nlocal_subwave<SubWaveSize>(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal_subwave<SubWaveSize>());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -709,12 +712,13 @@ struct lane
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class F>
+    template <class Output, class Schedule=per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
+        auto schedule = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        idx.global_stride(nelements, [&](auto i) {
+        schedule.local_stride(nelements, [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i);
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index e3a39d05fb7..34f19869660 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -27,11 +27,9 @@
 #include <migraphx/gpu/gemm_softmax_gemm.hpp>
 #include <migraphx/match/layernorm.hpp>
 #include <migraphx/register_op.hpp>
-#include <migraphx/make_op.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
-#include <numeric>
 #ifdef MIGRAPHX_USE_COMPOSABLEKERNEL
 #include <migraphx/gpu/ck.hpp>
 #endif
@@ -254,16 +252,13 @@ struct channelwise_conv
     shape compute_shape(std::vector<shape> inputs) const
     {
         check_shapes{inputs, *this}.has(2);
-        auto lens = inputs.front().lens();
+        auto x_lens = inputs[0].lens();
+        auto w_lens = inputs[1].lens();
         std::vector<std::size_t> out_lens;
-        out_lens.push_back(lens[0]);
-        out_lens.push_back(lens[1]);
+        out_lens.push_back(x_lens[0]);
+        out_lens.push_back(w_lens[0]);
         for(std::size_t d = 0; d < num_spatial; ++d)
-        {
-            auto kernel_size  = lens[2 + d];
-            auto spatial_size = lens[2 + num_spatial + d];
-            out_lens.push_back(spatial_size - kernel_size + 1);
-        }
+            out_lens.push_back(x_lens[2 + d] - w_lens[2 + d] + 1);
         return {inputs.front().type(), out_lens};
     }
 };
@@ -297,51 +292,12 @@ struct find_channelwise_convolution
 
     void apply(module& m, const match::matcher_result& r) const
     {
-        auto ins = r.result;
-
-        auto input   = ins->inputs().front();
-        auto weights = ins->inputs().back();
-
-        auto w_lens      = weights->get_shape().lens();
-        auto x_lens      = input->get_shape().lens();
-        auto ndim        = ins->get_shape().ndim();
-        auto num_spatial = ndim - 2;
-
-        // Build product shape: [N, C, k_0, ..., k_{ns-1}, s_0, ..., s_{ns-1}]
-        std::vector<std::size_t> prod_lens;
-        prod_lens.push_back(x_lens[0]);
-        prod_lens.push_back(w_lens[0]);
-        for(std::size_t d = 2; d < ndim; ++d)
-            prod_lens.push_back(w_lens[d]);
-        for(std::size_t d = 2; d < ndim; ++d)
-            prod_lens.push_back(x_lens[d]);
-
-        // Unsqueeze input: [N, C_in, H, W] -> [N, C_in, 1, ..., 1, H, W]
-        std::vector<int64_t> input_unsq_axes(num_spatial);
-        std::iota(input_unsq_axes.begin(), input_unsq_axes.end(), 2);
-        auto unsq_input =
-            m.insert_instruction(ins, make_op("unsqueeze", {{"axes", input_unsq_axes}}), input);
-
-        // Broadcast input to product shape
-        auto bcast_input = m.insert_instruction(
-            ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_input);
-
-        // Squeeze weight axis 1: [C_out, 1, k_0, ...] -> [C_out, k_0, ...]
-        auto sq_weights = m.insert_instruction(ins, make_op("squeeze", {{"axes", {1}}}), weights);
-
-        // Unsqueeze weight: [C_out, k_0, ...] -> [1, C_out, k_0, ..., 1, ..., 1]
-        std::vector<int64_t> w_unsq_axes;
-        w_unsq_axes.push_back(0);
-        for(std::size_t d = 0; d < num_spatial; ++d)
-            w_unsq_axes.push_back(static_cast<int64_t>(2 + num_spatial + d));
-        auto unsq_weights =
-            m.insert_instruction(ins, make_op("unsqueeze", {{"axes", w_unsq_axes}}), sq_weights);
-
-        // Broadcast weight to product shape
-        auto bcast_weights = m.insert_instruction(
-            ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights);
+        auto ins         = r.result;
+        auto input       = ins->inputs().front();
+        auto weights     = ins->inputs().back();
+        auto num_spatial = ins->get_shape().ndim() - 2;
 
-        m.replace_instruction(ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights);
+        m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights);
     }
 };
 

From cdae8f459993bb67518b96aec2037d1fd478126b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 11:53:46 -0600
Subject: [PATCH 10/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 22 +++++++++----------
 .../migraphx/kernels/channelwise_conv.hpp     | 11 +++++-----
 .../include/migraphx/kernels/reduce.hpp       | 16 +++++++-------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 2d7ce51a853..e79116d9003 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,27 +56,25 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const
-    {
-        return {"gpu::channelwise_conv", "channelwise_conv"};
-    }
+    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s     = inputs.at(0);
-        const auto& w_s     = inputs.at(1);
-        const auto& out_s   = inputs.back();
-        options.inputs      = inputs;
-        options.output      = out_s;
-        options.kernel_name = "channelwise_conv_kernel";
+        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s        = inputs.at(0);
+        const auto& w_s        = inputs.at(1);
+        const auto& out_s      = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens = x_s.lens();
         auto w_lens = w_s.lens();
         std::vector<std::size_t> kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial);
-        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial);
+        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2,
+                                               x_lens.begin() + 2 + num_spatial);
 
         auto num_channels      = out_s.lens()[0] * out_s.lens()[1];
         std::size_t block_size = 256;
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 2a9795682a5..4b3f8e2d9a7 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -47,8 +47,7 @@ __device__ void per_block_pooling_reduce(index idx, Output output, F f)
 }
 
 template <class KernelLens, class SpatialLens, class Output, class Input, class Weights>
-__device__ void
-channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
+__device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
 {
     constexpr index_int kernel_total  = KernelLens{}.product();
     constexpr index_int spatial_total = SpatialLens{}.product();
@@ -57,10 +56,10 @@ channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
     constexpr index_int C_out = get_shape_c<Output>{}.lens[1];
     constexpr index_int C_in  = get_shape_c<Input>{}.lens[1];
 
-    constexpr auto smem_shape  = make_packed_shape(make_slice(get_shape_c<Input>{},
-        [](auto, auto i, auto) { return i >= 2; }));
-    constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c<Weights>{},
-        [](auto, auto i, auto) { return i >= 2; }));
+    constexpr auto smem_shape = make_packed_shape(
+        make_slice(get_shape_c<Input>{}, [](auto, auto i, auto) { return i >= 2; }));
+    constexpr auto wregs_shape = make_packed_shape(
+        make_slice(get_shape_c<Weights>{}, [](auto, auto i, auto) { return i >= 2; }));
 
     constexpr auto out_nc = make_shape(index_ints<N, C_out>{});
     constexpr auto co_cin = make_shape(index_ints<C_out / C_in, C_in>{});
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
index ac29a3174c1..0abae0363d7 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -516,11 +516,11 @@ struct block
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule=per_device, class F>
+    template <class Output, class Schedule = per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule = Schedule{idx};
+        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
         schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
@@ -571,11 +571,11 @@ struct block_large
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule=per_device, class F>
+    template <class Output, class Schedule = per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule = Schedule{idx};
+        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
         schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
@@ -650,11 +650,11 @@ struct subwave
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule=per_device, class F>
+    template <class Output, class Schedule = per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule = Schedule{idx};
+        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
         schedule.local_stride(nelements * idx.nlocal_subwave<SubWaveSize>(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal_subwave<SubWaveSize>());
@@ -712,11 +712,11 @@ struct lane
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule=per_device, class F>
+    template <class Output, class Schedule = per_device, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule = Schedule{idx};
+        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
         schedule.local_stride(nelements, [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i);

From b51b82fe47261fbe1838ee63ae21c802e5500c37 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 12:08:36 -0600
Subject: [PATCH 11/84] Use pooling_reduce

---
 .../include/migraphx/kernels/channelwise_conv.hpp | 15 ++-------------
 .../kernels/include/migraphx/kernels/pooling.hpp  |  6 +++---
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 4b3f8e2d9a7..fb070c6279d 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -31,21 +31,10 @@
 #include <migraphx/kernels/copy.hpp>
 #include <migraphx/kernels/reduce.hpp>
 #include <migraphx/kernels/uninitialized_buffer.hpp>
+#include <migraphx/kernels/pooling.hpp>
 
 namespace migraphx {
 
-template <class Output, class F>
-__device__ void per_block_pooling_reduce(index idx, Output output, F f)
-{
-    constexpr auto nelements = get_shape_c<Output>{}.elements();
-    idx.local_stride(nelements, [&](auto i) {
-        auto out_idx = get_shape_c<Output>{}.multi(i);
-        auto slicer  = [](auto input) { return reduce_slice<decltype(output)>(input, 0); };
-        auto r       = reduce::lane::make(idx, slicer);
-        r.outer([&] { output[out_idx] = f(out_idx, r); });
-    });
-}
-
 template <class KernelLens, class SpatialLens, class Output, class Input, class Weights>
 __device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
 {
@@ -92,7 +81,7 @@ __device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x
         __syncthreads();
 
         // Phase 3: sliding window multiply-reduce
-        per_block_pooling_reduce(idx, out_ch, [&](auto out_idx, auto r) {
+        pooling_reduce<reduce::lane, 1, per_block>(out_ch, [&](auto out_idx, auto r) {
             return r.reduce(op::sum{}, T{0}, [&](auto ki) {
                 auto k_multi = wregs_shape.multi(ki);
                 return smem_input[out_idx + k_multi] * wregs[k_multi];
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
index 76bb7c3cb6b..5a236084c47 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
@@ -186,18 +186,18 @@ constexpr window<Window, Stride, Padding> make_window(Window w, Stride s, Paddin
     return {w, s, p};
 }
 
-template <class Algo, index_int GroupSize, class Output, class F>
+template <class Algo, index_int GroupSize, class Schedule= per_device, class Output, class F>
 __device__ void pooling_reduce(Output output, F f)
 {
     if constexpr(GroupSize < 2)
     {
-        Algo::template run<decltype(output)>(
+        Algo::template run<decltype(output), Schedule>(
             [&](auto out_idx, auto r) { r.outer([&] { output[out_idx] = f(out_idx, r); }); });
     }
     else
     {
         auto goutput = as_vec<GroupSize>(output, output.get_shape().lens.size() - _c<1>);
-        Algo::template run<decltype(goutput)>([&](auto out_idx, auto r) {
+        Algo::template run<decltype(goutput), Schedule>([&](auto out_idx, auto r) {
             auto i = out_idx;
             i.back() *= GroupSize;
             auto result = vec_generate<GroupSize>([&](auto) {

From b5f4f0f47c5fcb6062a9eb863a029b8c63724cf2 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 12:08:40 -0600
Subject: [PATCH 12/84] Format

---
 src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
index 5a236084c47..410deefb9be 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
@@ -186,7 +186,7 @@ constexpr window<Window, Stride, Padding> make_window(Window w, Stride s, Paddin
     return {w, s, p};
 }
 
-template <class Algo, index_int GroupSize, class Schedule= per_device, class Output, class F>
+template <class Algo, index_int GroupSize, class Schedule = per_device, class Output, class F>
 __device__ void pooling_reduce(Output output, F f)
 {
     if constexpr(GroupSize < 2)

From 15fd39f27bcb7fd929c2357c174fdeeefe73b15c Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 21:23:51 +0000
Subject: [PATCH 13/84] Some refactoring to use tiling

---
 src/targets/gpu/jit/channelwise_conv.cpp      |  65 +++++---
 .../migraphx/kernels/channelwise_conv.hpp     | 147 +++++++++++++-----
 test/verify/test_channelwise_conv.cpp         |  35 +++++
 3 files changed, 188 insertions(+), 59 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index e79116d9003..32aae6c2c2a 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -44,7 +44,7 @@ extern "C" {
 MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 {
     transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
-        channelwise_conv(index_ints<${kernel}>{}, index_ints<${spatial}>{}, output, x, w);
+        channelwise_conv(index_ints<${tile}>{}, output, x, w);
     });
 }
 
@@ -56,34 +56,57 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
+    std::vector<std::string> names() const
+    {
+        return {"gpu::channelwise_conv", "channelwise_conv"};
+    }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s        = inputs.at(0);
-        const auto& w_s        = inputs.at(1);
-        const auto& out_s      = inputs.back();
-        options.inputs         = inputs;
-        options.output         = out_s;
-        options.kernel_name    = "channelwise_conv_kernel";
+        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s     = inputs.at(0);
+        const auto& w_s     = inputs.at(1);
+        const auto& out_s   = inputs.back();
+        options.inputs      = inputs;
+        options.output      = out_s;
+        options.kernel_name = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
-        auto x_lens = x_s.lens();
-        auto w_lens = w_s.lens();
-        std::vector<std::size_t> kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial);
-        std::vector<std::size_t> spatial_sizes(x_lens.begin() + 2,
-                                               x_lens.begin() + 2 + num_spatial);
-
-        auto num_channels      = out_s.lens()[0] * out_s.lens()[1];
-        std::size_t block_size = 256;
-
-        options.set_launch_params(v, num_channels * block_size, block_size);
+        auto x_lens   = x_s.lens();
+        auto w_lens   = w_s.lens();
+        auto out_lens = out_s.lens();
+
+        // Tile dimensions: for 2D use 8xH, 32xW; for 1D use 256
+        std::vector<std::size_t> tile_sizes(num_spatial);
+        if(num_spatial == 1)
+        {
+            tile_sizes[0] = 256;
+        }
+        else
+        {
+            tile_sizes[0] = 8;
+            tile_sizes[num_spatial - 1] = 32;
+            for(std::size_t d = 1; d + 1 < num_spatial; ++d)
+                tile_sizes[d] = 1;
+        }
+
+        std::size_t block_size = 1;
+        for(auto t : tile_sizes)
+            block_size *= t;
+
+        // Compute number of tiles per spatial dim: ceil(out_spatial / tile)
+        std::size_t num_blocks = out_lens[0] * out_lens[1];
+        for(std::size_t d = 0; d < num_spatial; ++d)
+        {
+            auto out_spatial = out_lens[2 + d];
+            num_blocks *= (out_spatial + tile_sizes[d] - 1) / tile_sizes[d];
+        }
+
+        options.set_launch_params(v, num_blocks * block_size, block_size);
 
         auto src = interpolate_string(channelwise_conv_kernel,
-                                      {{"kernel", to_string_range(kernel_sizes)},
-                                       {"spatial", to_string_range(spatial_sizes)}});
+                                      {{"tile", to_string_range(tile_sizes)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index fb070c6279d..cf88dcd8ae0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -35,58 +35,129 @@
 
 namespace migraphx {
 
-template <class KernelLens, class SpatialLens, class Output, class Input, class Weights>
-__device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w)
+template <class Pos, class Lens>
+constexpr bool in_bounds(Pos pos, Lens lens)
 {
-    constexpr index_int kernel_total  = KernelLens{}.product();
-    constexpr index_int spatial_total = SpatialLens{}.product();
+    for(index_int d = 0; d < pos.size(); d++)
+    {
+        if(pos[d] >= lens[d])
+            return false;
+    }
+    return true;
+}
+
+template <class TileLens, class Output, class Input, class Weights>
+__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
+{
+    auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
 
     constexpr index_int N     = get_shape_c<Output>{}.lens[0];
     constexpr index_int C_out = get_shape_c<Output>{}.lens[1];
     constexpr index_int C_in  = get_shape_c<Input>{}.lens[1];
 
-    constexpr auto smem_shape = make_packed_shape(
-        make_slice(get_shape_c<Input>{}, [](auto, auto i, auto) { return i >= 2; }));
-    constexpr auto wregs_shape = make_packed_shape(
-        make_slice(get_shape_c<Weights>{}, [](auto, auto i, auto) { return i >= 2; }));
+    // Derive spatial and kernel lens from input shapes (already full-rank)
+    constexpr auto spatial_lens = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
+    constexpr auto kernel_lens  = make_slice(get_shape_c<Weights>{}, keep_spatial).lens;
+    constexpr auto wregs_shape  = make_packed_shape(make_slice(get_shape_c<Weights>{}, keep_spatial));
+
+    constexpr index_int kernel_total = kernel_lens.product();
 
     constexpr auto out_nc = make_shape(index_ints<N, C_out>{});
     constexpr auto co_cin = make_shape(index_ints<C_out / C_in, C_in>{});
     constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
 
+    // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix
+    constexpr auto tile_lens        = return_array_c([] {
+        constexpr auto sl          = decltype(spatial_lens){};
+        constexpr auto tl          = TileLens{};
+        constexpr index_int nd     = sl.size();
+        constexpr index_int ns     = array_size(TileLens{});
+        array<index_int, nd> result;
+        result[0] = 1;
+        result[1] = 1;
+        for(index_int i = 0; i < ns; i++)
+            result[2 + i] = tl[i];
+        return result;
+    });
+    constexpr auto halo_lens        = transform(tile_lens, kernel_lens,
+        [](auto t, auto k) { return t + k - 1; });
+    constexpr auto out_spatial_lens = transform(spatial_lens, kernel_lens,
+        [](auto s, auto k) { return s - k + 1; });
+    constexpr auto tiles_per_dim    = transform(out_spatial_lens, tile_lens,
+        [](auto o, auto t) { return (o + t - 1) / t; });
+
+    constexpr auto tile_shape      = make_shape(tile_lens);
+    constexpr auto halo_shape      = make_shape(halo_lens);
+    constexpr index_int halo_total = halo_lens.product();
+    constexpr index_int tile_total = tile_lens.product();
+
+    // Block shape: [N, C_out, tiles_h, tiles_w]
+    constexpr auto block_lens = return_array_c([] {
+        constexpr auto tpd     = decltype(tiles_per_dim){};
+        constexpr index_int nd = tpd.size();
+        array<index_int, nd> result;
+        for(index_int i = 0; i < nd; i++)
+            result[i] = tpd[i];
+        result[0] = N;
+        result[1] = C_out;
+        return result;
+    });
+    constexpr auto block_shape = make_shape(block_lens);
+
     using T = typename Output::type;
-    __shared__ uninitialized_buffer<T, spatial_total> smem;
+    __shared__ uninitialized_buffer<T, halo_total> smem;
 
-    auto idx          = make_index();
-    auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
+    auto idx = make_index();
+
+    // Decompose block index
+    auto block_multi = block_shape.multi(idx.group);
+    auto n           = block_multi[0];
+    auto co          = block_multi[1];
+    auto c_in        = co_cin.multi(co)[1];
+
+    auto x_ch   = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial);
+    auto w_ch   = slice_tensor(w, co, keep_spatial);
+    auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial);
+
+    // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW]
+    constexpr index_int NDIM = spatial_lens.size();
+    auto tile_origin = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
+        if constexpr(d < 2)
+            return 0;
+        else
+            return block_multi[d] * tile_lens[d];
+    });
+
+    // Phase 1: load halo tile into shared memory with bounds checking
+    auto smem_view = make_tensor_view(smem.data(), halo_shape);
+    idx.local_stride(_c<halo_total>, [&](auto i) {
+        auto halo_multi = halo_shape.multi(index_int{i});
+        auto src_pos    = tile_origin + halo_multi;
+        smem.data()[i]  = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
+    });
+
+    // Phase 2: copy weights into registers
+    array<T, kernel_total> wregs_arr;
+    auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape);
+    copy(w_ch.begin(), w_ch.end(), wregs.begin());
+
+    __syncthreads();
+
+    // Phase 3: compute output tile with bounds checking
+    idx.local_stride(_c<tile_total>, [&](auto j) {
+        auto tile_multi = tile_shape.multi(index_int{j});
+        auto out_pos    = tile_origin + tile_multi;
+        if(not in_bounds(out_pos, out_spatial_lens))
+            return;
+
+        T acc = 0;
+        for(index_int ki = 0; ki < kernel_total; ki++)
+        {
+            auto k_multi = wregs_shape.multi(ki);
+            acc += smem_view[tile_multi + k_multi] * wregs[k_multi];
+        }
 
-    slice_schedule<per_block>(idx, keep_spatial)(output)([&](auto out_ch) {
-        auto nc_multi = out_nc.multi(idx.group);
-        auto n        = nc_multi[0];
-        auto co       = nc_multi[1];
-        auto c_in     = co_cin.multi(co)[1];
-
-        auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial);
-        auto w_ch = slice_tensor(w, co, keep_spatial);
-
-        // Phase 1: copy input channel into shared memory
-        auto smem_input = make_tensor_view(smem.data(), smem_shape);
-        local_tensor_copy(idx, x_ch, smem_input);
-
-        // Phase 2: copy weights into registers
-        array<T, kernel_total> wregs_arr;
-        auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape);
-        copy(w_ch.begin(), w_ch.end(), wregs.begin());
-
-        __syncthreads();
-
-        // Phase 3: sliding window multiply-reduce
-        pooling_reduce<reduce::lane, 1, per_block>(out_ch, [&](auto out_idx, auto r) {
-            return r.reduce(op::sum{}, T{0}, [&](auto ki) {
-                auto k_multi = wregs_shape.multi(ki);
-                return smem_input[out_idx + k_multi] * wregs[k_multi];
-            })(reduce::make_indices(_c<kernel_total>));
-        });
+        out_ch[out_pos] = acc;
     });
 }
 
diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp
index e3483480d8e..91731d1d2f2 100644
--- a/test/verify/test_channelwise_conv.cpp
+++ b/test/verify/test_channelwise_conv.cpp
@@ -100,3 +100,38 @@ struct test_channelwise_conv_1d : verify_program<test_channelwise_conv_1d<DType>
 };
 template struct test_channelwise_conv_1d<migraphx::shape::float_type>;
 template struct test_channelwise_conv_1d<migraphx::shape::half_type>;
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_large : verify_program<test_channelwise_conv_large<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_large<migraphx::shape::float_type>;
+template struct test_channelwise_conv_large<migraphx::shape::half_type>;
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_non_divisible
+    : verify_program<test_channelwise_conv_non_divisible<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_non_divisible<migraphx::shape::float_type>;
+template struct test_channelwise_conv_non_divisible<migraphx::shape::half_type>;

From b61daa34fcb0a465770137034d9728ff1cbfa70d Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 21:23:56 +0000
Subject: [PATCH 14/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 25 +++++++---------
 .../migraphx/kernels/channelwise_conv.hpp     | 29 ++++++++++---------
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 32aae6c2c2a..5d27c6038a1 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,21 +56,18 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const
-    {
-        return {"gpu::channelwise_conv", "channelwise_conv"};
-    }
+    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s     = inputs.at(0);
-        const auto& w_s     = inputs.at(1);
-        const auto& out_s   = inputs.back();
-        options.inputs      = inputs;
-        options.output      = out_s;
-        options.kernel_name = "channelwise_conv_kernel";
+        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
+        const auto& x_s        = inputs.at(0);
+        const auto& w_s        = inputs.at(1);
+        const auto& out_s      = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto x_lens   = x_s.lens();
@@ -85,7 +82,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         }
         else
         {
-            tile_sizes[0] = 8;
+            tile_sizes[0]               = 8;
             tile_sizes[num_spatial - 1] = 32;
             for(std::size_t d = 1; d + 1 < num_spatial; ++d)
                 tile_sizes[d] = 1;
@@ -105,8 +102,8 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
-        auto src = interpolate_string(channelwise_conv_kernel,
-                                      {{"tile", to_string_range(tile_sizes)}});
+        auto src =
+            interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index cf88dcd8ae0..3c5715b6433 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -58,7 +58,8 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     // Derive spatial and kernel lens from input shapes (already full-rank)
     constexpr auto spatial_lens = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
     constexpr auto kernel_lens  = make_slice(get_shape_c<Weights>{}, keep_spatial).lens;
-    constexpr auto wregs_shape  = make_packed_shape(make_slice(get_shape_c<Weights>{}, keep_spatial));
+    constexpr auto wregs_shape =
+        make_packed_shape(make_slice(get_shape_c<Weights>{}, keep_spatial));
 
     constexpr index_int kernel_total = kernel_lens.product();
 
@@ -67,11 +68,11 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
 
     // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix
-    constexpr auto tile_lens        = return_array_c([] {
-        constexpr auto sl          = decltype(spatial_lens){};
-        constexpr auto tl          = TileLens{};
-        constexpr index_int nd     = sl.size();
-        constexpr index_int ns     = array_size(TileLens{});
+    constexpr auto tile_lens = return_array_c([] {
+        constexpr auto sl      = decltype(spatial_lens){};
+        constexpr auto tl      = TileLens{};
+        constexpr index_int nd = sl.size();
+        constexpr index_int ns = array_size(TileLens{});
         array<index_int, nd> result;
         result[0] = 1;
         result[1] = 1;
@@ -79,12 +80,12 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
             result[2 + i] = tl[i];
         return result;
     });
-    constexpr auto halo_lens        = transform(tile_lens, kernel_lens,
-        [](auto t, auto k) { return t + k - 1; });
-    constexpr auto out_spatial_lens = transform(spatial_lens, kernel_lens,
-        [](auto s, auto k) { return s - k + 1; });
-    constexpr auto tiles_per_dim    = transform(out_spatial_lens, tile_lens,
-        [](auto o, auto t) { return (o + t - 1) / t; });
+    constexpr auto halo_lens =
+        transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; });
+    constexpr auto out_spatial_lens =
+        transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; });
+    constexpr auto tiles_per_dim =
+        transform(out_spatial_lens, tile_lens, [](auto o, auto t) { return (o + t - 1) / t; });
 
     constexpr auto tile_shape      = make_shape(tile_lens);
     constexpr auto halo_shape      = make_shape(halo_lens);
@@ -92,7 +93,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     constexpr index_int tile_total = tile_lens.product();
 
     // Block shape: [N, C_out, tiles_h, tiles_w]
-    constexpr auto block_lens = return_array_c([] {
+    constexpr auto block_lens  = return_array_c([] {
         constexpr auto tpd     = decltype(tiles_per_dim){};
         constexpr index_int nd = tpd.size();
         array<index_int, nd> result;
@@ -121,7 +122,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
 
     // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW]
     constexpr index_int NDIM = spatial_lens.size();
-    auto tile_origin = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
+    auto tile_origin         = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;
         else

From c9d258f69ab2f32d00faa2aa1b4c872550f00a69 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 22:06:25 +0000
Subject: [PATCH 15/84] Access directly

---
 .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 3c5715b6433..e36fd52d352 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -134,7 +134,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     idx.local_stride(_c<halo_total>, [&](auto i) {
         auto halo_multi = halo_shape.multi(index_int{i});
         auto src_pos    = tile_origin + halo_multi;
-        smem.data()[i]  = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
+        smem[i]  = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
     });
 
     // Phase 2: copy weights into registers

From 6d979f5ecdd34d1b6ead3e2572624c6f4d81be9e Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 22:06:34 +0000
Subject: [PATCH 16/84] Format

---
 .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index e36fd52d352..c6c7f82b51a 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -134,7 +134,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     idx.local_stride(_c<halo_total>, [&](auto i) {
         auto halo_multi = halo_shape.multi(index_int{i});
         auto src_pos    = tile_origin + halo_multi;
-        smem[i]  = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
+        smem[i]         = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
     });
 
     // Phase 2: copy weights into registers

From ecbce52bcdf701f09f95bb1d8705e8ed2ef391ee Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 16:23:54 -0600
Subject: [PATCH 17/84] Add join

---
 .../gpu/kernels/include/migraphx/kernels/array.hpp | 12 ++++++++++++
 .../include/migraphx/kernels/channelwise_conv.hpp  | 14 ++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
index 9c2684f90ac..10270d20c2c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -452,6 +452,18 @@ constexpr auto transform(integral_const_array<T, Xs...>, integral_const_array<U,
     return integral_const_array<T, f(Xs, Ys)...>{};
 }
 
+template <class T, T... Xs, class U, U... Ys>
+constexpr auto join(integral_const_array<T, Xs...>, integral_const_array<U, Ys...>)
+{
+    return integral_const_array<T, Xs..., Ys...>{};
+}
+
+template <class T, T... Xs, class U, U... Ys, class... Arrays>
+constexpr auto join(integral_const_array<T, Xs...>, integral_const_array<U, Ys...>, Arrays...)
+{
+    return join(integral_const_array<T, Xs..., Ys...>{}, Arrays{}...);
+}
+
 template <class F>
 constexpr auto return_array_c(F f)
 {
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index c6c7f82b51a..ecdec9466e1 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -68,18 +68,8 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
 
     // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix
-    constexpr auto tile_lens = return_array_c([] {
-        constexpr auto sl      = decltype(spatial_lens){};
-        constexpr auto tl      = TileLens{};
-        constexpr index_int nd = sl.size();
-        constexpr index_int ns = array_size(TileLens{});
-        array<index_int, nd> result;
-        result[0] = 1;
-        result[1] = 1;
-        for(index_int i = 0; i < ns; i++)
-            result[2 + i] = tl[i];
-        return result;
-    });
+    constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{});
+
     constexpr auto halo_lens =
         transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; });
     constexpr auto out_spatial_lens =

From 4bd655655782f477477a7cd9606341d306afcde2 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 17:31:17 -0600
Subject: [PATCH 18/84] Update tuning

---
 src/targets/gpu/jit/channelwise_conv.cpp | 54 ++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 5d27c6038a1..58cf4532edb 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -82,8 +82,8 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         }
         else
         {
-            tile_sizes[0]               = 8;
-            tile_sizes[num_spatial - 1] = 32;
+            tile_sizes[0]               = v.get("tile_h", 8);
+            tile_sizes[num_spatial - 1] = v.get("tile_w", 32);
             for(std::size_t d = 1; d + 1 < num_spatial; ++d)
                 tile_sizes[d] = 1;
         }
@@ -108,9 +108,55 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
-    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
     {
-        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
+        auto v        = op.to_value();
+        for(const auto& x : solution)
+            v.insert(x);
+        return compile_op(ctx, to_shapes(ins->inputs()), v);
+    }
+
+    optional<tuning_config> get_tuning_config(const context& ctx,
+                                              instruction_ref ins,
+                                              const operation& op,
+                                              bool exhaustive) const
+    {
+        tuning_config tc;
+        auto shapes       = to_shapes(ins->inputs());
+        tc.problem        = to_value(shapes);
+        if(exhaustive)
+        {
+            std::vector<std::size_t> sizes;
+            for(auto i:range(1, 64))
+                sizes.push_back(i*4);
+            for(auto tile_h:sizes)
+            {
+                for(auto tile_w:sizes)
+                {
+                    auto block_size = tile_h * tile_w;
+                    if(block_size > 1024)
+                        continue;
+                    if(block_size < ctx.get_current_device().get_wavefront_size())
+                        continue;
+                    if((block_size % ctx.get_current_device().get_wavefront_size()) != 0)
+                        continue;
+                    tc.solutions.push_back({{"tile_h", tile_h}, {"tile_w", tile_w}});
+                }
+            }
+        }
+        else
+        {
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}});
+            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}});
+            tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}});
+            tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}});
+            // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}});
+            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}});
+
+            // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}});
+            // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}});
+        }
+        return tc;
     }
 };
 

From d1da33357292ce8cb35a794281c57e1a38e73164 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 16 Feb 2026 17:31:20 -0600
Subject: [PATCH 19/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 58cf4532edb..76071ba81a3 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -108,9 +108,10 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
-    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
+    compiler_replace
+    compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
     {
-        auto v        = op.to_value();
+        auto v = op.to_value();
         for(const auto& x : solution)
             v.insert(x);
         return compile_op(ctx, to_shapes(ins->inputs()), v);
@@ -122,16 +123,16 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
                                               bool exhaustive) const
     {
         tuning_config tc;
-        auto shapes       = to_shapes(ins->inputs());
-        tc.problem        = to_value(shapes);
+        auto shapes = to_shapes(ins->inputs());
+        tc.problem  = to_value(shapes);
         if(exhaustive)
         {
             std::vector<std::size_t> sizes;
-            for(auto i:range(1, 64))
-                sizes.push_back(i*4);
-            for(auto tile_h:sizes)
+            for(auto i : range(1, 64))
+                sizes.push_back(i * 4);
+            for(auto tile_h : sizes)
             {
-                for(auto tile_w:sizes)
+                for(auto tile_w : sizes)
                 {
                     auto block_size = tile_h * tile_w;
                     if(block_size > 1024)

From 9cc6906b90812f0a529fd2ee188b05f4b1cf2e9c Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 14:33:02 +0000
Subject: [PATCH 20/84] Add multi-output

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 71 +++++++++++--------
 .../migraphx/kernels/channelwise_conv.hpp     | 39 +++++-----
 2 files changed, 61 insertions(+), 49 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 76071ba81a3..e95efbea4e3 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -44,7 +44,7 @@ extern "C" {
 MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 {
     transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
-        channelwise_conv(index_ints<${tile}>{}, output, x, w);
+        channelwise_conv(index_ints<${tile}>{}, index_ints<${output_tile}>{}, output, x, w);
     });
 }
 
@@ -56,54 +56,61 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
+    std::vector<std::string> names() const
+    {
+        return {"gpu::channelwise_conv", "channelwise_conv"};
+    }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
-        const auto& x_s        = inputs.at(0);
-        const auto& w_s        = inputs.at(1);
-        const auto& out_s      = inputs.back();
-        options.inputs         = inputs;
-        options.output         = out_s;
-        options.kernel_name    = "channelwise_conv_kernel";
+        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
+        const auto& out_s   = inputs.back();
+        options.inputs      = inputs;
+        options.output      = out_s;
+        options.kernel_name = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
-        auto x_lens   = x_s.lens();
-        auto w_lens   = w_s.lens();
         auto out_lens = out_s.lens();
 
-        // Tile dimensions: for 2D use 8xH, 32xW; for 1D use 256
+        // Thread block tile dimensions
         std::vector<std::size_t> tile_sizes(num_spatial);
         if(num_spatial == 1)
         {
-            tile_sizes[0] = 256;
+            tile_sizes[0] = v.get("tile_w", std::size_t{256});
         }
         else
         {
-            tile_sizes[0]               = v.get("tile_h", 8);
-            tile_sizes[num_spatial - 1] = v.get("tile_w", 32);
+            tile_sizes[0]               = v.get("tile_h", std::size_t{8});
+            tile_sizes[num_spatial - 1] = v.get("tile_w", std::size_t{32});
             for(std::size_t d = 1; d + 1 < num_spatial; ++d)
                 tile_sizes[d] = 1;
         }
 
+        // Outputs per thread along W (last spatial dim)
+        auto outputs_per_thread = v.get("outputs_per_thread", std::size_t{4});
+
+        // Output tile = thread tile with last dim scaled by outputs_per_thread
+        std::vector<std::size_t> output_tile_sizes = tile_sizes;
+        output_tile_sizes.back() *= outputs_per_thread;
+
         std::size_t block_size = 1;
         for(auto t : tile_sizes)
             block_size *= t;
 
-        // Compute number of tiles per spatial dim: ceil(out_spatial / tile)
+        // Blocks: N * C_out * prod(ceil(out_spatial / output_tile))
         std::size_t num_blocks = out_lens[0] * out_lens[1];
         for(std::size_t d = 0; d < num_spatial; ++d)
         {
             auto out_spatial = out_lens[2 + d];
-            num_blocks *= (out_spatial + tile_sizes[d] - 1) / tile_sizes[d];
+            num_blocks *= (out_spatial + output_tile_sizes[d] - 1) / output_tile_sizes[d];
         }
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
-        auto src =
-            interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}});
+        auto src = interpolate_string(channelwise_conv_kernel,
+                                      {{"tile", to_string_range(tile_sizes)},
+                                       {"output_tile", to_string_range(output_tile_sizes)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
@@ -141,21 +148,27 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
                         continue;
                     if((block_size % ctx.get_current_device().get_wavefront_size()) != 0)
                         continue;
-                    tc.solutions.push_back({{"tile_h", tile_h}, {"tile_w", tile_w}});
+                    for(auto opt : {1, 2, 4, 8})
+                        tc.solutions.push_back(
+                            {{"tile_h", tile_h}, {"tile_w", tile_w}, {"outputs_per_thread", opt}});
                 }
             }
         }
         else
         {
-            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}});
-            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}});
-            tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}});
-            tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}});
-            // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}});
-            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}});
-
-            // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}});
-            // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
+            // for(auto opt : {1, 2})
+            // {
+            //     tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+            //     tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+            //     tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+            //     tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}});
+            //     // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}});
+            //     tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}});
+    
+            //     // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+            //     // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}});
+            // }
         }
         return tc;
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index ecdec9466e1..1e8d2bb0093 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -46,8 +46,10 @@ constexpr bool in_bounds(Pos pos, Lens lens)
     return true;
 }
 
-template <class TileLens, class Output, class Input, class Weights>
-__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
+// TileLens:   thread block tile (determines block_size = product)
+// OutputLens: output region per block (>= TileLens, multiple along W)
+template <class TileLens, class OutputLens, class Output, class Input, class Weights>
+__device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, Weights w)
 {
     auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
 
@@ -55,7 +57,6 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     constexpr index_int C_out = get_shape_c<Output>{}.lens[1];
     constexpr index_int C_in  = get_shape_c<Input>{}.lens[1];
 
-    // Derive spatial and kernel lens from input shapes (already full-rank)
     constexpr auto spatial_lens = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
     constexpr auto kernel_lens  = make_slice(get_shape_c<Weights>{}, keep_spatial).lens;
     constexpr auto wregs_shape =
@@ -67,23 +68,22 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     constexpr auto co_cin = make_shape(index_ints<C_out / C_in, C_in>{});
     constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
 
-    // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix
-    constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{});
-
+    // Full-rank output region per block
+    constexpr auto output_lens = join(index_ints<1, 1>{}, OutputLens{});
     constexpr auto halo_lens =
-        transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; });
+        transform(output_lens, kernel_lens, [](auto o, auto k) { return o + k - 1; });
     constexpr auto out_spatial_lens =
         transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; });
     constexpr auto tiles_per_dim =
-        transform(out_spatial_lens, tile_lens, [](auto o, auto t) { return (o + t - 1) / t; });
+        transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
 
-    constexpr auto tile_shape      = make_shape(tile_lens);
-    constexpr auto halo_shape      = make_shape(halo_lens);
-    constexpr index_int halo_total = halo_lens.product();
-    constexpr index_int tile_total = tile_lens.product();
+    constexpr auto output_shape      = make_shape(output_lens);
+    constexpr auto halo_shape        = make_shape(halo_lens);
+    constexpr index_int halo_total   = halo_lens.product();
+    constexpr index_int output_total = output_lens.product();
 
     // Block shape: [N, C_out, tiles_h, tiles_w]
-    constexpr auto block_lens  = return_array_c([] {
+    constexpr auto block_lens = return_array_c([] {
         constexpr auto tpd     = decltype(tiles_per_dim){};
         constexpr index_int nd = tpd.size();
         array<index_int, nd> result;
@@ -110,13 +110,12 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     auto w_ch   = slice_tensor(w, co, keep_spatial);
     auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial);
 
-    // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW]
     constexpr index_int NDIM = spatial_lens.size();
     auto tile_origin         = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;
         else
-            return block_multi[d] * tile_lens[d];
+            return block_multi[d] * output_lens[d];
     });
 
     // Phase 1: load halo tile into shared memory with bounds checking
@@ -134,10 +133,10 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
 
     __syncthreads();
 
-    // Phase 3: compute output tile with bounds checking
-    idx.local_stride(_c<tile_total>, [&](auto j) {
-        auto tile_multi = tile_shape.multi(index_int{j});
-        auto out_pos    = tile_origin + tile_multi;
+    // Phase 3: compute output region (each thread handles output_total / block_size elements)
+    idx.local_stride(_c<output_total>, [&](auto j) {
+        auto out_multi = output_shape.multi(index_int{j});
+        auto out_pos   = tile_origin + out_multi;
         if(not in_bounds(out_pos, out_spatial_lens))
             return;
 
@@ -145,7 +144,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
         for(index_int ki = 0; ki < kernel_total; ki++)
         {
             auto k_multi = wregs_shape.multi(ki);
-            acc += smem_view[tile_multi + k_multi] * wregs[k_multi];
+            acc += smem_view[out_multi + k_multi] * wregs[k_multi];
         }
 
         out_ch[out_pos] = acc;

From 0942c87c093c3ec48c184a13372d9fca12bbbef1 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 14:33:07 +0000
Subject: [PATCH 21/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 37 ++++++++++---------
 .../migraphx/kernels/channelwise_conv.hpp     |  2 +-
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index e95efbea4e3..c16cb3422ee 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -56,19 +56,16 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 
 struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 {
-    std::vector<std::string> names() const
-    {
-        return {"gpu::channelwise_conv", "channelwise_conv"};
-    }
+    std::vector<std::string> names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto num_spatial    = v.at("num_spatial").to<std::size_t>();
-        const auto& out_s   = inputs.back();
-        options.inputs      = inputs;
-        options.output      = out_s;
-        options.kernel_name = "channelwise_conv_kernel";
+        auto num_spatial       = v.at("num_spatial").to<std::size_t>();
+        const auto& out_s      = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "channelwise_conv_kernel";
         options.virtual_inputs = inputs;
 
         auto out_lens = out_s.lens();
@@ -159,15 +156,19 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
             // for(auto opt : {1, 2})
             // {
-            //     tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-            //     tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-            //     tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-            //     tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}});
-            //     // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}});
-            //     tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}});
-    
-            //     // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-            //     // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}});
+            //     tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread",
+            //     opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32},
+            //     {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w",
+            //     32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24},
+            //     {"tile_w", 16}, {"outputs_per_thread", opt}});
+            //     // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread",
+            //     opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4},
+            //     {"outputs_per_thread", opt}});
+
+            //     // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread",
+            //     opt}});
+            //     // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread",
+            //     opt}});
             // }
         }
         return tc;
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 1e8d2bb0093..1b5641fbc62 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -83,7 +83,7 @@ __device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, W
     constexpr index_int output_total = output_lens.product();
 
     // Block shape: [N, C_out, tiles_h, tiles_w]
-    constexpr auto block_lens = return_array_c([] {
+    constexpr auto block_lens  = return_array_c([] {
         constexpr auto tpd     = decltype(tiles_per_dim){};
         constexpr index_int nd = tpd.size();
         array<index_int, nd> result;

From ca147d2c921f42dce453f238f4403aac308bfccc Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:34:53 +0000
Subject: [PATCH 22/84] Add spatial tiler

---
 src/targets/gpu/jit/channelwise_conv.cpp      |   4 +-
 .../migraphx/kernels/channelwise_conv.hpp     | 125 ++----------
 .../migraphx/kernels/spatial_tiler.hpp        | 178 ++++++++++++++++++
 3 files changed, 197 insertions(+), 110 deletions(-)
 create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index c16cb3422ee..8290ffe6114 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -44,7 +44,7 @@ extern "C" {
 MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
 {
     transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
-        channelwise_conv(index_ints<${tile}>{}, index_ints<${output_tile}>{}, output, x, w);
+        channelwise_conv<index_ints<${tile}>, ${ntiles}>(index_ints<${tile}>{}, output, x, w);
     });
 }
 
@@ -107,7 +107,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         auto src = interpolate_string(channelwise_conv_kernel,
                                       {{"tile", to_string_range(tile_sizes)},
-                                       {"output_tile", to_string_range(output_tile_sizes)}});
+                                       {"ntiles", std::to_string(outputs_per_thread)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 1b5641fbc62..ecb39860789 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -25,128 +25,37 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
 #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP
 
-#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/spatial_tiler.hpp>
 #include <migraphx/kernels/algorithm.hpp>
-#include <migraphx/kernels/slice.hpp>
 #include <migraphx/kernels/copy.hpp>
-#include <migraphx/kernels/reduce.hpp>
-#include <migraphx/kernels/uninitialized_buffer.hpp>
-#include <migraphx/kernels/pooling.hpp>
 
 namespace migraphx {
 
-template <class Pos, class Lens>
-constexpr bool in_bounds(Pos pos, Lens lens)
+template <class TileLens, index_int NTiles, class Output, class Input, class Weights>
+__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
 {
-    for(index_int d = 0; d < pos.size(); d++)
-    {
-        if(pos[d] >= lens[d])
-            return false;
-    }
-    return true;
-}
-
-// TileLens:   thread block tile (determines block_size = product)
-// OutputLens: output region per block (>= TileLens, multiple along W)
-template <class TileLens, class OutputLens, class Output, class Input, class Weights>
-__device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, Weights w)
-{
-    auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
-
-    constexpr index_int N     = get_shape_c<Output>{}.lens[0];
-    constexpr index_int C_out = get_shape_c<Output>{}.lens[1];
-    constexpr index_int C_in  = get_shape_c<Input>{}.lens[1];
-
-    constexpr auto spatial_lens = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
-    constexpr auto kernel_lens  = make_slice(get_shape_c<Weights>{}, keep_spatial).lens;
-    constexpr auto wregs_shape =
-        make_packed_shape(make_slice(get_shape_c<Weights>{}, keep_spatial));
+    auto idx   = make_index();
+    auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{});
 
-    constexpr index_int kernel_total = kernel_lens.product();
-
-    constexpr auto out_nc = make_shape(index_ints<N, C_out>{});
-    constexpr auto co_cin = make_shape(index_ints<C_out / C_in, C_in>{});
-    constexpr auto in_nc  = make_shape(index_ints<N, C_in>{});
-
-    // Full-rank output region per block
-    constexpr auto output_lens = join(index_ints<1, 1>{}, OutputLens{});
-    constexpr auto halo_lens =
-        transform(output_lens, kernel_lens, [](auto o, auto k) { return o + k - 1; });
-    constexpr auto out_spatial_lens =
-        transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; });
-    constexpr auto tiles_per_dim =
-        transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
-
-    constexpr auto output_shape      = make_shape(output_lens);
-    constexpr auto halo_shape        = make_shape(halo_lens);
-    constexpr index_int halo_total   = halo_lens.product();
-    constexpr index_int output_total = output_lens.product();
-
-    // Block shape: [N, C_out, tiles_h, tiles_w]
-    constexpr auto block_lens  = return_array_c([] {
-        constexpr auto tpd     = decltype(tiles_per_dim){};
-        constexpr index_int nd = tpd.size();
-        array<index_int, nd> result;
-        for(index_int i = 0; i < nd; i++)
-            result[i] = tpd[i];
-        result[0] = N;
-        result[1] = C_out;
-        return result;
-    });
-    constexpr auto block_shape = make_shape(block_lens);
+    __shared__ decltype(tiler.template shared_allocate<Input>()) smem;
 
-    using T = typename Output::type;
-    __shared__ uninitialized_buffer<T, halo_total> smem;
+    auto x_ch   = tiler.copy(x, smem);
+    auto w_ch   = tiler.slice(w);
+    auto out_ch = tiler.slice(output);
 
-    auto idx = make_index();
-
-    // Decompose block index
-    auto block_multi = block_shape.multi(idx.group);
-    auto n           = block_multi[0];
-    auto co          = block_multi[1];
-    auto c_in        = co_cin.multi(co)[1];
-
-    auto x_ch   = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial);
-    auto w_ch   = slice_tensor(w, co, keep_spatial);
-    auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial);
-
-    constexpr index_int NDIM = spatial_lens.size();
-    auto tile_origin         = generate_array<index_int>(_c<NDIM>, [&](auto d) -> index_int {
-        if constexpr(d < 2)
-            return 0;
-        else
-            return block_multi[d] * output_lens[d];
-    });
-
-    // Phase 1: load halo tile into shared memory with bounds checking
-    auto smem_view = make_tensor_view(smem.data(), halo_shape);
-    idx.local_stride(_c<halo_total>, [&](auto i) {
-        auto halo_multi = halo_shape.multi(index_int{i});
-        auto src_pos    = tile_origin + halo_multi;
-        smem[i]         = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0};
-    });
-
-    // Phase 2: copy weights into registers
-    array<T, kernel_total> wregs_arr;
-    auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape);
+    using T                    = typename Output::type;
+    array<T, decltype(w_ch.get_shape().elements()){}> wregs_arr;
+    auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape()));
     copy(w_ch.begin(), w_ch.end(), wregs.begin());
 
     __syncthreads();
 
-    // Phase 3: compute output region (each thread handles output_total / block_size elements)
-    idx.local_stride(_c<output_total>, [&](auto j) {
-        auto out_multi = output_shape.multi(index_int{j});
-        auto out_pos   = tile_origin + out_multi;
-        if(not in_bounds(out_pos, out_spatial_lens))
-            return;
-
+    tiler.for_each([&](auto out_pos, auto out_multi) {
         T acc = 0;
-        for(index_int ki = 0; ki < kernel_total; ki++)
-        {
-            auto k_multi = wregs_shape.multi(ki);
-            acc += smem_view[out_multi + k_multi] * wregs[k_multi];
-        }
-
+        repeat(wregs.get_shape().elements(), [&](auto ki) {
+            auto k_multi = wregs.get_shape().multi(ki);
+            acc += x_ch[out_multi + k_multi] * wregs[k_multi];
+        });
         out_ch[out_pos] = acc;
     });
 }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
new file mode 100644
index 00000000000..132f29ac347
--- /dev/null
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -0,0 +1,178 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP
+#define MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP
+
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/slice.hpp>
+#include <migraphx/kernels/copy.hpp>
+#include <migraphx/kernels/uninitialized_buffer.hpp>
+
+namespace migraphx {
+
+template <class Pos, class Lens>
+constexpr bool in_bounds(Pos pos, Lens lens)
+{
+    for(index_int d = 0; d < pos.size(); d++)
+    {
+        if(pos[d] >= lens[d])
+            return false;
+    }
+    return true;
+}
+
+template <index_int NTiles, class TileLens, class OutputShape>
+struct spatial_tiler
+{
+    static constexpr auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
+
+    // Full-rank tile lens: [1, 1, TileH, TileW]
+    static constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{});
+
+    // Output region per block: tile with last dim scaled by NTiles
+    static constexpr auto output_lens = return_array_c([] {
+        auto result    = decltype(tile_lens){};
+        constexpr auto nd = result.size();
+        array<index_int, nd> r;
+        for(index_int i = 0; i < nd; i++)
+            r[i] = result[i];
+        r[nd - 1] *= NTiles;
+        return r;
+    });
+
+    static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens;
+
+    static constexpr auto tiles_per_dim = transform(
+        out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
+
+    static constexpr auto block_lens = return_array_c([] {
+        constexpr auto tpd     = decltype(tiles_per_dim){};
+        constexpr index_int nd = tpd.size();
+        constexpr auto olens   = OutputShape{}.lens;
+        array<index_int, nd> result;
+        for(index_int i = 0; i < nd; i++)
+            result[i] = tpd[i];
+        result[0] = olens[0];
+        result[1] = olens[1];
+        return result;
+    });
+    static constexpr auto block_shape = make_shape(block_lens);
+
+    static constexpr auto output_shape       = make_shape(output_lens);
+    static constexpr index_int output_total  = output_lens.product();
+    static constexpr index_int tiles_total   = tiles_per_dim.product();
+    static constexpr index_int NDIM          = out_spatial_lens.size();
+
+    index idx;
+    array<index_int, NDIM> tile_origin;
+
+    // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial)
+    template <class InputShape>
+    static constexpr auto halo_lens_for()
+    {
+        constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens;
+        constexpr auto halo_extra    = transform(
+            input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; });
+        return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; });
+    }
+
+    // Type for shared memory allocation
+    template <class Input>
+    __device__ auto shared_allocate() const
+    {
+        using T                          = typename Input::type;
+        constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
+        constexpr index_int halo_total_v = hl.product();
+        return uninitialized_buffer<T, halo_total_v>{};
+    }
+
+    // Slice a tensor to per-channel spatial view
+    template <class Tensor>
+    __device__ auto slice(Tensor t) const
+    {
+        constexpr auto n_ch = nslices(get_shape_c<Tensor>{}, keep_spatial);
+        return slice_tensor(t, (idx.group / tiles_total) % index_int{n_ch}, keep_spatial);
+    }
+
+    // Copy input halo tile into shared memory, return tensor_view over smem
+    template <class Input, class Smem>
+    __device__ auto copy(Input input, Smem& smem) const
+    {
+        using T                          = typename Input::type;
+        constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
+        constexpr auto halo_shape        = make_shape(hl);
+        constexpr index_int halo_total_v = hl.product();
+        constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
+
+        constexpr auto n_out  = nslices(OutputShape{}, keep_spatial);
+        constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial);
+        constexpr auto groups = n_out / n_in;
+        auto channel_idx      = idx.group / tiles_total;
+        auto input_ch         = slice_tensor(
+            input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial);
+
+        idx.local_stride(_c<halo_total_v>, [&](auto i) {
+            auto halo_multi = halo_shape.multi(index_int{i});
+            auto src_pos    = tile_origin + halo_multi;
+            smem[i]         = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0};
+        });
+
+        return make_tensor_view(smem.data(), halo_shape);
+    }
+
+    // Iterate over output tile positions with bounds checking
+    template <class F>
+    __device__ void for_each(F f) const
+    {
+        idx.local_stride(_c<output_total>, [&](auto j) {
+            auto out_multi = output_shape.multi(index_int{j});
+            auto out_pos   = tile_origin + out_multi;
+            if(not in_bounds(out_pos, out_spatial_lens))
+                return;
+            f(out_pos, out_multi);
+        });
+    }
+};
+
+template <index_int NTiles, class TileLens, class OutputShape>
+__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
+{
+    using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
+
+    auto block_multi = tiler_type::block_shape.multi(idx.group);
+    auto tile_origin =
+        generate_array<index_int>(_c<tiler_type::NDIM>, [&](auto d) -> index_int {
+            if constexpr(d < 2)
+                return 0;
+            else
+                return block_multi[d] * tiler_type::output_lens[d];
+        });
+
+    return tiler_type{idx, tile_origin};
+}
+
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP

From 3b17a09e31bdef122125344247360791d97940a7 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:34:56 +0000
Subject: [PATCH 23/84] Format

---
 .../migraphx/kernels/channelwise_conv.hpp     |  2 +-
 .../migraphx/kernels/spatial_tiler.hpp        | 37 +++++++++----------
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index ecb39860789..fadf92159c0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -43,7 +43,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
     auto w_ch   = tiler.slice(w);
     auto out_ch = tiler.slice(output);
 
-    using T                    = typename Output::type;
+    using T = typename Output::type;
     array<T, decltype(w_ch.get_shape().elements()){}> wregs_arr;
     auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape()));
     copy(w_ch.begin(), w_ch.end(), wregs.begin());
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 132f29ac347..1bd43e2e8c0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -54,7 +54,7 @@ struct spatial_tiler
 
     // Output region per block: tile with last dim scaled by NTiles
     static constexpr auto output_lens = return_array_c([] {
-        auto result    = decltype(tile_lens){};
+        auto result       = decltype(tile_lens){};
         constexpr auto nd = result.size();
         array<index_int, nd> r;
         for(index_int i = 0; i < nd; i++)
@@ -65,10 +65,10 @@ struct spatial_tiler
 
     static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens;
 
-    static constexpr auto tiles_per_dim = transform(
-        out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
+    static constexpr auto tiles_per_dim =
+        transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
 
-    static constexpr auto block_lens = return_array_c([] {
+    static constexpr auto block_lens  = return_array_c([] {
         constexpr auto tpd     = decltype(tiles_per_dim){};
         constexpr index_int nd = tpd.size();
         constexpr auto olens   = OutputShape{}.lens;
@@ -81,10 +81,10 @@ struct spatial_tiler
     });
     static constexpr auto block_shape = make_shape(block_lens);
 
-    static constexpr auto output_shape       = make_shape(output_lens);
-    static constexpr index_int output_total  = output_lens.product();
-    static constexpr index_int tiles_total   = tiles_per_dim.product();
-    static constexpr index_int NDIM          = out_spatial_lens.size();
+    static constexpr auto output_shape      = make_shape(output_lens);
+    static constexpr index_int output_total = output_lens.product();
+    static constexpr index_int tiles_total  = tiles_per_dim.product();
+    static constexpr index_int NDIM         = out_spatial_lens.size();
 
     index idx;
     array<index_int, NDIM> tile_origin;
@@ -94,8 +94,8 @@ struct spatial_tiler
     static constexpr auto halo_lens_for()
     {
         constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens;
-        constexpr auto halo_extra    = transform(
-            input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; });
+        constexpr auto halo_extra =
+            transform(input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; });
         return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; });
     }
 
@@ -131,8 +131,8 @@ struct spatial_tiler
         constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial);
         constexpr auto groups = n_out / n_in;
         auto channel_idx      = idx.group / tiles_total;
-        auto input_ch         = slice_tensor(
-            input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial);
+        auto input_ch =
+            slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial);
 
         idx.local_stride(_c<halo_total_v>, [&](auto i) {
             auto halo_multi = halo_shape.multi(index_int{i});
@@ -163,13 +163,12 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
     auto block_multi = tiler_type::block_shape.multi(idx.group);
-    auto tile_origin =
-        generate_array<index_int>(_c<tiler_type::NDIM>, [&](auto d) -> index_int {
-            if constexpr(d < 2)
-                return 0;
-            else
-                return block_multi[d] * tiler_type::output_lens[d];
-        });
+    auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM>, [&](auto d) -> index_int {
+        if constexpr(d < 2)
+            return 0;
+        else
+            return block_multi[d] * tiler_type::output_lens[d];
+    });
 
     return tiler_type{idx, tile_origin};
 }

From 037d10f2e5e5a6666ef0fa997e6338a5e2be9de3 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:45:42 +0000
Subject: [PATCH 24/84] Avoid bounds check when there is no padding

---
 .../migraphx/kernels/spatial_tiler.hpp        | 24 ++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 1bd43e2e8c0..366e80aa4d2 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -86,6 +86,18 @@ struct spatial_tiler
     static constexpr index_int tiles_total  = tiles_per_dim.product();
     static constexpr index_int NDIM         = out_spatial_lens.size();
 
+    static constexpr bool is_padded = [] {
+        return (out_spatial_lens != tiles_per_dim * output_lens);
+        // constexpr auto osl     = decltype(out_spatial_lens){};
+        // constexpr auto tpd     = decltype(tiles_per_dim){};
+        // constexpr auto ol      = decltype(output_lens){};
+        // constexpr index_int nd = osl.size();
+        // for(index_int i = 0; i < nd; i++)
+        //     if(tpd[i] * ol[i] != osl[i])
+        //         return true;
+        // return false;
+    }();
+
     index idx;
     array<index_int, NDIM> tile_origin;
 
@@ -137,7 +149,10 @@ struct spatial_tiler
         idx.local_stride(_c<halo_total_v>, [&](auto i) {
             auto halo_multi = halo_shape.multi(index_int{i});
             auto src_pos    = tile_origin + halo_multi;
-            smem[i]         = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0};
+            if constexpr(is_padded)
+                smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0};
+            else
+                smem[i] = input_ch[src_pos];
         });
 
         return make_tensor_view(smem.data(), halo_shape);
@@ -150,8 +165,11 @@ struct spatial_tiler
         idx.local_stride(_c<output_total>, [&](auto j) {
             auto out_multi = output_shape.multi(index_int{j});
             auto out_pos   = tile_origin + out_multi;
-            if(not in_bounds(out_pos, out_spatial_lens))
-                return;
+            if constexpr(is_padded)
+            {
+                if(not in_bounds(out_pos, out_spatial_lens))
+                    return;
+            }
             f(out_pos, out_multi);
         });
     }

From 7bc6d7842c5c0ddbe01d83100b5bc9477bb917cf Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:47:39 +0000
Subject: [PATCH 25/84] Remove lines

---
 .../kernels/include/migraphx/kernels/spatial_tiler.hpp    | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 366e80aa4d2..0db1f1847da 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -88,14 +88,6 @@ struct spatial_tiler
 
     static constexpr bool is_padded = [] {
         return (out_spatial_lens != tiles_per_dim * output_lens);
-        // constexpr auto osl     = decltype(out_spatial_lens){};
-        // constexpr auto tpd     = decltype(tiles_per_dim){};
-        // constexpr auto ol      = decltype(output_lens){};
-        // constexpr index_int nd = osl.size();
-        // for(index_int i = 0; i < nd; i++)
-        //     if(tpd[i] * ol[i] != osl[i])
-        //         return true;
-        // return false;
     }();
 
     index idx;

From e3077b8cb8ecbcd92000e86796c341ba5eaf9b7b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 22:46:21 +0000
Subject: [PATCH 26/84] Use functions instead of variables

---
 src/targets/gpu/jit/channelwise_conv.cpp      |   3 +-
 .../migraphx/kernels/spatial_tiler.hpp        | 137 ++++++++++--------
 2 files changed, 78 insertions(+), 62 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 8290ffe6114..e2ff0c31ade 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -153,7 +153,8 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         }
         else
         {
-            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
+            // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}});
             // for(auto opt : {1, 2})
             // {
             //     tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread",
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 0db1f1847da..7f3d081cf29 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -47,60 +47,74 @@ constexpr bool in_bounds(Pos pos, Lens lens)
 template <index_int NTiles, class TileLens, class OutputShape>
 struct spatial_tiler
 {
-    static constexpr auto keep_spatial = [](auto, auto i, auto) { return i >= 2; };
+    static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; }
 
     // Full-rank tile lens: [1, 1, TileH, TileW]
-    static constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{});
+    static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); }
 
     // Output region per block: tile with last dim scaled by NTiles
-    static constexpr auto output_lens = return_array_c([] {
-        auto result       = decltype(tile_lens){};
-        constexpr auto nd = result.size();
-        array<index_int, nd> r;
-        for(index_int i = 0; i < nd; i++)
-            r[i] = result[i];
-        r[nd - 1] *= NTiles;
-        return r;
-    });
-
-    static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens;
-
-    static constexpr auto tiles_per_dim =
-        transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; });
-
-    static constexpr auto block_lens  = return_array_c([] {
-        constexpr auto tpd     = decltype(tiles_per_dim){};
-        constexpr index_int nd = tpd.size();
-        constexpr auto olens   = OutputShape{}.lens;
-        array<index_int, nd> result;
-        for(index_int i = 0; i < nd; i++)
-            result[i] = tpd[i];
-        result[0] = olens[0];
-        result[1] = olens[1];
-        return result;
-    });
-    static constexpr auto block_shape = make_shape(block_lens);
-
-    static constexpr auto output_shape      = make_shape(output_lens);
-    static constexpr index_int output_total = output_lens.product();
-    static constexpr index_int tiles_total  = tiles_per_dim.product();
-    static constexpr index_int NDIM         = out_spatial_lens.size();
-
-    static constexpr bool is_padded = [] {
-        return (out_spatial_lens != tiles_per_dim * output_lens);
-    }();
+    static constexpr auto output_lens()
+    {
+        return return_array_c([] {
+            auto result       = decltype(tile_lens()){};
+            constexpr auto nd = result.size();
+            array<index_int, nd> r;
+            for(index_int i = 0; i < nd; i++)
+                r[i] = result[i];
+            r[nd - 1] *= NTiles;
+            return r;
+        });
+    }
+
+    static constexpr auto out_spatial_lens()
+    {
+        return make_slice(OutputShape{}, keep_spatial()).lens;
+    }
+
+    static constexpr auto tiles_per_dim()
+    {
+        return transform(
+            out_spatial_lens(), output_lens(), [](auto o, auto t) { return (o + t - 1) / t; });
+    }
+
+    static constexpr auto block_lens()
+    {
+        return return_array_c([] {
+            constexpr auto tpd     = decltype(tiles_per_dim()){};
+            constexpr index_int nd = tpd.size();
+            constexpr auto olens   = OutputShape{}.lens;
+            array<index_int, nd> result;
+            for(index_int i = 0; i < nd; i++)
+                result[i] = tpd[i];
+            result[0] = olens[0];
+            result[1] = olens[1];
+            return result;
+        });
+    }
+
+    static constexpr auto block_shape() { return make_shape(block_lens()); }
+
+    static constexpr auto output_shape() { return make_shape(output_lens()); }
+    static constexpr index_int output_total() { return output_lens().product(); }
+    static constexpr index_int tiles_total() { return tiles_per_dim().product(); }
+    static constexpr index_int NDIM() { return out_spatial_lens().size(); }
+
+    static constexpr bool is_padded()
+    {
+        return (out_spatial_lens() != tiles_per_dim() * output_lens());
+    }
 
     index idx;
-    array<index_int, NDIM> tile_origin;
+    array<index_int, NDIM()> tile_origin;
 
     // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial)
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens;
+        constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
         constexpr auto halo_extra =
-            transform(input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; });
-        return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; });
+            transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
+        return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 
     // Type for shared memory allocation
@@ -117,8 +131,8 @@ struct spatial_tiler
     template <class Tensor>
     __device__ auto slice(Tensor t) const
     {
-        constexpr auto n_ch = nslices(get_shape_c<Tensor>{}, keep_spatial);
-        return slice_tensor(t, (idx.group / tiles_total) % index_int{n_ch}, keep_spatial);
+        constexpr auto n_ch = nslices(get_shape_c<Tensor>{}, keep_spatial());
+        return slice_tensor(t, (idx.group / tiles_total()) % index_int{n_ch}, keep_spatial());
     }
 
     // Copy input halo tile into shared memory, return tensor_view over smem
@@ -129,19 +143,19 @@ struct spatial_tiler
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr auto halo_shape        = make_shape(hl);
         constexpr index_int halo_total_v = hl.product();
-        constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial).lens;
+        constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial()).lens;
 
-        constexpr auto n_out  = nslices(OutputShape{}, keep_spatial);
-        constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial);
+        constexpr auto n_out  = nslices(OutputShape{}, keep_spatial());
+        constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial());
         constexpr auto groups = n_out / n_in;
-        auto channel_idx      = idx.group / tiles_total;
+        auto channel_idx      = idx.group / tiles_total();
         auto input_ch =
-            slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial);
+            slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial());
 
         idx.local_stride(_c<halo_total_v>, [&](auto i) {
             auto halo_multi = halo_shape.multi(index_int{i});
             auto src_pos    = tile_origin + halo_multi;
-            if constexpr(is_padded)
+            if constexpr(is_padded())
                 smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0};
             else
                 smem[i] = input_ch[src_pos];
@@ -154,12 +168,12 @@ struct spatial_tiler
     template <class F>
     __device__ void for_each(F f) const
     {
-        idx.local_stride(_c<output_total>, [&](auto j) {
-            auto out_multi = output_shape.multi(index_int{j});
+        idx.local_stride(_c<output_total()>, [&](auto j) {
+            auto out_multi = output_shape().multi(index_int{j});
             auto out_pos   = tile_origin + out_multi;
-            if constexpr(is_padded)
+            if constexpr(is_padded())
             {
-                if(not in_bounds(out_pos, out_spatial_lens))
+                if(not in_bounds(out_pos, out_spatial_lens()))
                     return;
             }
             f(out_pos, out_multi);
@@ -172,13 +186,14 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
 {
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
-    auto block_multi = tiler_type::block_shape.multi(idx.group);
-    auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM>, [&](auto d) -> index_int {
-        if constexpr(d < 2)
-            return 0;
-        else
-            return block_multi[d] * tiler_type::output_lens[d];
-    });
+    auto block_multi = tiler_type::block_shape().multi(idx.group);
+    auto tile_origin =
+        generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
+            if constexpr(d < 2)
+                return 0;
+            else
+                return block_multi[d] * tiler_type::output_lens()[d];
+        });
 
     return tiler_type{idx, tile_origin};
 }

From 414aab469ccf5ce40632c0186e0f83c32c3e5625 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 22:46:25 +0000
Subject: [PATCH 27/84] Format

---
 .../migraphx/kernels/spatial_tiler.hpp        | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 7f3d081cf29..043d391c835 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -47,7 +47,10 @@ constexpr bool in_bounds(Pos pos, Lens lens)
 template <index_int NTiles, class TileLens, class OutputShape>
 struct spatial_tiler
 {
-    static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; }
+    static constexpr auto keep_spatial()
+    {
+        return [](auto, auto i, auto) { return i >= 2; };
+    }
 
     // Full-rank tile lens: [1, 1, TileH, TileW]
     static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); }
@@ -149,8 +152,8 @@ struct spatial_tiler
         constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial());
         constexpr auto groups = n_out / n_in;
         auto channel_idx      = idx.group / tiles_total();
-        auto input_ch =
-            slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial());
+        auto input_ch         = slice_tensor(
+            input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial());
 
         idx.local_stride(_c<halo_total_v>, [&](auto i) {
             auto halo_multi = halo_shape.multi(index_int{i});
@@ -187,13 +190,12 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
     auto block_multi = tiler_type::block_shape().multi(idx.group);
-    auto tile_origin =
-        generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
-            if constexpr(d < 2)
-                return 0;
-            else
-                return block_multi[d] * tiler_type::output_lens()[d];
-        });
+    auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
+        if constexpr(d < 2)
+            return 0;
+        else
+            return block_multi[d] * tiler_type::output_lens()[d];
+    });
 
     return tiler_type{idx, tile_origin};
 }

From e56c4f16b02bd3b487112bb723e843626d7e9d4c Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 22:59:15 +0000
Subject: [PATCH 28/84] Inline methods

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 34 +++++++-------
 .../migraphx/kernels/spatial_tiler.hpp        | 46 +++++++------------
 2 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index e2ff0c31ade..383b812e671 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -154,23 +154,23 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         else
         {
             // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
-            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}});
-            // for(auto opt : {1, 2})
-            // {
-            //     tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread",
-            //     opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32},
-            //     {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w",
-            //     32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24},
-            //     {"tile_w", 16}, {"outputs_per_thread", opt}});
-            //     // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread",
-            //     opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4},
-            //     {"outputs_per_thread", opt}});
-
-            //     // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread",
-            //     opt}});
-            //     // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread",
-            //     opt}});
-            // }
+            // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}});
+            for(auto opt : {1, 2})
+            {
+                tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread",
+                opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32},
+                {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w",
+                32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24},
+                {"tile_w", 16}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread",
+                opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4},
+                {"outputs_per_thread", opt}});
+
+                tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread",
+                opt}});
+                tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread",
+                opt}});
+            }
         }
         return tc;
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 043d391c835..08e44df231e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -47,19 +47,13 @@ constexpr bool in_bounds(Pos pos, Lens lens)
 template <index_int NTiles, class TileLens, class OutputShape>
 struct spatial_tiler
 {
-    static constexpr auto keep_spatial()
-    {
-        return [](auto, auto i, auto) { return i >= 2; };
-    }
-
-    // Full-rank tile lens: [1, 1, TileH, TileW]
-    static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); }
+    static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; }
 
     // Output region per block: tile with last dim scaled by NTiles
     static constexpr auto output_lens()
     {
         return return_array_c([] {
-            auto result       = decltype(tile_lens()){};
+            auto result       = join(index_ints<1, 1>{}, TileLens{});
             constexpr auto nd = result.size();
             array<index_int, nd> r;
             for(index_int i = 0; i < nd; i++)
@@ -80,25 +74,6 @@ struct spatial_tiler
             out_spatial_lens(), output_lens(), [](auto o, auto t) { return (o + t - 1) / t; });
     }
 
-    static constexpr auto block_lens()
-    {
-        return return_array_c([] {
-            constexpr auto tpd     = decltype(tiles_per_dim()){};
-            constexpr index_int nd = tpd.size();
-            constexpr auto olens   = OutputShape{}.lens;
-            array<index_int, nd> result;
-            for(index_int i = 0; i < nd; i++)
-                result[i] = tpd[i];
-            result[0] = olens[0];
-            result[1] = olens[1];
-            return result;
-        });
-    }
-
-    static constexpr auto block_shape() { return make_shape(block_lens()); }
-
-    static constexpr auto output_shape() { return make_shape(output_lens()); }
-    static constexpr index_int output_total() { return output_lens().product(); }
     static constexpr index_int tiles_total() { return tiles_per_dim().product(); }
     static constexpr index_int NDIM() { return out_spatial_lens().size(); }
 
@@ -171,8 +146,8 @@ struct spatial_tiler
     template <class F>
     __device__ void for_each(F f) const
     {
-        idx.local_stride(_c<output_total()>, [&](auto j) {
-            auto out_multi = output_shape().multi(index_int{j});
+        idx.local_stride(_c<output_lens().product()>, [&](auto j) {
+            auto out_multi = make_shape(output_lens()).multi(index_int{j});
             auto out_pos   = tile_origin + out_multi;
             if constexpr(is_padded())
             {
@@ -189,7 +164,18 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
 {
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
-    auto block_multi = tiler_type::block_shape().multi(idx.group);
+    constexpr auto block_shape = make_shape(return_array_c([] {
+        constexpr auto tpd     = decltype(tiler_type::tiles_per_dim()){};
+        constexpr index_int nd = tpd.size();
+        constexpr auto olens   = OutputShape{}.lens;
+        array<index_int, nd> result;
+        for(index_int i = 0; i < nd; i++)
+            result[i] = tpd[i];
+        result[0] = olens[0];
+        result[1] = olens[1];
+        return result;
+    }));
+    auto block_multi = block_shape.multi(idx.group);
     auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;

From b51c74f46d7a26ae31dbe53b10918f878a02919b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 22:59:18 +0000
Subject: [PATCH 29/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp      | 30 +++++++++++--------
 .../migraphx/kernels/spatial_tiler.hpp        |  7 +++--
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 383b812e671..b0eb76cb76e 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -157,19 +157,23 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}});
             for(auto opt : {1, 2})
             {
-                tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread",
-                opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32},
-                {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w",
-                32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24},
-                {"tile_w", 16}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread",
-                opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4},
-                {"outputs_per_thread", opt}});
-
-                tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread",
-                opt}});
-                tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread",
-                opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}});
+
+                tc.solutions.push_back(
+                    {{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}});
+                tc.solutions.push_back(
+                    {{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}});
             }
         }
         return tc;
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 08e44df231e..df4c7c22fbc 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -47,7 +47,10 @@ constexpr bool in_bounds(Pos pos, Lens lens)
 template <index_int NTiles, class TileLens, class OutputShape>
 struct spatial_tiler
 {
-    static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; }
+    static constexpr auto keep_spatial()
+    {
+        return [](auto, auto i, auto) { return i >= 2; };
+    }
 
     // Output region per block: tile with last dim scaled by NTiles
     static constexpr auto output_lens()
@@ -175,7 +178,7 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
         result[1] = olens[1];
         return result;
     }));
-    auto block_multi = block_shape.multi(idx.group);
+    auto block_multi           = block_shape.multi(idx.group);
     auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;

From 3d4bfe448ad3b43a43df5a7d833a76e6dd5c2c9b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:45:46 -0600
Subject: [PATCH 30/84] Update quick tuning list

---
 src/targets/gpu/jit/channelwise_conv.cpp | 41 +++++++-----------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index b0eb76cb76e..a1e2939c722 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -84,12 +84,12 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
                 tile_sizes[d] = 1;
         }
 
-        // Outputs per thread along W (last spatial dim)
-        auto outputs_per_thread = v.get("outputs_per_thread", std::size_t{4});
+        // Outputs per lane along W (last spatial dim)
+        auto noutputs = v.get("noutputs", std::size_t{4});
 
-        // Output tile = thread tile with last dim scaled by outputs_per_thread
+        // Output tile = lane tile with last dim scaled by noutputs
         std::vector<std::size_t> output_tile_sizes = tile_sizes;
-        output_tile_sizes.back() *= outputs_per_thread;
+        output_tile_sizes.back() *= noutputs;
 
         std::size_t block_size = 1;
         for(auto t : tile_sizes)
@@ -107,7 +107,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         auto src = interpolate_string(channelwise_conv_kernel,
                                       {{"tile", to_string_range(tile_sizes)},
-                                       {"ntiles", std::to_string(outputs_per_thread)}});
+                                       {"ntiles", std::to_string(noutputs)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
@@ -123,7 +123,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
     optional<tuning_config> get_tuning_config(const context& ctx,
                                               instruction_ref ins,
-                                              const operation& op,
+                                              const operation&,
                                               bool exhaustive) const
     {
         tuning_config tc;
@@ -147,34 +147,17 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
                         continue;
                     for(auto opt : {1, 2, 4, 8})
                         tc.solutions.push_back(
-                            {{"tile_h", tile_h}, {"tile_w", tile_w}, {"outputs_per_thread", opt}});
+                            {{"tile_h", tile_h}, {"tile_w", tile_w}, {"noutputs", opt}});
                 }
             }
         }
         else
         {
-            // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}});
-            // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}});
-            for(auto opt : {1, 2})
-            {
-                tc.solutions.push_back(
-                    {{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}});
-
-                tc.solutions.push_back(
-                    {{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}});
-                tc.solutions.push_back(
-                    {{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}});
-            }
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}});
+            tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}});
+            tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}});        
         }
         return tc;
     }

From a362a19e879debe0e88d529f42633188db6bb534 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 17 Feb 2026 17:45:50 -0600
Subject: [PATCH 31/84] Format channelwise_conv.cpp

---
 src/targets/gpu/jit/channelwise_conv.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index a1e2939c722..f8b6d3ef190 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -105,9 +105,9 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
-        auto src = interpolate_string(channelwise_conv_kernel,
-                                      {{"tile", to_string_range(tile_sizes)},
-                                       {"ntiles", std::to_string(noutputs)}});
+        auto src = interpolate_string(
+            channelwise_conv_kernel,
+            {{"tile", to_string_range(tile_sizes)}, {"ntiles", std::to_string(noutputs)}});
 
         return compile_hip_code_object(ctx, src, options);
     }
@@ -157,7 +157,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}});
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}});
             tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}});
-            tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}});        
+            tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}});
         }
         return tc;
     }

From 208c7ada24ac964bba6a9162e12c9005511d53cd Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 18 Feb 2026 12:38:28 -0600
Subject: [PATCH 32/84] Add another channelwise conv quick tuning config

---
 src/targets/gpu/jit/channelwise_conv.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index f8b6d3ef190..d41a1e4d13d 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -158,6 +158,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}});
             tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}});
             tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}});
+            tc.solutions.push_back({{"tile_h", 56}, {"tile_w", 4}, {"noutputs", 1}});
         }
         return tc;
     }

From f2daa29d77310d393b38b2b680dc0ff121d6b6c8 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 18 Feb 2026 17:33:48 -0600
Subject: [PATCH 33/84] Add more channelwise conv quick tuning configs

---
 src/targets/gpu/jit/channelwise_conv.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index d41a1e4d13d..cfe905a40f6 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -154,11 +154,21 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         else
         {
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}});
-            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}});
+            
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 8}, {"noutputs", 8}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 16}, {"noutputs", 2}});
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}});
+            tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}});
+            tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 8}, {"noutputs", 4}});
+            tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 16}, {"noutputs", 2}});
             tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}});
+            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 16}, {"noutputs", 8}});
+            tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"noutputs", 1}});
+            tc.solutions.push_back({{"tile_h", 40}, {"tile_w", 12}, {"noutputs", 1}});
             tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}});
             tc.solutions.push_back({{"tile_h", 56}, {"tile_w", 4}, {"noutputs", 1}});
+            tc.solutions.push_back({{"tile_h", 76}, {"tile_w", 8}, {"noutputs", 8}});
+            tc.solutions.push_back({{"tile_h", 128}, {"tile_w", 8}, {"noutputs", 8}});
         }
         return tc;
     }

From 36110cf5e735ac1ba34a6eebdd9df2e6dc0b8729 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 18 Feb 2026 17:33:53 -0600
Subject: [PATCH 34/84] Format channelwise_conv.cpp

---
 src/targets/gpu/jit/channelwise_conv.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index cfe905a40f6..bc84ceabf78 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -154,7 +154,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         else
         {
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}});
-            
+
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 8}, {"noutputs", 8}});
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 16}, {"noutputs", 2}});
             tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}});

From 882fe3b685e1d88b1146cf57e0324773fb29db50 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 13:33:19 -0600
Subject: [PATCH 35/84] Add pointwise fusion for channelwise conv

---
 src/targets/gpu/fuse_ops.cpp                  |  34 ++++++
 src/targets/gpu/jit/channelwise_conv.cpp      |  29 ++++-
 .../migraphx/kernels/channelwise_conv.hpp     |  13 ++-
 test/gpu/fuse_ops.cpp                         | 108 +++++++++++++++++-
 test/verify/test_channelwise_conv_add.cpp     |  50 ++++++++
 .../verify/test_channelwise_conv_add_relu.cpp |  51 +++++++++
 test/verify/test_channelwise_conv_relu.cpp    |  47 ++++++++
 7 files changed, 323 insertions(+), 9 deletions(-)
 create mode 100644 test/verify/test_channelwise_conv_add.cpp
 create mode 100644 test/verify/test_channelwise_conv_add_relu.cpp
 create mode 100644 test/verify/test_channelwise_conv_relu.cpp

diff --git a/src/targets/gpu/fuse_ops.cpp b/src/targets/gpu/fuse_ops.cpp
index ac925269fbc..e102588c901 100644
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -983,6 +983,39 @@ struct find_layernorm_pointwise
     }
 };
 
+struct find_channelwise_conv_pointwise
+{
+    auto matcher() const
+    {
+        return precompile_name("pointwise")(
+            match::not_tuple(),
+            match::arg(0)(
+                precompile_name("gpu::channelwise_conv").bind("channelwise_conv")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto pw_ins          = r.result;
+        auto channelwise_ins = r.instructions["channelwise_conv"];
+        if(not channelwise_ins->module_inputs().empty())
+            return;
+        auto* pm       = pw_ins->module_inputs().front();
+        auto pw_inputs = pw_ins->inputs();
+        auto cw_pos    = std::find(pw_inputs.begin(), pw_inputs.end(), channelwise_ins);
+        assert(cw_pos != pw_inputs.end());
+        pw_inputs.erase(cw_pos);
+        auto inputs = channelwise_ins->inputs();
+        inputs.pop_back();
+        inputs.insert(inputs.end(), pw_inputs.begin(), pw_inputs.end());
+
+        auto cw_op_val            = channelwise_ins->get_operator().to_value();
+        cw_op_val["output_shape"] = to_value(pw_ins->get_shape());
+
+        m.replace_instruction(
+            pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm});
+    }
+};
+
 struct find_concat_pointwise
 {
     auto matcher() const
@@ -1032,6 +1065,7 @@ void fuse_ops::apply(module& m) const
 #endif
     match::find_matches(m,
                         find_layernorm_pointwise{},
+                        find_channelwise_conv_pointwise{},
                         find_concat_pointwise{},
                         find_contiguous_transpose_rocblas_gemm{},
 #if MIGRAPHX_USE_HIPBLASLT
diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index bc84ceabf78..e308dc1bab9 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -25,26 +25,32 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
+using namespace migraphx::gpu::gen; // NOLINT
+
 // NOLINTNEXTLINE
 static const char* const channelwise_conv_kernel = R"__migraphx__(
 #include <migraphx/kernels/channelwise_conv.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/generic_constant.hpp>
+#include <migraphx/kernels/ops.hpp>
 #include <args.hpp>
 
 namespace migraphx {
 
+${preamble}
+
 extern "C" {
 
-MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p)
+MIGRAPHX_GLOBAL void ${kernel}(${params})
 {
-    transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) {
-        channelwise_conv<index_ints<${tile}>, ${ntiles}>(index_ints<${tile}>{}, output, x, w);
+    transform_args(make_tensors(), rotate_last())(${args})([](auto output, auto x, auto w, auto... inputs) {
+        channelwise_conv<index_ints<${tile}>, ${ntiles}>(index_ints<${tile}>{}, ${post}, output, x, w, inputs...);
     });
 }
 
@@ -65,7 +71,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         const auto& out_s      = inputs.back();
         options.inputs         = inputs;
         options.output         = out_s;
-        options.kernel_name    = "channelwise_conv_kernel";
+        options.kernel_name    = v.get("kernel", std::string{"channelwise_conv_kernel"});
         options.virtual_inputs = inputs;
 
         auto out_lens = out_s.lens();
@@ -107,7 +113,13 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         auto src = interpolate_string(
             channelwise_conv_kernel,
-            {{"tile", to_string_range(tile_sizes)}, {"ntiles", std::to_string(noutputs)}});
+            {{"tile", to_string_range(tile_sizes)},
+             {"ntiles", std::to_string(noutputs)},
+             {"kernel", options.kernel_name},
+             {"params", enum_params(inputs.size(), "void * private_p")},
+             {"args", enum_params(inputs.size(), "private_p")},
+             {"post", v.get("post", std::string{"op::id{}"})},
+             {"preamble", v.get("preamble", std::string{})}});
 
         return compile_hip_code_object(ctx, src, options);
     }
@@ -118,6 +130,13 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         auto v = op.to_value();
         for(const auto& x : solution)
             v.insert(x);
+        if(not ins->module_inputs().empty())
+        {
+            auto* pm      = ins->module_inputs().front();
+            v["preamble"] = generate_pointwise(*pm, "post_channelwise_conv");
+            v["post"]     = "MIGRAPHX_LIFT(post_channelwise_conv)";
+            v["kernel"]   = "channelwise_conv_" + generate_name_from_ops(*pm) + "_kernel";
+        }
         return compile_op(ctx, to_shapes(ins->inputs()), v);
     }
 
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index fadf92159c0..103d8c074cf 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -31,8 +31,15 @@
 
 namespace migraphx {
 
-template <class TileLens, index_int NTiles, class Output, class Input, class Weights>
-__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
+template <class TileLens,
+          index_int NTiles,
+          class F,
+          class Output,
+          class Input,
+          class Weights,
+          class... Inputs>
+__device__ void
+channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs)
 {
     auto idx   = make_index();
     auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{});
@@ -56,7 +63,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w)
             auto k_multi = wregs.get_shape().multi(ki);
             acc += x_ch[out_multi + k_multi] * wregs[k_multi];
         });
-        out_ch[out_pos] = acc;
+        out_ch[out_pos] = f(acc, tiler.slice(inputs)[out_pos]...);
     });
 }
 
diff --git a/test/gpu/fuse_ops.cpp b/test/gpu/fuse_ops.cpp
index 9867515e6a8..4377afdafb0 100644
--- a/test/gpu/fuse_ops.cpp
+++ b/test/gpu/fuse_ops.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -421,4 +421,110 @@ TEST_CASE(concat_pointwise_contiguous)
     EXPECT(p1 == p2);
 }
 
+TEST_CASE(channelwise_conv_pointwise)
+{
+    migraphx::shape sx{migraphx::shape::float_type, {2, 4, 8, 8}};
+    migraphx::shape sw{migraphx::shape::float_type, {4, 1, 3, 3}};
+    migraphx::shape sout{migraphx::shape::float_type, {2, 4, 6, 6}};
+
+    auto create_program = [=](bool first_arg_conv) {
+        migraphx::program p;
+        auto* mm       = p.get_main_module();
+        auto x         = mm->add_parameter("x", sx);
+        auto w         = mm->add_parameter("w", sw);
+        auto z         = mm->add_parameter("z", sout);
+        auto alloc     = migraphx::make_op("allocate", {{"shape", to_value(sout)}});
+        auto alloc_ins = mm->add_instruction(alloc);
+        auto conv_ins =
+            mm->add_instruction(make_precompile_op("gpu::channelwise_conv"), x, w, alloc_ins);
+        std::vector<migraphx::instruction_ref> pw_inputs = {conv_ins, z};
+        if(not first_arg_conv)
+        {
+            pw_inputs = {z, conv_ins};
+        }
+        auto* pw_add =
+            create_pointwise_module(p, "main:pointwise0", pw_inputs, single_pointwise("add"));
+        auto alloc_ins2 = mm->add_instruction(alloc);
+        pw_inputs.push_back(alloc_ins2);
+        auto add_ins =
+            mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add});
+        mm->add_return({add_ins});
+        return p;
+    };
+
+    auto create_fused_program = [=]() {
+        migraphx::program p;
+        auto* mm       = p.get_main_module();
+        auto x         = mm->add_parameter("x", sx);
+        auto w         = mm->add_parameter("w", sw);
+        auto z         = mm->add_parameter("z", sout);
+        auto alloc     = migraphx::make_op("allocate", {{"shape", to_value(sout)}});
+        auto alloc_ins = mm->add_instruction(alloc);
+        auto* pw_add =
+            create_pointwise_module(p, "main:pointwise0", {x, z}, single_pointwise("add"));
+        auto conv_op     = migraphx::make_op("gpu::channelwise_conv");
+        auto pre_comp_op = migraphx::make_op(
+            "gpu::precompile_op",
+            {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}});
+        auto fused_ins =
+            mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add});
+        mm->add_return({fused_ins});
+        return p;
+    };
+
+    {
+        migraphx::program p1 = create_program(true);
+        run_pass(p1);
+        migraphx::program p2 = create_fused_program();
+        EXPECT(p1 == p2);
+    }
+    {
+        // conv is not arg(0), should not fuse
+        migraphx::program p1 = create_program(false);
+        run_pass(p1);
+        EXPECT(p1 == create_program(false));
+    }
+}
+
+TEST_CASE(channelwise_conv_pointwise_already_fused)
+{
+    migraphx::shape sx{migraphx::shape::float_type, {2, 4, 8, 8}};
+    migraphx::shape sw{migraphx::shape::float_type, {4, 1, 3, 3}};
+    migraphx::shape sout{migraphx::shape::float_type, {2, 4, 6, 6}};
+
+    auto create_program = [=]() {
+        migraphx::program p;
+        auto* mm       = p.get_main_module();
+        auto x         = mm->add_parameter("x", sx);
+        auto w         = mm->add_parameter("w", sw);
+        auto z         = mm->add_parameter("z", sout);
+        auto y         = mm->add_parameter("y", sout);
+        auto alloc     = migraphx::make_op("allocate", {{"shape", to_value(sout)}});
+        auto alloc_ins = mm->add_instruction(alloc);
+        // channelwise_conv already has a module (already fused)
+        auto* pw_relu =
+            create_pointwise_module(p, "main:pointwise0", {x}, [](auto* pm, const auto& inputs) {
+                return pm->add_instruction(migraphx::make_op("relu"), inputs[0]);
+            });
+        auto conv_op     = migraphx::make_op("gpu::channelwise_conv");
+        auto pre_comp_op = migraphx::make_op(
+            "gpu::precompile_op",
+            {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}});
+        auto conv_ins =
+            mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu});
+        auto* pw_add =
+            create_pointwise_module(p, "main:pointwise1", {conv_ins, y}, single_pointwise("add"));
+        auto alloc_ins2 = mm->add_instruction(alloc);
+        auto add_ins    = mm->add_instruction(
+            make_precompile_op("pointwise"), {conv_ins, y, alloc_ins2}, {pw_add});
+        mm->add_return({add_ins});
+        return p;
+    };
+
+    // Should not fuse since channelwise_conv already has a module
+    migraphx::program p1 = create_program();
+    run_pass(p1);
+    EXPECT(p1 == create_program());
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
diff --git a/test/verify/test_channelwise_conv_add.cpp b/test/verify/test_channelwise_conv_add.cpp
new file mode 100644
index 00000000000..7354a4616cb
--- /dev/null
+++ b/test/verify/test_channelwise_conv_add.cpp
@@ -0,0 +1,50 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_add : verify_program<test_channelwise_conv_add<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        auto bias    = mm->add_parameter("b", migraphx::shape{DType, {4}});
+        auto conv =
+            mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights);
+        auto bcast_bias = mm->add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {2, 4, 6, 6}}}), bias);
+        mm->add_instruction(migraphx::make_op("add"), conv, bcast_bias);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_add<migraphx::shape::float_type>;
+template struct test_channelwise_conv_add<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_add_relu.cpp b/test/verify/test_channelwise_conv_add_relu.cpp
new file mode 100644
index 00000000000..1665b9b5b63
--- /dev/null
+++ b/test/verify/test_channelwise_conv_add_relu.cpp
@@ -0,0 +1,51 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_add_relu : verify_program<test_channelwise_conv_add_relu<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
+        auto bias    = mm->add_parameter("b", migraphx::shape{DType, {8}});
+        auto conv =
+            mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
+        auto bcast_bias = mm->add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 8, 10, 10}}}), bias);
+        auto add = mm->add_instruction(migraphx::make_op("add"), conv, bcast_bias);
+        mm->add_instruction(migraphx::make_op("relu"), add);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_add_relu<migraphx::shape::float_type>;
+template struct test_channelwise_conv_add_relu<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_relu.cpp b/test/verify/test_channelwise_conv_relu.cpp
new file mode 100644
index 00000000000..ac1510e89f5
--- /dev/null
+++ b/test/verify/test_channelwise_conv_relu.cpp
@@ -0,0 +1,47 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_relu : verify_program<test_channelwise_conv_relu<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        auto conv =
+            mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights);
+        mm->add_instruction(migraphx::make_op("relu"), conv);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_relu<migraphx::shape::float_type>;
+template struct test_channelwise_conv_relu<migraphx::shape::half_type>;

From 24a2645701f9c14adcf0e2b7cf57eec97cc1c636 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 13:33:22 -0600
Subject: [PATCH 36/84] Format channelwise conv pointwise fusion changes

---
 src/targets/gpu/fuse_ops.cpp                    |  6 ++----
 src/targets/gpu/jit/channelwise_conv.cpp        | 17 ++++++++---------
 .../migraphx/kernels/channelwise_conv.hpp       |  3 +--
 test/gpu/fuse_ops.cpp                           |  9 +++------
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/targets/gpu/fuse_ops.cpp b/src/targets/gpu/fuse_ops.cpp
index e102588c901..d2e9503a993 100644
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -989,8 +989,7 @@ struct find_channelwise_conv_pointwise
     {
         return precompile_name("pointwise")(
             match::not_tuple(),
-            match::arg(0)(
-                precompile_name("gpu::channelwise_conv").bind("channelwise_conv")));
+            match::arg(0)(precompile_name("gpu::channelwise_conv").bind("channelwise_conv")));
     }
 
     void apply(module& m, const match::matcher_result& r) const
@@ -1011,8 +1010,7 @@ struct find_channelwise_conv_pointwise
         auto cw_op_val            = channelwise_ins->get_operator().to_value();
         cw_op_val["output_shape"] = to_value(pw_ins->get_shape());
 
-        m.replace_instruction(
-            pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm});
+        m.replace_instruction(pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm});
     }
 };
 
diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index e308dc1bab9..608c33bd63f 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -111,15 +111,14 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
-        auto src = interpolate_string(
-            channelwise_conv_kernel,
-            {{"tile", to_string_range(tile_sizes)},
-             {"ntiles", std::to_string(noutputs)},
-             {"kernel", options.kernel_name},
-             {"params", enum_params(inputs.size(), "void * private_p")},
-             {"args", enum_params(inputs.size(), "private_p")},
-             {"post", v.get("post", std::string{"op::id{}"})},
-             {"preamble", v.get("preamble", std::string{})}});
+        auto src = interpolate_string(channelwise_conv_kernel,
+                                      {{"tile", to_string_range(tile_sizes)},
+                                       {"ntiles", std::to_string(noutputs)},
+                                       {"kernel", options.kernel_name},
+                                       {"params", enum_params(inputs.size(), "void * private_p")},
+                                       {"args", enum_params(inputs.size(), "private_p")},
+                                       {"post", v.get("post", std::string{"op::id{}"})},
+                                       {"preamble", v.get("preamble", std::string{})}});
 
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 103d8c074cf..f7be9bf6a66 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -38,8 +38,7 @@ template <class TileLens,
           class Input,
           class Weights,
           class... Inputs>
-__device__ void
-channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs)
+__device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs)
 {
     auto idx   = make_index();
     auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{});
diff --git a/test/gpu/fuse_ops.cpp b/test/gpu/fuse_ops.cpp
index 4377afdafb0..f7c8c389862 100644
--- a/test/gpu/fuse_ops.cpp
+++ b/test/gpu/fuse_ops.cpp
@@ -446,8 +446,7 @@ TEST_CASE(channelwise_conv_pointwise)
             create_pointwise_module(p, "main:pointwise0", pw_inputs, single_pointwise("add"));
         auto alloc_ins2 = mm->add_instruction(alloc);
         pw_inputs.push_back(alloc_ins2);
-        auto add_ins =
-            mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add});
+        auto add_ins = mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add});
         mm->add_return({add_ins});
         return p;
     };
@@ -466,8 +465,7 @@ TEST_CASE(channelwise_conv_pointwise)
         auto pre_comp_op = migraphx::make_op(
             "gpu::precompile_op",
             {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}});
-        auto fused_ins =
-            mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add});
+        auto fused_ins = mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add});
         mm->add_return({fused_ins});
         return p;
     };
@@ -510,8 +508,7 @@ TEST_CASE(channelwise_conv_pointwise_already_fused)
         auto pre_comp_op = migraphx::make_op(
             "gpu::precompile_op",
             {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}});
-        auto conv_ins =
-            mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu});
+        auto conv_ins = mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu});
         auto* pw_add =
             create_pointwise_module(p, "main:pointwise1", {conv_ins, y}, single_pointwise("add"));
         auto alloc_ins2 = mm->add_instruction(alloc);

From 28e32af6c3f947a703ee4f119542062c3878c481 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 13:45:34 -0600
Subject: [PATCH 37/84] Only enable channelwise conv for float on Navi (gfx1*)

---
 src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp | 3 +++
 src/targets/gpu/prefuse_ops.cpp                      | 7 +++++--
 src/targets/gpu/target.cpp                           | 2 +-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
index bed64052009..ddb30a7f18e 100644
--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -34,8 +34,11 @@ struct module_pass_manager;
 
 namespace gpu {
 
+struct context;
+
 struct MIGRAPHX_GPU_EXPORT prefuse_ops
 {
+    context* ctx = nullptr;
     bool enable_attention = false;
     std::string name() const { return "gpu::prefuse_ops"; }
     void apply(module_pass_manager& mpm) const;
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 34f19869660..df948754d6b 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -297,6 +297,9 @@ struct find_channelwise_convolution
         auto weights     = ins->inputs().back();
         auto num_spatial = ins->get_shape().ndim() - 2;
 
+        if(input->get_shape().type() != shape::float_type)
+            return;
+
         m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights);
     }
 };
@@ -326,8 +329,8 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
         match::find_matches(mpm.get_module(), find_add_layernorm{});
     }
     match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention});
-    match::find_matches(mpm.get_module(), find_channelwise_convolution{});
-
+    if(ctx != nullptr and starts_with(ctx->get_current_device().get_gfx_name(), "gfx1"))
+        match::find_matches(mpm.get_module(), find_channelwise_convolution{});
     if(enabled(MIGRAPHX_DISABLE_MLIR{}))
     {
         inline_group_sub_module(mpm);
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index ad8d0a36f2b..348f1b66495 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -129,7 +129,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         optimize_module{},
         layout_convolution{.channels_last = enabled(MIGRAPHX_ENABLE_NHWC{})},
         dead_code_elimination{},
-        prefuse_ops{},
+        prefuse_ops{.ctx = &ctx},
         dead_code_elimination{},
         dead_code_elimination{},
         rewrite_reduce{},

From e35373cd6aa365a4b5eca45e1644d40ddfadace3 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 13:45:38 -0600
Subject: [PATCH 38/84] Format

---
 src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
index ddb30a7f18e..e559132cd3d 100644
--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -38,7 +38,7 @@ struct context;
 
 struct MIGRAPHX_GPU_EXPORT prefuse_ops
 {
-    context* ctx = nullptr;
+    context* ctx          = nullptr;
     bool enable_attention = false;
     std::string name() const { return "gpu::prefuse_ops"; }
     void apply(module_pass_manager& mpm) const;

From f69d9bb21a08fc0539879c03588485f3332e7da0 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:45:00 -0600
Subject: [PATCH 39/84] Fix tidy

---
 .../include/migraphx/kernels/channelwise_conv.hpp    |  6 +++---
 .../include/migraphx/kernels/spatial_tiler.hpp       | 12 ++++++------
 test/gpu/prefuse_ops.cpp                             |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index f7be9bf6a66..4837e99b719 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -49,15 +49,15 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
     auto w_ch   = tiler.slice(w);
     auto out_ch = tiler.slice(output);
 
-    using T = typename Output::type;
-    array<T, decltype(w_ch.get_shape().elements()){}> wregs_arr;
+    using t = typename Output::type;
+    array<t, decltype(w_ch.get_shape().elements()){}> wregs_arr;
     auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape()));
     copy(w_ch.begin(), w_ch.end(), wregs.begin());
 
     __syncthreads();
 
     tiler.for_each([&](auto out_pos, auto out_multi) {
-        T acc = 0;
+        t acc = 0;
         repeat(wregs.get_shape().elements(), [&](auto ki) {
             auto k_multi = wregs.get_shape().multi(ki);
             acc += x_ch[out_multi + k_multi] * wregs[k_multi];
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index df4c7c22fbc..6adcff59fb4 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -78,7 +78,7 @@ struct spatial_tiler
     }
 
     static constexpr index_int tiles_total() { return tiles_per_dim().product(); }
-    static constexpr index_int NDIM() { return out_spatial_lens().size(); }
+    static constexpr index_int ndim() { return out_spatial_lens().size(); }
 
     static constexpr bool is_padded()
     {
@@ -86,7 +86,7 @@ struct spatial_tiler
     }
 
     index idx;
-    array<index_int, NDIM()> tile_origin;
+    array<index_int, ndim()> tile_origin;
 
     // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial)
     template <class InputShape>
@@ -102,10 +102,10 @@ struct spatial_tiler
     template <class Input>
     __device__ auto shared_allocate() const
     {
-        using T                          = typename Input::type;
+        using t                          = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr index_int halo_total_v = hl.product();
-        return uninitialized_buffer<T, halo_total_v>{};
+        return uninitialized_buffer<t, halo_total_v>{};
     }
 
     // Slice a tensor to per-channel spatial view
@@ -120,7 +120,7 @@ struct spatial_tiler
     template <class Input, class Smem>
     __device__ auto copy(Input input, Smem& smem) const
     {
-        using T                          = typename Input::type;
+        using t                          = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr auto halo_shape        = make_shape(hl);
         constexpr index_int halo_total_v = hl.product();
@@ -137,7 +137,7 @@ struct spatial_tiler
             auto halo_multi = halo_shape.multi(index_int{i});
             auto src_pos    = tile_origin + halo_multi;
             if constexpr(is_padded())
-                smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0};
+                smem[i] = in_bounds(src_pos, input_spatial) ? t{input_ch[src_pos]} : t{0};
             else
                 smem[i] = input_ch[src_pos];
         });
diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp
index bdb99e4097b..425fce6d038 100644
--- a/test/gpu/prefuse_ops.cpp
+++ b/test/gpu/prefuse_ops.cpp
@@ -41,7 +41,7 @@ struct pre_gemm_softmax_gemm : migraphx::gpu::gemm_softmax_gemm
 
 static void run_pass(migraphx::module& m)
 {
-    migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{true}, migraphx::dead_code_elimination{}});
+    migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{.enable_attention=true}, migraphx::dead_code_elimination{}});
 }
 
 TEST_CASE(find_gemm_softmax_gemm)

From fb48be7caf1b59f70020c86282048748d4967b47 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:45:04 -0600
Subject: [PATCH 40/84] Format

---
 test/gpu/prefuse_ops.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp
index 425fce6d038..9499c6a5ec5 100644
--- a/test/gpu/prefuse_ops.cpp
+++ b/test/gpu/prefuse_ops.cpp
@@ -41,7 +41,9 @@ struct pre_gemm_softmax_gemm : migraphx::gpu::gemm_softmax_gemm
 
 static void run_pass(migraphx::module& m)
 {
-    migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{.enable_attention=true}, migraphx::dead_code_elimination{}});
+    migraphx::run_passes(
+        m,
+        {migraphx::gpu::prefuse_ops{.enable_attention = true}, migraphx::dead_code_elimination{}});
 }
 
 TEST_CASE(find_gemm_softmax_gemm)

From ef923a8741e8a92bd909214905bdc9468913cbc1 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:49:41 -0600
Subject: [PATCH 41/84] Fix tidy

---
 src/targets/gpu/jit/channelwise_conv.cpp | 2 +-
 src/targets/gpu/prefuse_ops.cpp          | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 608c33bd63f..2d7a8f3d28c 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -74,7 +74,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         options.kernel_name    = v.get("kernel", std::string{"channelwise_conv_kernel"});
         options.virtual_inputs = inputs;
 
-        auto out_lens = out_s.lens();
+        const auto& out_lens = out_s.lens();
 
         // Thread block tile dimensions
         std::vector<std::size_t> tile_sizes(num_spatial);
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index df948754d6b..2f399de9a4e 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -280,10 +280,8 @@ MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins)
         return false;
     auto x_lens = ins->inputs().front()->get_shape().lens();
     auto c_in   = x_lens[1];
-    auto group  = v.at("group").to<int>();
-    if(group != 1 and group != static_cast<int>(c_in))
-        return false;
-    return true;
+    auto group  = v.at("group").to<std::size_t>();
+    return group == 1 or group == c_in;
 }
 
 struct find_channelwise_convolution

From 513fafc85524ca79479d61ed222efbf8eb03e54f Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:51:53 -0600
Subject: [PATCH 42/84] Update year

---
 src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp         | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/array.hpp   | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/index.hpp   | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp  | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp   | 2 +-
 src/targets/gpu/prefuse_ops.cpp                              | 2 +-
 test/gpu/prefuse_ops.cpp                                     | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
index e559132cd3d..a1afd7ab087 100644
--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
index 10270d20c2c..e7977dd4676 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index 4b9de7ae7ce..1994d0c16c0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
index 410deefb9be..71641fd498d 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
index 0abae0363d7..4578feed9ea 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
index 4bc4d6354b6..e2f7393c32c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 2f399de9a4e..1d8dd6d01bc 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp
index 9499c6a5ec5..90a70c37830 100644
--- a/test/gpu/prefuse_ops.cpp
+++ b/test/gpu/prefuse_ops.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From ec3c657ae4719743bd4538d68e7c562330712773 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:55:28 -0600
Subject: [PATCH 43/84] Fix cppcheck

---
 src/targets/gpu/jit/channelwise_conv.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 2d7a8f3d28c..a950bcfc791 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -97,9 +97,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         std::vector<std::size_t> output_tile_sizes = tile_sizes;
         output_tile_sizes.back() *= noutputs;
 
-        std::size_t block_size = 1;
-        for(auto t : tile_sizes)
-            block_size *= t;
+        std::size_t block_size = std::accumulate(tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>());
 
         // Blocks: N * C_out * prod(ceil(out_spatial / output_tile))
         std::size_t num_blocks = out_lens[0] * out_lens[1];
@@ -150,8 +148,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         if(exhaustive)
         {
             std::vector<std::size_t> sizes;
-            for(auto i : range(1, 64))
-                sizes.push_back(i * 4);
+            transform(range(1, 64), std::back_inserter(sizes), [](auto i) { return i * 4; });
             for(auto tile_h : sizes)
             {
                 for(auto tile_w : sizes)

From 5d8051bfa5682e82aa426c81173c8f06d2ca1d92 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 15:55:32 -0600
Subject: [PATCH 44/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index a950bcfc791..1d153c323ba 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -97,7 +97,8 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         std::vector<std::size_t> output_tile_sizes = tile_sizes;
         output_tile_sizes.back() *= noutputs;
 
-        std::size_t block_size = std::accumulate(tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>());
+        std::size_t block_size = std::accumulate(
+            tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>());
 
         // Blocks: N * C_out * prod(ceil(out_spatial / output_tile))
         std::size_t num_blocks = out_lens[0] * out_lens[1];

From 99c896c67ad44aef914a82f9753d4b9c67b38c3b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:00:53 +0000
Subject: [PATCH 45/84] Use std algorithms in channelwise_conv compiler

---
 src/targets/gpu/jit/channelwise_conv.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 1d153c323ba..cb7762a69b1 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -77,7 +77,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         const auto& out_lens = out_s.lens();
 
         // Thread block tile dimensions
-        std::vector<std::size_t> tile_sizes(num_spatial);
+        std::vector<std::size_t> tile_sizes(num_spatial, 1);
         if(num_spatial == 1)
         {
             tile_sizes[0] = v.get("tile_w", std::size_t{256});
@@ -86,8 +86,6 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
         {
             tile_sizes[0]               = v.get("tile_h", std::size_t{8});
             tile_sizes[num_spatial - 1] = v.get("tile_w", std::size_t{32});
-            for(std::size_t d = 1; d + 1 < num_spatial; ++d)
-                tile_sizes[d] = 1;
         }
 
         // Outputs per lane along W (last spatial dim)
@@ -101,12 +99,15 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>());
 
         // Blocks: N * C_out * prod(ceil(out_spatial / output_tile))
-        std::size_t num_blocks = out_lens[0] * out_lens[1];
-        for(std::size_t d = 0; d < num_spatial; ++d)
-        {
-            auto out_spatial = out_lens[2 + d];
-            num_blocks *= (out_spatial + output_tile_sizes[d] - 1) / output_tile_sizes[d];
-        }
+        auto num_blocks = std::inner_product(
+            out_lens.begin() + 2,
+            out_lens.end(),
+            output_tile_sizes.begin(),
+            out_lens[0] * out_lens[1],
+            std::multiplies<>{},
+            [](auto out_spatial, auto tile) {
+                return (out_spatial + tile - 1) / tile;
+            });
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 

From 9f0903d29cce8e77b9a527e25f6bd1641efe80c4 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:00:56 +0000
Subject: [PATCH 46/84] Format

---
 src/targets/gpu/jit/channelwise_conv.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index cb7762a69b1..60d02fab30d 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -105,9 +105,7 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
             output_tile_sizes.begin(),
             out_lens[0] * out_lens[1],
             std::multiplies<>{},
-            [](auto out_spatial, auto tile) {
-                return (out_spatial + tile - 1) / tile;
-            });
+            [](auto out_spatial, auto tile) { return (out_spatial + tile - 1) / tile; });
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 

From 680328bc022e79e74d2a1dc8b2ca62d70d01005b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:03:34 +0000
Subject: [PATCH 47/84] Move in_bounds function

---
 .../gpu/kernels/include/migraphx/kernels/shape.hpp    | 11 +++++++++++
 .../include/migraphx/kernels/spatial_tiler.hpp        | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
index 54da3fccd38..a2ae4f9c0dc 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
@@ -199,6 +199,17 @@ struct shape : equality_comparable<shape<Lens, Strides>>
     }
 };
 
+template <class Pos, class Lens>
+constexpr bool in_bounds(Pos pos, Lens lens)
+{
+    for(index_int d = 0; d < pos.size(); d++)
+    {
+        if(pos[d] >= lens[d])
+            return false;
+    }
+    return true;
+}
+
 template <class Lens>
 constexpr auto calculate_strides(Lens)
 {
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 6adcff59fb4..b5358469398 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -33,17 +33,6 @@
 
 namespace migraphx {
 
-template <class Pos, class Lens>
-constexpr bool in_bounds(Pos pos, Lens lens)
-{
-    for(index_int d = 0; d < pos.size(); d++)
-    {
-        if(pos[d] >= lens[d])
-            return false;
-    }
-    return true;
-}
-
 template <index_int NTiles, class TileLens, class OutputShape>
 struct spatial_tiler
 {

From 11203097642fd89914ebdb613120e0ffe759778f Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:05:03 +0000
Subject: [PATCH 48/84] Rename type alias t to type in shared_allocate

---
 .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index b5358469398..d427b2271c9 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -91,10 +91,10 @@ struct spatial_tiler
     template <class Input>
     __device__ auto shared_allocate() const
     {
-        using t                          = typename Input::type;
+        using type                          = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr index_int halo_total_v = hl.product();
-        return uninitialized_buffer<t, halo_total_v>{};
+        return uninitialized_buffer<type, halo_total_v>{};
     }
 
     // Slice a tensor to per-channel spatial view

From 76457922593e9eb99a5b92249803d47ceba3f7ea Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:05:06 +0000
Subject: [PATCH 49/84] Format

---
 .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index d427b2271c9..e874365715c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -91,7 +91,7 @@ struct spatial_tiler
     template <class Input>
     __device__ auto shared_allocate() const
     {
-        using type                          = typename Input::type;
+        using type                       = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr index_int halo_total_v = hl.product();
         return uninitialized_buffer<type, halo_total_v>{};

From 32b58940bf6d99a6e234350d16d22793ecf37bee Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:11:43 -0600
Subject: [PATCH 50/84] Fix compilation failure

---
 .../migraphx/kernels/spatial_tiler.hpp        | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index e874365715c..dccdd0eb6c1 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -47,11 +47,8 @@ struct spatial_tiler
         return return_array_c([] {
             auto result       = join(index_ints<1, 1>{}, TileLens{});
             constexpr auto nd = result.size();
-            array<index_int, nd> r;
-            for(index_int i = 0; i < nd; i++)
-                r[i] = result[i];
-            r[nd - 1] *= NTiles;
-            return r;
+            result[nd - 1] *= NTiles;
+            return result;
         });
     }
 
@@ -93,8 +90,7 @@ struct spatial_tiler
     {
         using type                       = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
-        constexpr index_int halo_total_v = hl.product();
-        return uninitialized_buffer<type, halo_total_v>{};
+        return uninitialized_buffer<type, hl.product()>{};
     }
 
     // Slice a tensor to per-channel spatial view
@@ -109,10 +105,9 @@ struct spatial_tiler
     template <class Input, class Smem>
     __device__ auto copy(Input input, Smem& smem) const
     {
-        using t                          = typename Input::type;
+        using type                          = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr auto halo_shape        = make_shape(hl);
-        constexpr index_int halo_total_v = hl.product();
         constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial()).lens;
 
         constexpr auto n_out  = nslices(OutputShape{}, keep_spatial());
@@ -122,11 +117,11 @@ struct spatial_tiler
         auto input_ch         = slice_tensor(
             input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial());
 
-        idx.local_stride(_c<halo_total_v>, [&](auto i) {
+        idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(index_int{i});
             auto src_pos    = tile_origin + halo_multi;
             if constexpr(is_padded())
-                smem[i] = in_bounds(src_pos, input_spatial) ? t{input_ch[src_pos]} : t{0};
+                smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
             else
                 smem[i] = input_ch[src_pos];
         });
@@ -168,7 +163,7 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
         return result;
     }));
     auto block_multi           = block_shape.multi(idx.group);
-    auto tile_origin = generate_array<index_int>(_c<tiler_type::NDIM()>, [&](auto d) -> index_int {
+    auto tile_origin = generate_array<index_int>(_c<tiler_type::ndim()>, [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;
         else

From 214126495ae98df54338b1a86092849282fca3f6 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:11:46 -0600
Subject: [PATCH 51/84] Format

---
 .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index dccdd0eb6c1..6f5c112b530 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -105,7 +105,7 @@ struct spatial_tiler
     template <class Input, class Smem>
     __device__ auto copy(Input input, Smem& smem) const
     {
-        using type                          = typename Input::type;
+        using type                       = typename Input::type;
         constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
         constexpr auto halo_shape        = make_shape(hl);
         constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial()).lens;

From 19cf17396e12cd0b3d27b210aae6b66ce6eef48c Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:24:06 -0600
Subject: [PATCH 52/84] Simplify some more

---
 .../include/migraphx/kernels/spatial_tiler.hpp   | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 6f5c112b530..39aa1c0200c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -64,7 +64,7 @@ struct spatial_tiler
     }
 
     static constexpr index_int tiles_total() { return tiles_per_dim().product(); }
-    static constexpr index_int ndim() { return out_spatial_lens().size(); }
+    static constexpr auto ndim() { return out_spatial_lens().size(); }
 
     static constexpr bool is_padded()
     {
@@ -118,7 +118,7 @@ struct spatial_tiler
             input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial());
 
         idx.local_stride(_c<hl.product()>, [&](auto i) {
-            auto halo_multi = halo_shape.multi(index_int{i});
+            auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
             if constexpr(is_padded())
                 smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
@@ -134,7 +134,7 @@ struct spatial_tiler
     __device__ void for_each(F f) const
     {
         idx.local_stride(_c<output_lens().product()>, [&](auto j) {
-            auto out_multi = make_shape(output_lens()).multi(index_int{j});
+            auto out_multi = make_shape(output_lens()).multi(j);
             auto out_pos   = tile_origin + out_multi;
             if constexpr(is_padded())
             {
@@ -152,18 +152,14 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
     constexpr auto block_shape = make_shape(return_array_c([] {
-        constexpr auto tpd     = decltype(tiler_type::tiles_per_dim()){};
-        constexpr index_int nd = tpd.size();
-        constexpr auto olens   = OutputShape{}.lens;
-        array<index_int, nd> result;
-        for(index_int i = 0; i < nd; i++)
-            result[i] = tpd[i];
+        auto result     = tiler_type::tiles_per_dim().base();        
+        auto olens   = OutputShape{}.lens;
         result[0] = olens[0];
         result[1] = olens[1];
         return result;
     }));
     auto block_multi           = block_shape.multi(idx.group);
-    auto tile_origin = generate_array<index_int>(_c<tiler_type::ndim()>, [&](auto d) -> index_int {
+    auto tile_origin = generate_array<index_int>(tiler_type::ndim(), [&](auto d) -> index_int {
         if constexpr(d < 2)
             return 0;
         else

From b39416ec3ac841315c5f40d41f13fbd90342e701 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:24:09 -0600
Subject: [PATCH 53/84] Format

---
 .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 39aa1c0200c..0c03a986c57 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -152,8 +152,8 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
 
     constexpr auto block_shape = make_shape(return_array_c([] {
-        auto result     = tiler_type::tiles_per_dim().base();        
-        auto olens   = OutputShape{}.lens;
+        auto result = tiler_type::tiles_per_dim().base();
+        auto olens  = OutputShape{}.lens;
         result[0] = olens[0];
         result[1] = olens[1];
         return result;

From 6c990fd3d44ea4f725535fcb53b164b0a693b3c0 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:28:28 +0000
Subject: [PATCH 54/84] Use std::transform

---
 src/targets/gpu/prefuse_ops.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 1d8dd6d01bc..b2e5e2552bd 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -251,15 +251,18 @@ struct channelwise_conv
 
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(2);
+        check_shapes{inputs, *this}.has(2).same_ndims();
         auto x_lens = inputs[0].lens();
         auto w_lens = inputs[1].lens();
         std::vector<std::size_t> out_lens;
         out_lens.push_back(x_lens[0]);
         out_lens.push_back(w_lens[0]);
-        for(std::size_t d = 0; d < num_spatial; ++d)
-            out_lens.push_back(x_lens[2 + d] - w_lens[2 + d] + 1);
-        return {inputs.front().type(), out_lens};
+        std::transform(x_lens.begin() + 2,
+                       x_lens.begin() + 2 + num_spatial,
+                       w_lens.begin() + 2,
+                       std::back_inserter(out_lens),
+                       [](auto x, auto w) { return x - w + 1; });
+        return inputs[0].with_lens(out_lens);
     }
 };
 MIGRAPHX_REGISTER_OP(channelwise_conv);

From 90638f89021747627fb4950452cc192c50fe1ac7 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:31:39 -0600
Subject: [PATCH 55/84] Precompute slices

---
 .../kernels/include/migraphx/kernels/channelwise_conv.hpp    | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 4837e99b719..30cd136443c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -48,6 +48,7 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
     auto x_ch   = tiler.copy(x, smem);
     auto w_ch   = tiler.slice(w);
     auto out_ch = tiler.slice(output);
+    auto xs_pack = pack(tiler.slice(inputs)...);
 
     using t = typename Output::type;
     array<t, decltype(w_ch.get_shape().elements()){}> wregs_arr;
@@ -62,7 +63,9 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
             auto k_multi = wregs.get_shape().multi(ki);
             acc += x_ch[out_multi + k_multi] * wregs[k_multi];
         });
-        out_ch[out_pos] = f(acc, tiler.slice(inputs)[out_pos]...);
+        xs_pack([&](auto... xs) {
+            out_ch[out_pos] = f(acc, xs[out_pos]...);
+        });
     });
 }
 

From 053bf4fd033cbd009aa35496f9902a4a29c76d64 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:31:42 -0600
Subject: [PATCH 56/84] Format

---
 .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 30cd136443c..d75cd590c18 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -63,9 +63,7 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
             auto k_multi = wregs.get_shape().multi(ki);
             acc += x_ch[out_multi + k_multi] * wregs[k_multi];
         });
-        xs_pack([&](auto... xs) {
-            out_ch[out_pos] = f(acc, xs[out_pos]...);
-        });
+        xs_pack([&](auto... xs) { out_ch[out_pos] = f(acc, xs[out_pos]...); });
     });
 }
 

From ffaa5c384d3a2bb24adfb725ab6fafa476f76d0e Mon Sep 17 00:00:00 2001
From: Paul Fultz II <paul.fultz@amd.com>
Date: Mon, 2 Mar 2026 16:32:31 -0600
Subject: [PATCH 57/84] Update
 src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
index e2f7393c32c..89f1a4a615e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
@@ -126,7 +126,7 @@ constexpr auto slice_tensor(Input input, T start, Ss... ss)
     constexpr auto inner_shape = make_slice(get_shape_c<Input>{}, ss...);
     auto outer_lens            = transform(
         get_shape_c<Input>{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; });
-    // TODO: Handle non-divisble dimensions
+    // TODO: Handle non-divisible dimensions
     auto outer_shape = make_shape(outer_lens, get_shape_c<Input>{}.strides * inner_shape.lens);
     auto offset      = outer_shape.index(start);
     MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() ==

From 8a06baf1960d551b2096f607fa3d9cccc4bc8380 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:36:53 -0600
Subject: [PATCH 58/84] Change the navi check

---
 src/targets/gpu/prefuse_ops.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index b2e5e2552bd..8a3645ab828 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -323,6 +323,8 @@ void inline_group_sub_module(module_pass_manager& mpm)
 
 void prefuse_ops::apply(module_pass_manager& mpm) const
 {
+    const auto& device_name = ctx == nullptr ? "" : ctx->get_current_device().get_gfx_name();
+    const bool is_navi = starts_with(device_name, "gfx11") or starts_with(device_name, "gfx12");
     if(enabled(MIGRAPHX_ENABLE_LAYERNORM_FUSION{}))
     {
         match::find_matches(mpm.get_module(), find_layernorm{});
@@ -330,7 +332,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
         match::find_matches(mpm.get_module(), find_add_layernorm{});
     }
     match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention});
-    if(ctx != nullptr and starts_with(ctx->get_current_device().get_gfx_name(), "gfx1"))
+    if(is_navi)
         match::find_matches(mpm.get_module(), find_channelwise_convolution{});
     if(enabled(MIGRAPHX_DISABLE_MLIR{}))
     {

From 258af41258fd1a10e327ed708f8f109c135e40a6 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:45:04 +0000
Subject: [PATCH 59/84] Split verify classes

---
 test/verify/test_channelwise_conv.cpp         | 137 ------------------
 test/verify/test_channelwise_conv_1d.cpp      |  49 +++++++
 .../test_channelwise_conv_depthwise.cpp       |  45 ++++++
 .../test_channelwise_conv_depthwise_5x5.cpp   |  46 ++++++
 test/verify/test_channelwise_conv_large.cpp   |  45 ++++++
 .../test_channelwise_conv_non_divisible.cpp   |  46 ++++++
 .../test_channelwise_conv_single_channel.cpp  |  46 ++++++
 7 files changed, 277 insertions(+), 137 deletions(-)
 delete mode 100644 test/verify/test_channelwise_conv.cpp
 create mode 100644 test/verify/test_channelwise_conv_1d.cpp
 create mode 100644 test/verify/test_channelwise_conv_depthwise.cpp
 create mode 100644 test/verify/test_channelwise_conv_depthwise_5x5.cpp
 create mode 100644 test/verify/test_channelwise_conv_large.cpp
 create mode 100644 test/verify/test_channelwise_conv_non_divisible.cpp
 create mode 100644 test/verify/test_channelwise_conv_single_channel.cpp

diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp
deleted file mode 100644
index 91731d1d2f2..00000000000
--- a/test/verify/test_channelwise_conv.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include "verify_program.hpp"
-#include <migraphx/program.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/make_op.hpp>
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_depthwise : verify_program<test_channelwise_conv_depthwise<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
-        mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_depthwise<migraphx::shape::float_type>;
-template struct test_channelwise_conv_depthwise<migraphx::shape::half_type>;
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_single_channel
-    : verify_program<test_channelwise_conv_single_channel<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
-        mm->add_instruction(migraphx::make_op("convolution"), input, weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_single_channel<migraphx::shape::float_type>;
-template struct test_channelwise_conv_single_channel<migraphx::shape::half_type>;
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_depthwise_5x5
-    : verify_program<test_channelwise_conv_depthwise_5x5<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}});
-        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::float_type>;
-template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::half_type>;
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_1d : verify_program<test_channelwise_conv_1d<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}});
-        mm->add_instruction(
-            migraphx::make_op("convolution",
-                              {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}),
-            input,
-            weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_1d<migraphx::shape::float_type>;
-template struct test_channelwise_conv_1d<migraphx::shape::half_type>;
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_large : verify_program<test_channelwise_conv_large<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}});
-        mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_large<migraphx::shape::float_type>;
-template struct test_channelwise_conv_large<migraphx::shape::half_type>;
-
-template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_non_divisible
-    : verify_program<test_channelwise_conv_non_divisible<DType>>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm     = p.get_main_module();
-        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}});
-        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
-        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
-        return p;
-    }
-    std::string section() const { return "conv"; }
-};
-template struct test_channelwise_conv_non_divisible<migraphx::shape::float_type>;
-template struct test_channelwise_conv_non_divisible<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_1d.cpp b/test/verify/test_channelwise_conv_1d.cpp
new file mode 100644
index 00000000000..e78d2095ec7
--- /dev/null
+++ b/test/verify/test_channelwise_conv_1d.cpp
@@ -0,0 +1,49 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_1d : verify_program<test_channelwise_conv_1d<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}});
+        mm->add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}),
+            input,
+            weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_1d<migraphx::shape::float_type>;
+template struct test_channelwise_conv_1d<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_depthwise.cpp b/test/verify/test_channelwise_conv_depthwise.cpp
new file mode 100644
index 00000000000..326184e9053
--- /dev/null
+++ b/test/verify/test_channelwise_conv_depthwise.cpp
@@ -0,0 +1,45 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_depthwise : verify_program<test_channelwise_conv_depthwise<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_depthwise<migraphx::shape::float_type>;
+template struct test_channelwise_conv_depthwise<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_depthwise_5x5.cpp b/test/verify/test_channelwise_conv_depthwise_5x5.cpp
new file mode 100644
index 00000000000..425fe7187a7
--- /dev/null
+++ b/test/verify/test_channelwise_conv_depthwise_5x5.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_depthwise_5x5
+    : verify_program<test_channelwise_conv_depthwise_5x5<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::float_type>;
+template struct test_channelwise_conv_depthwise_5x5<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_large.cpp b/test/verify/test_channelwise_conv_large.cpp
new file mode 100644
index 00000000000..f736fc788f5
--- /dev/null
+++ b/test/verify/test_channelwise_conv_large.cpp
@@ -0,0 +1,45 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_large : verify_program<test_channelwise_conv_large<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_large<migraphx::shape::float_type>;
+template struct test_channelwise_conv_large<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_non_divisible.cpp b/test/verify/test_channelwise_conv_non_divisible.cpp
new file mode 100644
index 00000000000..69a458c5210
--- /dev/null
+++ b/test/verify/test_channelwise_conv_non_divisible.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_non_divisible
+    : verify_program<test_channelwise_conv_non_divisible<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_non_divisible<migraphx::shape::float_type>;
+template struct test_channelwise_conv_non_divisible<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_single_channel.cpp b/test/verify/test_channelwise_conv_single_channel.cpp
new file mode 100644
index 00000000000..9d214be82ec
--- /dev/null
+++ b/test/verify/test_channelwise_conv_single_channel.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_single_channel
+    : verify_program<test_channelwise_conv_single_channel<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        mm->add_instruction(migraphx::make_op("convolution"), input, weights);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_single_channel<migraphx::shape::float_type>;
+template struct test_channelwise_conv_single_channel<migraphx::shape::half_type>;

From bcd468d5be758b0a8b99822ed4a9987e5aa84c66 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:47:51 -0600
Subject: [PATCH 60/84] Revert the reduce and index changes

---
 .../include/migraphx/kernels/index.hpp        | 31 +------------------
 .../include/migraphx/kernels/reduce.hpp       | 22 ++++++-------
 2 files changed, 10 insertions(+), 43 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index 1994d0c16c0..77da7283190 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -267,12 +267,6 @@ struct index
         }
     }
 
-    template <class F, class N>
-    __device__ void device_stride(N n, F f) const
-    {
-        for_stride<false>(_c<0>, n, _c<1>, f);
-    }
-
     template <class F, class N>
     __device__ void global_stride(N n, F f) const
     {
@@ -339,28 +333,5 @@ struct per_block
     }
 };
 
-struct per_device
-{
-    index idx;
-
-    constexpr auto local() const { return idx.global; }
-
-    constexpr auto nlocal() const { return idx.nglobal(); }
-
-    constexpr auto size() const { return _c<1>; }
-
-    template <class N, class F>
-    constexpr void group_stride(N n, F f) const
-    {
-        return idx.device_stride(n, f);
-    }
-
-    template <class N, class F>
-    constexpr void local_stride(N n, F f) const
-    {
-        return idx.global_stride(n, f);
-    }
-};
-
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_INDEX_HPP
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
index 4578feed9ea..59bf17b3eda 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -516,13 +516,12 @@ struct block
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule = per_device, class F>
+    template <class Output, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
+        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -571,13 +570,12 @@ struct block_large
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule = per_device, class F>
+    template <class Output, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        schedule.local_stride(nelements * idx.nlocal(), [&](auto i) {
+        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -650,13 +648,12 @@ struct subwave
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule = per_device, class F>
+    template <class Output, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        schedule.local_stride(nelements * idx.nlocal_subwave<SubWaveSize>(), [&](auto i) {
+        idx.global_stride(nelements * idx.nlocal_subwave<SubWaveSize>(), [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal_subwave<SubWaveSize>());
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });
@@ -712,13 +709,12 @@ struct lane
         return reducer<Slicer>{{}, idx, slicer};
     }
 
-    template <class Output, class Schedule = per_device, class F>
+    template <class Output, class F>
     static __device__ void run(F f)
     {
         auto idx                 = make_index();
-        auto schedule            = Schedule{idx};
         constexpr auto nelements = get_shape_c<Output>{}.elements();
-        schedule.local_stride(nelements, [&](auto i) {
+        idx.global_stride(nelements, [&](auto i) {
             const auto out_idx = get_shape_c<Output>{}.multi(i);
             f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
         });

From 7ba2ccac22b58646205f0059810e9b748cf2f8ee Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 22:49:52 +0000
Subject: [PATCH 61/84] Revert pooling changes

---
 .../gpu/kernels/include/migraphx/kernels/pooling.hpp      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
index 71641fd498d..76bb7c3cb6b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -186,18 +186,18 @@ constexpr window<Window, Stride, Padding> make_window(Window w, Stride s, Paddin
     return {w, s, p};
 }
 
-template <class Algo, index_int GroupSize, class Schedule = per_device, class Output, class F>
+template <class Algo, index_int GroupSize, class Output, class F>
 __device__ void pooling_reduce(Output output, F f)
 {
     if constexpr(GroupSize < 2)
     {
-        Algo::template run<decltype(output), Schedule>(
+        Algo::template run<decltype(output)>(
             [&](auto out_idx, auto r) { r.outer([&] { output[out_idx] = f(out_idx, r); }); });
     }
     else
     {
         auto goutput = as_vec<GroupSize>(output, output.get_shape().lens.size() - _c<1>);
-        Algo::template run<decltype(goutput), Schedule>([&](auto out_idx, auto r) {
+        Algo::template run<decltype(goutput)>([&](auto out_idx, auto r) {
             auto i = out_idx;
             i.back() *= GroupSize;
             auto result = vec_generate<GroupSize>([&](auto) {

From 61f6ffb4503cbfbf3a8b47d6c0e9e2547924a3f3 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:53:03 -0600
Subject: [PATCH 62/84] Use signed integer

---
 src/targets/gpu/prefuse_ops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 8a3645ab828..7dab5508394 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -261,7 +261,7 @@ struct channelwise_conv
                        x_lens.begin() + 2 + num_spatial,
                        w_lens.begin() + 2,
                        std::back_inserter(out_lens),
-                       [](auto x, auto w) { return x - w + 1; });
+                       [](std::ptrdiff_t x, std::ptrdiff_t w) { return x - w + 1; });
         return inputs[0].with_lens(out_lens);
     }
 };

From b5cad757329b54aa3a3967b61d933034603dd64f Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:57:11 -0600
Subject: [PATCH 63/84] Update year

---
 src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
index a2ae4f9c0dc..0c59388e8b3 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From 5b49459fd3a750fa04bb331b152e1173064c5f79 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Mon, 2 Mar 2026 16:57:30 -0600
Subject: [PATCH 64/84] Format

---
 .../migraphx/kernels/channelwise_conv.hpp        |  6 +++---
 .../include/migraphx/kernels/spatial_tiler.hpp   | 16 ++++++++--------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index d75cd590c18..92e60351edd 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -45,9 +45,9 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
 
     __shared__ decltype(tiler.template shared_allocate<Input>()) smem;
 
-    auto x_ch   = tiler.copy(x, smem);
-    auto w_ch   = tiler.slice(w);
-    auto out_ch = tiler.slice(output);
+    auto x_ch    = tiler.copy(x, smem);
+    auto w_ch    = tiler.slice(w);
+    auto out_ch  = tiler.slice(output);
     auto xs_pack = pack(tiler.slice(inputs)...);
 
     using t = typename Output::type;
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 0c03a986c57..9be73bc6d52 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -88,8 +88,8 @@ struct spatial_tiler
     template <class Input>
     __device__ auto shared_allocate() const
     {
-        using type                       = typename Input::type;
-        constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
+        using type        = typename Input::type;
+        constexpr auto hl = halo_lens_for<get_shape_c<Input>>();
         return uninitialized_buffer<type, hl.product()>{};
     }
 
@@ -105,10 +105,10 @@ struct spatial_tiler
     template <class Input, class Smem>
     __device__ auto copy(Input input, Smem& smem) const
     {
-        using type                       = typename Input::type;
-        constexpr auto hl                = halo_lens_for<get_shape_c<Input>>();
-        constexpr auto halo_shape        = make_shape(hl);
-        constexpr auto input_spatial     = make_slice(get_shape_c<Input>{}, keep_spatial()).lens;
+        using type                   = typename Input::type;
+        constexpr auto hl            = halo_lens_for<get_shape_c<Input>>();
+        constexpr auto halo_shape    = make_shape(hl);
+        constexpr auto input_spatial = make_slice(get_shape_c<Input>{}, keep_spatial()).lens;
 
         constexpr auto n_out  = nslices(OutputShape{}, keep_spatial());
         constexpr auto n_in   = nslices(get_shape_c<Input>{}, keep_spatial());
@@ -154,8 +154,8 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     constexpr auto block_shape = make_shape(return_array_c([] {
         auto result = tiler_type::tiles_per_dim().base();
         auto olens  = OutputShape{}.lens;
-        result[0] = olens[0];
-        result[1] = olens[1];
+        result[0]   = olens[0];
+        result[1]   = olens[1];
         return result;
     }));
     auto block_multi           = block_shape.multi(idx.group);

From 18a7efa306a00f737ce1bfdf6f3665b6231cb946 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 3 Apr 2026 16:14:31 -0500
Subject: [PATCH 65/84] Support padding

---
 src/targets/gpu/jit/channelwise_conv.cpp      |  8 ++-
 .../migraphx/kernels/channelwise_conv.hpp     | 12 ++--
 .../migraphx/kernels/spatial_tiler.hpp        | 65 +++++++++++++++++--
 src/targets/gpu/prefuse_ops.cpp               | 30 ++++++---
 4 files changed, 96 insertions(+), 19 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 60d02fab30d..6425b2a6cfd 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -50,7 +50,7 @@ extern "C" {
 MIGRAPHX_GLOBAL void ${kernel}(${params})
 {
     transform_args(make_tensors(), rotate_last())(${args})([](auto output, auto x, auto w, auto... inputs) {
-        channelwise_conv<index_ints<${tile}>, ${ntiles}>(index_ints<${tile}>{}, ${post}, output, x, w, inputs...);
+        channelwise_conv<index_ints<${tile}>, ${ntiles}>(index_ints<${tile}>{}, index_ints<${padding}>{}, ${post}, output, x, w, inputs...);
     });
 }
 
@@ -109,9 +109,15 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
+        auto full_padding = v.get("padding", std::vector<std::size_t>{});
+        std::vector<std::size_t> padding(num_spatial, 0);
+        for(std::size_t i = 0; i < num_spatial and i < full_padding.size(); i++)
+            padding[i] = full_padding[i];
+
         auto src = interpolate_string(channelwise_conv_kernel,
                                       {{"tile", to_string_range(tile_sizes)},
                                        {"ntiles", std::to_string(noutputs)},
+                                       {"padding", to_string_range(padding)},
                                        {"kernel", options.kernel_name},
                                        {"params", enum_params(inputs.size(), "void * private_p")},
                                        {"args", enum_params(inputs.size(), "private_p")},
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index 92e60351edd..c0c25e38155 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -33,15 +33,17 @@ namespace migraphx {
 
 template <class TileLens,
           index_int NTiles,
+          class Padding,
           class F,
           class Output,
           class Input,
           class Weights,
           class... Inputs>
-__device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs)
+__device__ void channelwise_conv(
+    TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs)
 {
     auto idx   = make_index();
-    auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{});
+    auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{}, Padding{});
 
     __shared__ decltype(tiler.template shared_allocate<Input>()) smem;
 
@@ -50,15 +52,15 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights
     auto out_ch  = tiler.slice(output);
     auto xs_pack = pack(tiler.slice(inputs)...);
 
-    using t = typename Output::type;
-    array<t, decltype(w_ch.get_shape().elements()){}> wregs_arr;
+    using type = typename Output::type;
+    array<type, decltype(w_ch.get_shape().elements()){}> wregs_arr;
     auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape()));
     copy(w_ch.begin(), w_ch.end(), wregs.begin());
 
     __syncthreads();
 
     tiler.for_each([&](auto out_pos, auto out_multi) {
-        t acc = 0;
+        type acc = 0;
         repeat(wregs.get_shape().elements(), [&](auto ki) {
             auto k_multi = wregs.get_shape().multi(ki);
             acc += x_ch[out_multi + k_multi] * wregs[k_multi];
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 9be73bc6d52..78c3a269304 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -33,7 +33,13 @@
 
 namespace migraphx {
 
-template <index_int NTiles, class TileLens, class OutputShape>
+template <index_int... Ps>
+constexpr bool has_nonzero(index_ints<Ps...>)
+{
+    return ((Ps != 0) or ...);
+}
+
+template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<>>
 struct spatial_tiler
 {
     static constexpr auto keep_spatial()
@@ -71,17 +77,33 @@ struct spatial_tiler
         return (out_spatial_lens() != tiles_per_dim() * output_lens());
     }
 
+    // Full-dimensional padding: (0, 0, p_h, p_w, ...)
+    static constexpr auto full_padding() { return join(index_ints<0, 0>{}, Padding{}); }
+
+    static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); }
+
     index idx;
     array<index_int, ndim()> tile_origin;
 
     // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial)
+    // With padding, the output is larger so the raw difference is too small; add padding back.
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
         constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
         constexpr auto halo_extra =
             transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
-        return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
+        if constexpr(has_conv_padding())
+        {
+            constexpr auto corrected = transform(
+                halo_extra, full_padding(), [](auto h, auto p) -> index_int { return h + p; });
+            return transform(
+                output_lens(), corrected, [](auto o, auto h) -> index_int { return o + h; });
+        }
+        else
+        {
+            return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
+        }
     }
 
     // Type for shared memory allocation
@@ -120,10 +142,22 @@ struct spatial_tiler
         idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
-            if constexpr(is_padded())
-                smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
+            if constexpr(has_conv_padding())
+            {
+                constexpr auto pad = full_padding();
+                auto input_pos     = src_pos - pad;
+                smem[i] =
+                    in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
+            }
+            else if constexpr(is_padded())
+            {
+                smem[i] =
+                    in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
+            }
             else
+            {
                 smem[i] = input_ch[src_pos];
+            }
         });
 
         return make_tensor_view(smem.data(), halo_shape);
@@ -169,5 +203,28 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
     return tiler_type{idx, tile_origin};
 }
 
+template <index_int NTiles, class TileLens, class OutputShape, class Padding>
+__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding)
+{
+    using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape, Padding>;
+
+    constexpr auto block_shape = make_shape(return_array_c([] {
+        auto result = tiler_type::tiles_per_dim().base();
+        auto olens  = OutputShape{}.lens;
+        result[0]   = olens[0];
+        result[1]   = olens[1];
+        return result;
+    }));
+    auto block_multi           = block_shape.multi(idx.group);
+    auto tile_origin = generate_array<index_int>(tiler_type::ndim(), [&](auto d) -> index_int {
+        if constexpr(d < 2)
+            return 0;
+        else
+            return block_multi[d] * tiler_type::output_lens()[d];
+    });
+
+    return tiler_type{idx, tile_origin};
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 7dab5508394..3db298930bd 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -240,13 +240,14 @@ struct find_gemm_softmax_gemm
 struct channelwise_conv
 {
     std::size_t num_spatial = 2;
+    std::vector<std::size_t> padding;
 
     std::string name() const { return "gpu::channelwise_conv"; }
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.num_spatial, "num_spatial"));
+        return pack(f(self.num_spatial, "num_spatial"), f(self.padding, "padding"));
     }
 
     shape compute_shape(std::vector<shape> inputs) const
@@ -257,11 +258,15 @@ struct channelwise_conv
         std::vector<std::size_t> out_lens;
         out_lens.push_back(x_lens[0]);
         out_lens.push_back(w_lens[0]);
-        std::transform(x_lens.begin() + 2,
-                       x_lens.begin() + 2 + num_spatial,
-                       w_lens.begin() + 2,
-                       std::back_inserter(out_lens),
-                       [](std::ptrdiff_t x, std::ptrdiff_t w) { return x - w + 1; });
+        for(std::size_t i = 0; i < num_spatial; i++)
+        {
+            std::size_t total_pad = 0;
+            if(i < padding.size())
+                total_pad += padding[i];
+            if(i + num_spatial < padding.size())
+                total_pad += padding[i + num_spatial];
+            out_lens.push_back(x_lens[i + 2] + total_pad - w_lens[i + 2] + 1);
+        }
         return inputs[0].with_lens(out_lens);
     }
 };
@@ -274,8 +279,6 @@ MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins)
     auto v = ins->get_operator().to_value();
     if(not all_of(v.at("stride"), [](const value& x) { return x.to<std::size_t>() == 1; }))
         return false;
-    if(not all_of(v.at("padding"), [](const value& x) { return x.to<std::size_t>() == 0; }))
-        return false;
     if(not all_of(v.at("dilation"), [](const value& x) { return x.to<std::size_t>() == 1; }))
         return false;
     auto w_lens = ins->inputs().back()->get_shape().lens();
@@ -301,7 +304,16 @@ struct find_channelwise_convolution
         if(input->get_shape().type() != shape::float_type)
             return;
 
-        m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights);
+        auto v        = ins->get_operator().to_value();
+        auto pad_vals = v.at("padding");
+        std::vector<std::size_t> padding;
+        std::transform(pad_vals.begin(),
+                       pad_vals.end(),
+                       std::back_inserter(padding),
+                       [](const value& x) { return x.to<std::size_t>(); });
+
+        m.replace_instruction(
+            ins, channelwise_conv{num_spatial, std::move(padding)}, input, weights);
     }
 };
 

From c23a8e8f2d111b228ae998d67ea1a0c6b792d9c4 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 3 Apr 2026 16:14:36 -0500
Subject: [PATCH 66/84] Format

---
 .../kernels/include/migraphx/kernels/channelwise_conv.hpp   | 4 ++--
 .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp  | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
index c0c25e38155..be186ecf91e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp
@@ -39,8 +39,8 @@ template <class TileLens,
           class Input,
           class Weights,
           class... Inputs>
-__device__ void channelwise_conv(
-    TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs)
+__device__ void
+channelwise_conv(TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs)
 {
     auto idx   = make_index();
     auto tiler = make_spatial_tiler<NTiles>(idx, TileLens{}, get_shape_c<Output>{}, Padding{});
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 78c3a269304..dc0db118752 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -146,13 +146,11 @@ struct spatial_tiler
             {
                 constexpr auto pad = full_padding();
                 auto input_pos     = src_pos - pad;
-                smem[i] =
-                    in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
+                smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
             }
             else if constexpr(is_padded())
             {
-                smem[i] =
-                    in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
+                smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
             }
             else
             {

From 747292cc8cf4ba12bfe860e486232e6e786e1c0f Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 3 Apr 2026 17:59:04 -0500
Subject: [PATCH 67/84] Fix selection

---
 .../include/migraphx/kernels/spatial_tiler.hpp    | 15 ++++++++-------
 src/targets/gpu/target.cpp                        |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index dc0db118752..3734d443f4c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -90,18 +90,19 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
-        constexpr auto halo_extra =
-            transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
         if constexpr(has_conv_padding())
         {
-            constexpr auto corrected = transform(
-                halo_extra, full_padding(), [](auto h, auto p) -> index_int { return h + p; });
-            return transform(
-                output_lens(), corrected, [](auto o, auto h) -> index_int { return o + h; });
+            constexpr auto halo_extra = return_array_c([] {
+                return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
+                       full_padding();
+            });
+            return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
         }
         else
         {
+            constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
+            constexpr auto halo_extra    = transform(
+                input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
             return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
         }
     }
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index dfcd0797d1e..39649f8e4a6 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -135,7 +135,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         fuse_horizontal{},
         dead_code_elimination{},
-        prefuse_ops{},
+        prefuse_ops{&ctx},
         dead_code_elimination{},
         dead_code_elimination{},
         rewrite_reduce{},

From ad9b8d1434f32e820a94b340d314cf3cc00a9fe3 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 3 Apr 2026 21:57:43 -0500
Subject: [PATCH 68/84] Fix padding

---
 src/targets/gpu/jit/channelwise_conv.cpp      |  7 ++--
 .../migraphx/kernels/spatial_tiler.hpp        | 35 +++++++++++++++----
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp
index 6425b2a6cfd..7fc4b679138 100644
--- a/src/targets/gpu/jit/channelwise_conv.cpp
+++ b/src/targets/gpu/jit/channelwise_conv.cpp
@@ -109,10 +109,9 @@ struct channelwise_conv_compiler : compiler<channelwise_conv_compiler>
 
         options.set_launch_params(v, num_blocks * block_size, block_size);
 
-        auto full_padding = v.get("padding", std::vector<std::size_t>{});
-        std::vector<std::size_t> padding(num_spatial, 0);
-        for(std::size_t i = 0; i < num_spatial and i < full_padding.size(); i++)
-            padding[i] = full_padding[i];
+        auto padding = v.get("padding", std::vector<std::size_t>{});
+        if(padding.size() < 2 * num_spatial)
+            padding.resize(2 * num_spatial, 0);
 
         auto src = interpolate_string(channelwise_conv_kernel,
                                       {{"tile", to_string_range(tile_sizes)},
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 3734d443f4c..52e88a9e43b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -77,16 +77,39 @@ struct spatial_tiler
         return (out_spatial_lens() != tiles_per_dim() * output_lens());
     }
 
-    // Full-dimensional padding: (0, 0, p_h, p_w, ...)
-    static constexpr auto full_padding() { return join(index_ints<0, 0>{}, Padding{}); }
-
     static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); }
 
+    // Left (begin) padding per dim: (0, 0, left_h, left_w)
+    static constexpr auto left_padding()
+    {
+        return return_array_c([] {
+            constexpr auto p  = Padding{};
+            constexpr auto ns = p.size() / 2;
+            auto result       = array<index_int, ns + 2>(index_int{0});
+            for(index_int i = 0; i < ns; i++)
+                result[i + 2] = p[i];
+            return result;
+        });
+    }
+
+    // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w)
+    static constexpr auto total_padding()
+    {
+        return return_array_c([] {
+            constexpr auto p  = Padding{};
+            constexpr auto ns = p.size() / 2;
+            auto result       = array<index_int, ns + 2>(index_int{0});
+            for(index_int i = 0; i < ns; i++)
+                result[i + 2] = p[i] + p[i + ns];
+            return result;
+        });
+    }
+
     index idx;
     array<index_int, ndim()> tile_origin;
 
     // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial)
-    // With padding, the output is larger so the raw difference is too small; add padding back.
+    // With padding, the output is larger so the raw difference is too small; add total padding.
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
@@ -94,7 +117,7 @@ struct spatial_tiler
         {
             constexpr auto halo_extra = return_array_c([] {
                 return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
-                       full_padding();
+                       total_padding();
             });
             return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
         }
@@ -145,7 +168,7 @@ struct spatial_tiler
             auto src_pos    = tile_origin + halo_multi;
             if constexpr(has_conv_padding())
             {
-                constexpr auto pad = full_padding();
+                constexpr auto pad = left_padding();
                 auto input_pos     = src_pos - pad;
                 smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
             }

From 77dac357a4eae67df73e545fc22c31aa7766ffac Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 3 Apr 2026 22:24:08 -0500
Subject: [PATCH 69/84] Cleanup

---
 .../migraphx/kernels/spatial_tiler.hpp        | 58 ++++++-------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 52e88a9e43b..72ca68deac2 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -113,21 +113,22 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        if constexpr(has_conv_padding())
-        {
-            constexpr auto halo_extra = return_array_c([] {
-                return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
-                       total_padding();
-            });
-            return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
-        }
-        else
-        {
-            constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
-            constexpr auto halo_extra    = transform(
-                input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
-            return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
-        }
+        constexpr auto halo_extra = [] {
+            if constexpr(has_conv_padding())
+            {
+                return return_array_c([] {
+                    return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
+                           total_padding();
+                });
+            }
+            else
+            {
+                constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
+                return transform(
+                    input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
+            }
+        }();
+        return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 
     // Type for shared memory allocation
@@ -202,31 +203,8 @@ struct spatial_tiler
     }
 };
 
-template <index_int NTiles, class TileLens, class OutputShape>
-__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape)
-{
-    using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape>;
-
-    constexpr auto block_shape = make_shape(return_array_c([] {
-        auto result = tiler_type::tiles_per_dim().base();
-        auto olens  = OutputShape{}.lens;
-        result[0]   = olens[0];
-        result[1]   = olens[1];
-        return result;
-    }));
-    auto block_multi           = block_shape.multi(idx.group);
-    auto tile_origin = generate_array<index_int>(tiler_type::ndim(), [&](auto d) -> index_int {
-        if constexpr(d < 2)
-            return 0;
-        else
-            return block_multi[d] * tiler_type::output_lens()[d];
-    });
-
-    return tiler_type{idx, tile_origin};
-}
-
-template <index_int NTiles, class TileLens, class OutputShape, class Padding>
-__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding)
+template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<>>
+__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding = {})
 {
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape, Padding>;
 

From c47b394b60b2ab70caa24aa4950a0a976e7c30d1 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 18:32:31 -0500
Subject: [PATCH 70/84] Use generate_array instead

---
 .../include/migraphx/kernels/array.hpp        |  6 ++++
 .../migraphx/kernels/spatial_tiler.hpp        | 28 +++++++++----------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
index 6c87fb2ad86..9a5e6432a25 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -360,6 +360,12 @@ constexpr auto make_const_array(T x, Ts... xs)
     return integral_const_array<typename T::value_type, x, xs...>{};
 }
 
+template <class T, class N, class F> // T is not referenced below; callers pass it explicitly
+constexpr auto generate_const_array(N n, F f)
+{
+    return sequence_c<n>([f](auto... idxs) { return make_const_array(f(idxs)...); });
+}
+
 template <class T, class N, class F>
 constexpr auto generate_array(N n, F f)
 {
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 72ca68deac2..ffeb67e2267 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -82,26 +82,26 @@ struct spatial_tiler
     // Left (begin) padding per dim: (0, 0, left_h, left_w)
     static constexpr auto left_padding()
     {
-        return return_array_c([] {
-            constexpr auto p  = Padding{};
-            constexpr auto ns = p.size() / 2;
-            auto result       = array<index_int, ns + 2>(index_int{0});
-            for(index_int i = 0; i < ns; i++)
-                result[i + 2] = p[i];
-            return result;
+        constexpr auto p  = Padding{};
+        constexpr auto ns = p.size() / 2;
+        return generate_const_array<index_int>(ns + 2, [](auto i) {
+            if(i < 2)
+                return index_int{0};
+            else
+                return p[i - 2];
         });
     }
 
     // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w)
     static constexpr auto total_padding()
     {
-        return return_array_c([] {
-            constexpr auto p  = Padding{};
-            constexpr auto ns = p.size() / 2;
-            auto result       = array<index_int, ns + 2>(index_int{0});
-            for(index_int i = 0; i < ns; i++)
-                result[i + 2] = p[i] + p[i + ns];
-            return result;
+        constexpr auto p  = Padding{};
+        constexpr auto ns = p.size() / 2;
+        return generate_const_array<index_int>(ns + 2, [](auto i) {
+            if(i < 2)
+                return index_int{0};
+            else
+                return p[i - 2] + p[i - 2 + ns];
         });
     }
 

From 604d408d462e489ed56a35d86dd632df02d8087b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:13:04 -0500
Subject: [PATCH 71/84] Use generate array

---
 .../migraphx/kernels/integral_constant.hpp    |  3 ++
 .../migraphx/kernels/spatial_tiler.hpp        | 46 +++++++++++--------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
index 74a4aa51cb5..e444ebd7107 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
@@ -131,6 +131,9 @@ struct is_integral_constant<integral_constant<T, V>> : true_type
 template <index_int N>
 using index_constant = integral_constant<index_int, N>;
 
+template<index_int N>
+static constexpr auto index_c = index_constant<N>{};
+
 template <auto V>
 static constexpr auto _c = integral_constant<decltype(V), V>{}; // NOLINT
 
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index ffeb67e2267..2e756c18a81 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -79,29 +79,37 @@ struct spatial_tiler
 
     static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); }
 
+    static constexpr auto get_padding() 
+    {
+        if constexpr(Padding{}.empty())
+            return transform(TileLens{}, [](auto) { return index_int{0}; });
+        else
+            return Padding{};
+    }
+
     // Left (begin) padding per dim: (0, 0, left_h, left_w)
     static constexpr auto left_padding()
     {
-        constexpr auto p  = Padding{};
+        constexpr auto p  = get_padding();
         constexpr auto ns = p.size() / 2;
-        return generate_const_array<index_int>(ns + 2, [](auto i) {
-            if(i < 2)
-                return index_int{0};
+        return generate_const_array<index_int>(_c<ns + 2>, [&](auto i) {
+            if constexpr(i < 2)
+                return index_c<0>;
             else
-                return p[i - 2];
+                return index_c<p[i - 2]>;
         });
     }
 
     // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w)
     static constexpr auto total_padding()
     {
-        constexpr auto p  = Padding{};
+        constexpr auto p  = get_padding();
         constexpr auto ns = p.size() / 2;
-        return generate_const_array<index_int>(ns + 2, [](auto i) {
-            if(i < 2)
-                return index_int{0};
+        return generate_const_array<index_int>(_c<ns + 2>, [&](auto i) {
+            if constexpr(i < 2)
+                return index_c<0>;
             else
-                return p[i - 2] + p[i - 2 + ns];
+                return index_c<p[i - 2] + p[i - 2 + ns]>;
         });
     }
 
@@ -114,19 +122,19 @@ struct spatial_tiler
     static constexpr auto halo_lens_for()
     {
         constexpr auto halo_extra = [] {
-            if constexpr(has_conv_padding())
-            {
+            // if constexpr(has_conv_padding())
+            // {
                 return return_array_c([] {
                     return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
                            total_padding();
                 });
-            }
-            else
-            {
-                constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
-                return transform(
-                    input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
-            }
+            // }
+            // else
+            // {
+            //     constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
+            //     return transform(
+            //         input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
+            // }
         }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }

From 5fc446ab1a784dba37e6295a337a3d2a8ce97a0e Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:13:13 -0500
Subject: [PATCH 72/84] Format

---
 .../migraphx/kernels/integral_constant.hpp    |  2 +-
 .../migraphx/kernels/spatial_tiler.hpp        | 26 ++++++++++---------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
index e444ebd7107..9d48717bbd8 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
@@ -131,7 +131,7 @@ struct is_integral_constant<integral_constant<T, V>> : true_type
 template <index_int N>
 using index_constant = integral_constant<index_int, N>;
 
-template<index_int N>
+template <index_int N>
 static constexpr auto index_c = index_constant<N>{};
 
 template <auto V>
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 2e756c18a81..e4f457ac441 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -79,7 +79,7 @@ struct spatial_tiler
 
     static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); }
 
-    static constexpr auto get_padding() 
+    static constexpr auto get_padding()
     {
         if constexpr(Padding{}.empty())
             return transform(TileLens{}, [](auto) { return index_int{0}; });
@@ -121,21 +121,23 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto halo_extra = [] {
-            // if constexpr(has_conv_padding())
-            // {
+        constexpr auto halo_extra =
+            [] {
+                // if constexpr(has_conv_padding())
+                // {
                 return return_array_c([] {
                     return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
                            total_padding();
                 });
-            // }
-            // else
-            // {
-            //     constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
-            //     return transform(
-            //         input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; });
-            // }
-        }();
+                // }
+                // else
+                // {
+                //     constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
+                //     return transform(
+                //         input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os;
+                //         });
+                // }
+            }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 

From 21442c45de0e2c880bc2df9ef741cbd183cedf1b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:14:48 -0500
Subject: [PATCH 73/84] Add padding tests

---
 test/verify/test_channelwise_conv_padding.cpp | 46 +++++++++++++++++
 .../test_channelwise_conv_padding_1d.cpp      | 50 +++++++++++++++++++
 .../test_channelwise_conv_padding_5x5.cpp     | 47 +++++++++++++++++
 ...channelwise_conv_padding_non_divisible.cpp | 47 +++++++++++++++++
 .../test_channelwise_conv_padding_relu.cpp    | 48 ++++++++++++++++++
 5 files changed, 238 insertions(+)
 create mode 100644 test/verify/test_channelwise_conv_padding.cpp
 create mode 100644 test/verify/test_channelwise_conv_padding_1d.cpp
 create mode 100644 test/verify/test_channelwise_conv_padding_5x5.cpp
 create mode 100644 test/verify/test_channelwise_conv_padding_non_divisible.cpp
 create mode 100644 test/verify/test_channelwise_conv_padding_relu.cpp

diff --git a/test/verify/test_channelwise_conv_padding.cpp b/test/verify/test_channelwise_conv_padding.cpp
new file mode 100644
index 00000000000..fa38209e455
--- /dev/null
+++ b/test/verify/test_channelwise_conv_padding.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_padding : verify_program<test_channelwise_conv_padding<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog; // group == channel count (4), 3x3 kernel, padding {1, 1}
+        auto* main_mod = prog.get_main_module();
+        auto x         = main_mod->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}});
+        auto w         = main_mod->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}});
+        auto conv_op   = migraphx::make_op("convolution", {{"group", 4}, {"padding", {1, 1}}});
+        main_mod->add_instruction(conv_op, x, w);
+        return prog;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_padding<migraphx::shape::float_type>;
+template struct test_channelwise_conv_padding<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_padding_1d.cpp b/test/verify/test_channelwise_conv_padding_1d.cpp
new file mode 100644
index 00000000000..6094fc98bb7
--- /dev/null
+++ b/test/verify/test_channelwise_conv_padding_1d.cpp
@@ -0,0 +1,50 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_padding_1d :
+    verify_program<test_channelwise_conv_padding_1d<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog; // one spatial dim of length 16; group == channel count (4)
+        auto* main_mod = prog.get_main_module();
+        auto x         = main_mod->add_parameter("x", migraphx::shape{DType, {2, 4, 16}});
+        auto w         = main_mod->add_parameter("w", migraphx::shape{DType, {4, 1, 3}});
+        // Every per-dimension attribute is spelled out as a one-element list.
+        auto conv_op = migraphx::make_op(
+            "convolution",
+            {{"padding", {1}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}});
+        main_mod->add_instruction(conv_op, x, w);
+        return prog;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_padding_1d<migraphx::shape::float_type>;
+template struct test_channelwise_conv_padding_1d<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_padding_5x5.cpp b/test/verify/test_channelwise_conv_padding_5x5.cpp
new file mode 100644
index 00000000000..49652dd5ab4
--- /dev/null
+++ b/test/verify/test_channelwise_conv_padding_5x5.cpp
@@ -0,0 +1,47 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_padding_5x5 :
+    verify_program<test_channelwise_conv_padding_5x5<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog; // group == channel count (8), 5x5 kernel, padding {2, 2}
+        auto* main_mod = prog.get_main_module();
+        auto x         = main_mod->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
+        auto w         = main_mod->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}});
+        auto conv_op   = migraphx::make_op("convolution", {{"group", 8}, {"padding", {2, 2}}});
+        main_mod->add_instruction(conv_op, x, w);
+        return prog;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_padding_5x5<migraphx::shape::float_type>;
+template struct test_channelwise_conv_padding_5x5<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_padding_non_divisible.cpp b/test/verify/test_channelwise_conv_padding_non_divisible.cpp
new file mode 100644
index 00000000000..e2d643575bf
--- /dev/null
+++ b/test/verify/test_channelwise_conv_padding_non_divisible.cpp
@@ -0,0 +1,47 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_padding_non_divisible :
+    verify_program<test_channelwise_conv_padding_non_divisible<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog; // 30x30 spatial dims, 3x3 kernel, padding {1, 1}, group 8
+        auto* main_mod = prog.get_main_module();
+        auto x         = main_mod->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}});
+        auto w         = main_mod->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
+        auto conv_op   = migraphx::make_op("convolution", {{"group", 8}, {"padding", {1, 1}}});
+        main_mod->add_instruction(conv_op, x, w);
+        return prog;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_padding_non_divisible<migraphx::shape::float_type>;
+template struct test_channelwise_conv_padding_non_divisible<migraphx::shape::half_type>;
diff --git a/test/verify/test_channelwise_conv_padding_relu.cpp b/test/verify/test_channelwise_conv_padding_relu.cpp
new file mode 100644
index 00000000000..fe1de7b8fa2
--- /dev/null
+++ b/test/verify/test_channelwise_conv_padding_relu.cpp
@@ -0,0 +1,48 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_channelwise_conv_padding_relu :
+    verify_program<test_channelwise_conv_padding_relu<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm     = p.get_main_module();
+        auto input   = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}});
+        auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}});
+        auto conv    = mm->add_instruction(
+            migraphx::make_op("convolution", {{"group", 8}, {"padding", {1, 1}}}), input, weights);
+        mm->add_instruction(migraphx::make_op("relu"), conv);
+        return p;
+    }
+    std::string section() const { return "conv"; }
+};
+template struct test_channelwise_conv_padding_relu<migraphx::shape::float_type>;
+template struct test_channelwise_conv_padding_relu<migraphx::shape::half_type>;

From 371f79b13e77df85303bc93cb1196a725a3f45ca Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:14:51 -0500
Subject: [PATCH 74/84] Format

---
 test/verify/test_channelwise_conv_padding_1d.cpp            | 3 +--
 test/verify/test_channelwise_conv_padding_5x5.cpp           | 3 +--
 test/verify/test_channelwise_conv_padding_non_divisible.cpp | 4 ++--
 test/verify/test_channelwise_conv_padding_relu.cpp          | 4 ++--
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/test/verify/test_channelwise_conv_padding_1d.cpp b/test/verify/test_channelwise_conv_padding_1d.cpp
index 6094fc98bb7..7e4c5f3d170 100644
--- a/test/verify/test_channelwise_conv_padding_1d.cpp
+++ b/test/verify/test_channelwise_conv_padding_1d.cpp
@@ -28,8 +28,7 @@
 #include <migraphx/make_op.hpp>
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_padding_1d :
-    verify_program<test_channelwise_conv_padding_1d<DType>>
+struct test_channelwise_conv_padding_1d : verify_program<test_channelwise_conv_padding_1d<DType>>
 {
     migraphx::program create_program() const
     {
diff --git a/test/verify/test_channelwise_conv_padding_5x5.cpp b/test/verify/test_channelwise_conv_padding_5x5.cpp
index 49652dd5ab4..4fcf2ce218b 100644
--- a/test/verify/test_channelwise_conv_padding_5x5.cpp
+++ b/test/verify/test_channelwise_conv_padding_5x5.cpp
@@ -28,8 +28,7 @@
 #include <migraphx/make_op.hpp>
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_padding_5x5 :
-    verify_program<test_channelwise_conv_padding_5x5<DType>>
+struct test_channelwise_conv_padding_5x5 : verify_program<test_channelwise_conv_padding_5x5<DType>>
 {
     migraphx::program create_program() const
     {
diff --git a/test/verify/test_channelwise_conv_padding_non_divisible.cpp b/test/verify/test_channelwise_conv_padding_non_divisible.cpp
index e2d643575bf..4a1fdde33cf 100644
--- a/test/verify/test_channelwise_conv_padding_non_divisible.cpp
+++ b/test/verify/test_channelwise_conv_padding_non_divisible.cpp
@@ -28,8 +28,8 @@
 #include <migraphx/make_op.hpp>
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_padding_non_divisible :
-    verify_program<test_channelwise_conv_padding_non_divisible<DType>>
+struct test_channelwise_conv_padding_non_divisible
+    : verify_program<test_channelwise_conv_padding_non_divisible<DType>>
 {
     migraphx::program create_program() const
     {
diff --git a/test/verify/test_channelwise_conv_padding_relu.cpp b/test/verify/test_channelwise_conv_padding_relu.cpp
index fe1de7b8fa2..2d934d39ac0 100644
--- a/test/verify/test_channelwise_conv_padding_relu.cpp
+++ b/test/verify/test_channelwise_conv_padding_relu.cpp
@@ -28,8 +28,8 @@
 #include <migraphx/make_op.hpp>
 
 template <migraphx::shape::type_t DType>
-struct test_channelwise_conv_padding_relu :
-    verify_program<test_channelwise_conv_padding_relu<DType>>
+struct test_channelwise_conv_padding_relu
+    : verify_program<test_channelwise_conv_padding_relu<DType>>
 {
     migraphx::program create_program() const
     {

From 8949117b741c15b16388bbf711c5b02687d27220 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:35:50 -0500
Subject: [PATCH 75/84] Update is_padded() check

---
 .../migraphx/kernels/spatial_tiler.hpp        | 33 +++++--------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index e4f457ac441..8591152bb4b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -72,13 +72,6 @@ struct spatial_tiler
     static constexpr index_int tiles_total() { return tiles_per_dim().product(); }
     static constexpr auto ndim() { return out_spatial_lens().size(); }
 
-    static constexpr bool is_padded()
-    {
-        return (out_spatial_lens() != tiles_per_dim() * output_lens());
-    }
-
-    static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); }
-
     static constexpr auto get_padding()
     {
         if constexpr(Padding{}.empty())
@@ -113,6 +106,11 @@ struct spatial_tiler
         });
     }
 
+    static constexpr bool is_padded()
+    {
+        return (out_spatial_lens() != (tiles_per_dim() * output_lens() + total_padding()));
+    }
+
     index idx;
     array<index_int, ndim()> tile_origin;
 
@@ -123,20 +121,10 @@ struct spatial_tiler
     {
         constexpr auto halo_extra =
             [] {
-                // if constexpr(has_conv_padding())
-                // {
                 return return_array_c([] {
                     return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
                            total_padding();
                 });
-                // }
-                // else
-                // {
-                //     constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens;
-                //     return transform(
-                //         input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os;
-                //         });
-                // }
             }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
@@ -177,19 +165,14 @@ struct spatial_tiler
         idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
-            if constexpr(has_conv_padding())
+            auto input_pos     = src_pos - left_padding();
+            if constexpr(is_padded())
             {
-                constexpr auto pad = left_padding();
-                auto input_pos     = src_pos - pad;
                 smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
             }
-            else if constexpr(is_padded())
-            {
-                smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0};
-            }
             else
             {
-                smem[i] = input_ch[src_pos];
+                smem[i] = input_ch[input_pos];
             }
         });
 

From be32bda8ac8f969ba7f9cf8937a504d6c71b557c Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:35:54 -0500
Subject: [PATCH 76/84] Format

---
 .../include/migraphx/kernels/spatial_tiler.hpp    | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 8591152bb4b..6c8a9a0f125 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -119,13 +119,12 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto halo_extra =
-            [] {
-                return return_array_c([] {
-                    return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
-                           total_padding();
-                });
-            }();
+        constexpr auto halo_extra = [] {
+            return return_array_c([] {
+                return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
+                       total_padding();
+            });
+        }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 
@@ -165,7 +164,7 @@ struct spatial_tiler
         idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
-            auto input_pos     = src_pos - left_padding();
+            auto input_pos  = src_pos - left_padding();
             if constexpr(is_padded())
             {
                 smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};

From 4f5221e8fdfd73e24f583336c2009140e56d1da6 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:57:35 -0500
Subject: [PATCH 77/84] Add unit tests

---
 .../migraphx/kernels/spatial_tiler.hpp        |  28 +-
 test/gpu/kernels/spatial_tiler.cpp            | 329 ++++++++++++++++++
 2 files changed, 346 insertions(+), 11 deletions(-)
 create mode 100644 test/gpu/kernels/spatial_tiler.cpp

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 6c8a9a0f125..9c98a269ee7 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -39,7 +39,7 @@ constexpr bool has_nonzero(index_ints<Ps...>)
     return ((Ps != 0) or ...);
 }
 
-template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<>>
+template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<0>>
 struct spatial_tiler
 {
     static constexpr auto keep_spatial()
@@ -74,10 +74,15 @@ struct spatial_tiler
 
     static constexpr auto get_padding()
     {
-        if constexpr(Padding{}.empty())
-            return transform(TileLens{}, [](auto) { return index_int{0}; });
+        if constexpr(Padding{}.size() < 2) 
+        {
+            auto pre = transform(TileLens{}, [](auto) { return index_c<0>; });
+            return join(pre, pre);
+        }
         else
+        {
             return Padding{};
+        }
     }
 
     // Left (begin) padding per dim: (0, 0, left_h, left_w)
@@ -119,12 +124,13 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto halo_extra = [] {
-            return return_array_c([] {
-                return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
-                       total_padding();
-            });
-        }();
+        constexpr auto halo_extra =
+            [] {
+                return return_array_c([] {
+                    return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
+                           total_padding();
+                });
+            }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 
@@ -164,7 +170,7 @@ struct spatial_tiler
         idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
-            auto input_pos  = src_pos - left_padding();
+            auto input_pos     = src_pos - left_padding();
             if constexpr(is_padded())
             {
                 smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
@@ -195,7 +201,7 @@ struct spatial_tiler
     }
 };
 
-template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<>>
+template <index_int NTiles, class TileLens, class OutputShape, class Padding = index_ints<0>>
 __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding = {})
 {
     using tiler_type = spatial_tiler<NTiles, TileLens, OutputShape, Padding>;
diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp
new file mode 100644
index 00000000000..dd0d5950c4e
--- /dev/null
+++ b/test/gpu/kernels/spatial_tiler.cpp
@@ -0,0 +1,329 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#include <migraphx/kernels/spatial_tiler.hpp>
+#include <migraphx/kernels/test.hpp>
+
+// Helper: create a standard 4D shape from lens
+template <migraphx::index_int N, migraphx::index_int C, migraphx::index_int H, migraphx::index_int W>
+constexpr auto make_4d_shape()
+{
+    constexpr auto lens = migraphx::index_ints<N, C, H, W>{};
+    return migraphx::make_shape(lens);
+}
+
+// ======== output_lens ========
+
+// Tile {4, 4} with NTiles=1 → output_lens = {1, 1, 4, 4}
+TEST_CASE(output_lens_ntiles_1)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    constexpr auto ol = tiler::output_lens();
+    EXPECT(ol.size() == 4);
+    EXPECT(ol[0] == 1);
+    EXPECT(ol[1] == 1);
+    EXPECT(ol[2] == 4);
+    EXPECT(ol[3] == 4);
+}
+
+// Tile {4, 4} with NTiles=2 → last dim doubled: {1, 1, 4, 8}
+TEST_CASE(output_lens_ntiles_2)
+{
+    using tiler = migraphx::spatial_tiler<2,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    constexpr auto ol = tiler::output_lens();
+    EXPECT(ol[2] == 4);
+    EXPECT(ol[3] == 8);
+}
+
+// ======== out_spatial_lens ========
+
+TEST_CASE(out_spatial_lens_basic)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<2, 3, 16, 16>())>;
+    constexpr auto sl = tiler::out_spatial_lens();
+    // keep_spatial sets dims 0,1 to 1; keeps H,W
+    EXPECT(sl[0] == 1);
+    EXPECT(sl[1] == 1);
+    EXPECT(sl[2] == 16);
+    EXPECT(sl[3] == 16);
+}
+
+// ======== tiles_per_dim ========
+
+// 8x8 output, 4x4 tile, NTiles=1 → ceil(8/4)=2 per spatial dim
+TEST_CASE(tiles_per_dim_exact)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    constexpr auto tpd = tiler::tiles_per_dim();
+    EXPECT(tpd[2] == 2);
+    EXPECT(tpd[3] == 2);
+}
+
+// 10x10 output, 4x4 tile → ceil(10/4)=3 per spatial dim
+TEST_CASE(tiles_per_dim_inexact)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 10, 10>())>;
+    constexpr auto tpd = tiler::tiles_per_dim();
+    EXPECT(tpd[2] == 3);
+    EXPECT(tpd[3] == 3);
+}
+
+// NTiles=2 scales last dim: tile output is {4, 8} → ceil(16/4)=4, ceil(16/8)=2
+TEST_CASE(tiles_per_dim_ntiles)
+{
+    using tiler = migraphx::spatial_tiler<2,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 16, 16>())>;
+    constexpr auto tpd = tiler::tiles_per_dim();
+    EXPECT(tpd[2] == 4);
+    EXPECT(tpd[3] == 2);
+}
+
+// ======== tiles_total ========
+
+TEST_CASE(tiles_total_exact)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    // tiles_per_dim = {1, 1, 2, 2}, product = 4
+    EXPECT(tiler::tiles_total() == 4);
+}
+
+// ======== get_padding / left_padding / total_padding ========
+
+// No Padding arg → get_padding returns zeros matching TileLens size
+TEST_CASE(get_padding_default)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    constexpr auto gp = tiler::get_padding();
+    EXPECT(gp.size() == 4);
+    EXPECT(gp[0] == 0);
+    EXPECT(gp[1] == 0);
+    EXPECT(gp[2] == 0);
+    EXPECT(gp[3] == 0);
+}
+
+// No padding template arg → all zeros
+TEST_CASE(padding_default_no_padding)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    constexpr auto lp = tiler::left_padding();
+    constexpr auto tp = tiler::total_padding();
+    EXPECT(lp[0] == 0);
+    EXPECT(lp[1] == 0);
+    EXPECT(lp[2] == 0);
+    EXPECT(lp[3] == 0);
+    EXPECT(tp[0] == 0);
+    EXPECT(tp[1] == 0);
+    EXPECT(tp[2] == 0);
+    EXPECT(tp[3] == 0);
+}
+
+// Symmetric padding {1, 1, 1, 1} → left={0,0,1,1}, total={0,0,2,2}
+TEST_CASE(padding_symmetric)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>()),
+                                          migraphx::index_ints<1, 1, 1, 1>>;
+    constexpr auto lp = tiler::left_padding();
+    EXPECT(lp[0] == 0);
+    EXPECT(lp[1] == 0);
+    EXPECT(lp[2] == 1);
+    EXPECT(lp[3] == 1);
+
+    constexpr auto tp = tiler::total_padding();
+    EXPECT(tp[0] == 0);
+    EXPECT(tp[1] == 0);
+    EXPECT(tp[2] == 2);
+    EXPECT(tp[3] == 2);
+}
+
+// Asymmetric padding {1, 2, 3, 4} → left={0,0,1,2}, total={0,0,1+3,2+4}={0,0,4,6}
+TEST_CASE(padding_asymmetric)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>()),
+                                          migraphx::index_ints<1, 2, 3, 4>>;
+    constexpr auto lp = tiler::left_padding();
+    EXPECT(lp[2] == 1);
+    EXPECT(lp[3] == 2);
+
+    constexpr auto tp = tiler::total_padding();
+    EXPECT(tp[2] == 4);
+    EXPECT(tp[3] == 6);
+}
+
+// ======== is_padded ========
+
+// Tiles exactly cover output, no conv padding → not padded
+TEST_CASE(is_padded_exact_no_padding)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    EXPECT(not tiler::is_padded());
+}
+
+// Tiles don't exactly cover output (10 not divisible by 4) → padded
+TEST_CASE(is_padded_overhang)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 10, 10>())>;
+    EXPECT(tiler::is_padded());
+}
+
+// Tiles exactly cover output but conv padding present → padded
+TEST_CASE(is_padded_conv_padding_exact_tiles)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>()),
+                                          migraphx::index_ints<1, 1, 1, 1>>;
+    EXPECT(tiler::is_padded());
+}
+
+// Both overhang and conv padding → padded
+TEST_CASE(is_padded_overhang_and_conv_padding)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 10, 10>()),
+                                          migraphx::index_ints<1, 1, 1, 1>>;
+    EXPECT(tiler::is_padded());
+}
+
+// Edge case: tile overhang equals total padding → still padded
+// out_spatial=10, tile=8, tiles_per_dim=2, tiles*tile=16, total_pad=6
+// Without the fix: 10 != 16 → padded (only by coincidence).
+// With total_padding in formula: 10 != 16+6=22 → padded.
+TEST_CASE(is_padded_overhang_equals_padding)
+{
+    // tiles_per_dim = ceil(10/8) = 2, coverage = 16, total_pad_h=3+3=6
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<8, 8>,
+                                          decltype(make_4d_shape<1, 1, 10, 10>()),
+                                          migraphx::index_ints<3, 3, 3, 3>>;
+    EXPECT(tiler::is_padded());
+}
+
+// Only one spatial dim has overhang
+TEST_CASE(is_padded_partial_overhang)
+{
+    // H=8 exactly tiled by tile_h=4. W=10 not divisible by tile_w=4.
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 10>())>;
+    EXPECT(tiler::is_padded());
+}
+
+// Large padding values
+TEST_CASE(is_padded_large_padding)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>()),
+                                          migraphx::index_ints<3, 3, 3, 3>>;
+    EXPECT(tiler::is_padded());
+}
+
+// ======== has_nonzero ========
+
+TEST_CASE(has_nonzero_all_zero)
+{
+    EXPECT(not migraphx::has_nonzero(migraphx::index_ints<0, 0, 0, 0>{}));
+}
+
+TEST_CASE(has_nonzero_some_nonzero)
+{
+    EXPECT(migraphx::has_nonzero(migraphx::index_ints<0, 0, 1, 0>{}));
+}
+
+TEST_CASE(has_nonzero_all_nonzero)
+{
+    EXPECT(migraphx::has_nonzero(migraphx::index_ints<1, 2, 3, 4>{}));
+}
+
+// ======== halo_lens_for ========
+
+// No padding: halo = output_lens + (input_spatial - out_spatial)
+TEST_CASE(halo_lens_no_padding)
+{
+    // Output 8x8, input 10x10 (e.g. 3x3 conv), tile 4x4
+    // out_spatial = {1,1,8,8}, input_spatial = {1,1,10,10}
+    // halo_extra = {1,1,10,10} - {1,1,8,8} + {0,0,0,0} = {0,0,2,2}
+    // halo_lens = output_lens + halo_extra = {1,1,4,4} + {0,0,2,2} = {1,1,6,6}
+    using output_shape = decltype(make_4d_shape<1, 1, 8, 8>());
+    using input_shape  = decltype(make_4d_shape<1, 1, 10, 10>());
+    using tiler        = migraphx::spatial_tiler<1, migraphx::index_ints<4, 4>, output_shape>;
+
+    constexpr auto hl = tiler::template halo_lens_for<input_shape>();
+    EXPECT(hl[2] == 6);
+    EXPECT(hl[3] == 6);
+}
+
+// With padding: halo = output_lens + (input_spatial - out_spatial + total_padding)
+TEST_CASE(halo_lens_with_padding)
+{
+    // Output 8x8, input 8x8 (same-padding conv), pad {1,1,1,1} → total_pad={0,0,2,2}
+    // halo_extra = {1,1,8,8} - {1,1,8,8} + {0,0,2,2} = {0,0,2,2}
+    // halo_lens = {1,1,4,4} + {0,0,2,2} = {1,1,6,6}
+    using output_shape = decltype(make_4d_shape<1, 1, 8, 8>());
+    using input_shape  = decltype(make_4d_shape<1, 1, 8, 8>());
+    using tiler        = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          output_shape,
+                                          migraphx::index_ints<1, 1, 1, 1>>;
+
+    constexpr auto hl = tiler::template halo_lens_for<input_shape>();
+    EXPECT(hl[2] == 6);
+    EXPECT(hl[3] == 6);
+}
+
+// ======== ndim ========
+
+TEST_CASE(ndim_4d)
+{
+    using tiler = migraphx::spatial_tiler<1,
+                                          migraphx::index_ints<4, 4>,
+                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    EXPECT(tiler::ndim() == 4);
+}

From b0e4634232dcecc10bfba2b83f82d6af5cb3a728 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Tue, 7 Apr 2026 19:57:38 -0500
Subject: [PATCH 78/84] Format

---
 .../migraphx/kernels/spatial_tiler.hpp        | 17 ++--
 test/gpu/kernels/spatial_tiler.cpp            | 92 +++++++++----------
 2 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
index 9c98a269ee7..bc89ef88268 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp
@@ -74,7 +74,7 @@ struct spatial_tiler
 
     static constexpr auto get_padding()
     {
-        if constexpr(Padding{}.size() < 2) 
+        if constexpr(Padding{}.size() < 2)
         {
             auto pre = transform(TileLens{}, [](auto) { return index_c<0>; });
             return join(pre, pre);
@@ -124,13 +124,12 @@ struct spatial_tiler
     template <class InputShape>
     static constexpr auto halo_lens_for()
     {
-        constexpr auto halo_extra =
-            [] {
-                return return_array_c([] {
-                    return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
-                           total_padding();
-                });
-            }();
+        constexpr auto halo_extra = [] {
+            return return_array_c([] {
+                return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() +
+                       total_padding();
+            });
+        }();
         return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; });
     }
 
@@ -170,7 +169,7 @@ struct spatial_tiler
         idx.local_stride(_c<hl.product()>, [&](auto i) {
             auto halo_multi = halo_shape.multi(i);
             auto src_pos    = tile_origin + halo_multi;
-            auto input_pos     = src_pos - left_padding();
+            auto input_pos  = src_pos - left_padding();
             if constexpr(is_padded())
             {
                 smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0};
diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp
index dd0d5950c4e..2cecb75d242 100644
--- a/test/gpu/kernels/spatial_tiler.cpp
+++ b/test/gpu/kernels/spatial_tiler.cpp
@@ -26,7 +26,10 @@
 #include <migraphx/kernels/test.hpp>
 
 // Helper: create a standard 4D shape from lens
-template <migraphx::index_int N, migraphx::index_int C, migraphx::index_int H, migraphx::index_int W>
+template <migraphx::index_int N,
+          migraphx::index_int C,
+          migraphx::index_int H,
+          migraphx::index_int W>
 constexpr auto make_4d_shape()
 {
     constexpr auto lens = migraphx::index_ints<N, C, H, W>{};
@@ -38,9 +41,8 @@ constexpr auto make_4d_shape()
 // Tile {4, 4} with NTiles=1 → output_lens = {1, 1, 4, 4}
 TEST_CASE(output_lens_ntiles_1)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     constexpr auto ol = tiler::output_lens();
     EXPECT(ol.size() == 4);
     EXPECT(ol[0] == 1);
@@ -52,9 +54,8 @@ TEST_CASE(output_lens_ntiles_1)
 // Tile {4, 4} with NTiles=2 → last dim doubled: {1, 1, 4, 8}
 TEST_CASE(output_lens_ntiles_2)
 {
-    using tiler = migraphx::spatial_tiler<2,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<2, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     constexpr auto ol = tiler::output_lens();
     EXPECT(ol[2] == 4);
     EXPECT(ol[3] == 8);
@@ -64,9 +65,8 @@ TEST_CASE(output_lens_ntiles_2)
 
 TEST_CASE(out_spatial_lens_basic)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<2, 3, 16, 16>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<2, 3, 16, 16>())>;
     constexpr auto sl = tiler::out_spatial_lens();
     // keep_spatial sets dims 0,1 to 1; keeps H,W
     EXPECT(sl[0] == 1);
@@ -80,9 +80,8 @@ TEST_CASE(out_spatial_lens_basic)
 // 8x8 output, 4x4 tile, NTiles=1 → ceil(8/4)=2 per spatial dim
 TEST_CASE(tiles_per_dim_exact)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     constexpr auto tpd = tiler::tiles_per_dim();
     EXPECT(tpd[2] == 2);
     EXPECT(tpd[3] == 2);
@@ -91,9 +90,8 @@ TEST_CASE(tiles_per_dim_exact)
 // 10x10 output, 4x4 tile → ceil(10/4)=3 per spatial dim
 TEST_CASE(tiles_per_dim_inexact)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 10, 10>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 10, 10>())>;
     constexpr auto tpd = tiler::tiles_per_dim();
     EXPECT(tpd[2] == 3);
     EXPECT(tpd[3] == 3);
@@ -102,9 +100,8 @@ TEST_CASE(tiles_per_dim_inexact)
 // NTiles=2 scales last dim: tile output is {4, 8} → ceil(16/4)=4, ceil(16/8)=2
 TEST_CASE(tiles_per_dim_ntiles)
 {
-    using tiler = migraphx::spatial_tiler<2,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 16, 16>())>;
+    using tiler = migraphx::
+        spatial_tiler<2, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 16, 16>())>;
     constexpr auto tpd = tiler::tiles_per_dim();
     EXPECT(tpd[2] == 4);
     EXPECT(tpd[3] == 2);
@@ -114,9 +111,8 @@ TEST_CASE(tiles_per_dim_ntiles)
 
 TEST_CASE(tiles_total_exact)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     // tiles_per_dim = {1, 1, 2, 2}, product = 4
     EXPECT(tiler::tiles_total() == 4);
 }
@@ -126,9 +122,8 @@ TEST_CASE(tiles_total_exact)
 // No Padding arg → get_padding returns zeros matching TileLens size
 TEST_CASE(get_padding_default)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     constexpr auto gp = tiler::get_padding();
     EXPECT(gp.size() == 4);
     EXPECT(gp[0] == 0);
@@ -140,9 +135,8 @@ TEST_CASE(get_padding_default)
 // No padding template arg → all zeros
 TEST_CASE(padding_default_no_padding)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     constexpr auto lp = tiler::left_padding();
     constexpr auto tp = tiler::total_padding();
     EXPECT(lp[0] == 0);
@@ -158,10 +152,10 @@ TEST_CASE(padding_default_no_padding)
 // Symmetric padding {1, 1, 1, 1} → left={0,0,1,1}, total={0,0,2,2}
 TEST_CASE(padding_symmetric)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>()),
-                                          migraphx::index_ints<1, 1, 1, 1>>;
+    using tiler       = migraphx::spatial_tiler<1,
+                                                migraphx::index_ints<4, 4>,
+                                                decltype(make_4d_shape<1, 1, 8, 8>()),
+                                                migraphx::index_ints<1, 1, 1, 1>>;
     constexpr auto lp = tiler::left_padding();
     EXPECT(lp[0] == 0);
     EXPECT(lp[1] == 0);
@@ -178,10 +172,10 @@ TEST_CASE(padding_symmetric)
 // Asymmetric padding {1, 2, 3, 4} → left={0,0,1,2}, total={0,0,1+3,2+4}={0,0,4,6}
 TEST_CASE(padding_asymmetric)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>()),
-                                          migraphx::index_ints<1, 2, 3, 4>>;
+    using tiler       = migraphx::spatial_tiler<1,
+                                                migraphx::index_ints<4, 4>,
+                                                decltype(make_4d_shape<1, 1, 8, 8>()),
+                                                migraphx::index_ints<1, 2, 3, 4>>;
     constexpr auto lp = tiler::left_padding();
     EXPECT(lp[2] == 1);
     EXPECT(lp[3] == 2);
@@ -196,18 +190,16 @@ TEST_CASE(padding_asymmetric)
 // Tiles exactly cover output, no conv padding → not padded
 TEST_CASE(is_padded_exact_no_padding)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     EXPECT(not tiler::is_padded());
 }
 
 // Tiles don't exactly cover output (10 not divisible by 4) → padded
 TEST_CASE(is_padded_overhang)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 10, 10>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 10, 10>())>;
     EXPECT(tiler::is_padded());
 }
 
@@ -249,9 +241,8 @@ TEST_CASE(is_padded_overhang_equals_padding)
 TEST_CASE(is_padded_partial_overhang)
 {
     // H=8 exactly tiled by tile_h=4. W=10 not divisible by tile_w=4.
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 10>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 10>())>;
     EXPECT(tiler::is_padded());
 }
 
@@ -309,9 +300,9 @@ TEST_CASE(halo_lens_with_padding)
     using output_shape = decltype(make_4d_shape<1, 1, 8, 8>());
     using input_shape  = decltype(make_4d_shape<1, 1, 8, 8>());
     using tiler        = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          output_shape,
-                                          migraphx::index_ints<1, 1, 1, 1>>;
+                                                 migraphx::index_ints<4, 4>,
+                                                 output_shape,
+                                                 migraphx::index_ints<1, 1, 1, 1>>;
 
     constexpr auto hl = tiler::template halo_lens_for<input_shape>();
     EXPECT(hl[2] == 6);
@@ -322,8 +313,7 @@ TEST_CASE(halo_lens_with_padding)
 
 TEST_CASE(ndim_4d)
 {
-    using tiler = migraphx::spatial_tiler<1,
-                                          migraphx::index_ints<4, 4>,
-                                          decltype(make_4d_shape<1, 1, 8, 8>())>;
+    using tiler = migraphx::
+        spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>;
     EXPECT(tiler::ndim() == 4);
 }

From eecf7855f4182812f7bfc7662c9e5ba45a5836a0 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 22 Apr 2026 11:04:22 -0500
Subject: [PATCH 79/84] Fix tidy

---
 test/gpu/kernels/spatial_tiler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp
index 2cecb75d242..6ce858b83f3 100644
--- a/test/gpu/kernels/spatial_tiler.cpp
+++ b/test/gpu/kernels/spatial_tiler.cpp
@@ -30,7 +30,7 @@ template <migraphx::index_int N,
           migraphx::index_int C,
           migraphx::index_int H,
           migraphx::index_int W>
-constexpr auto make_4d_shape()
+static constexpr auto make_4d_shape()
 {
     constexpr auto lens = migraphx::index_ints<N, C, H, W>{};
     return migraphx::make_shape(lens);

From a3b61a2b5471c8c417adb368f2d0ec4ff02448c1 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 22 Apr 2026 12:39:47 -0500
Subject: [PATCH 80/84] Fix cppcheck warnings

---
 .../include/migraphx/kernels/debug.hpp        |  9 +++++++++
 .../include/migraphx/kernels/float8.hpp       | 20 ++++++++-----------
 .../include/migraphx/kernels/float8_impl.hpp  |  1 +
 .../include/migraphx/kernels/slice.hpp        |  3 +--
 test/gpu/kernels/spatial_tiler.cpp            |  2 ++
 5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
index 5e5e16b1315..3e7fffaa2f5 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
@@ -206,6 +206,14 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l
 #define MIGRAPHX_CHECK(cond) \
     MIGRAPHX_ASSERT_FAIL(cond, #cond, __FILE__, __LINE__, __PRETTY_FUNCTION__)
 
+#ifdef CPPCHECK // assert-based stand-ins for the debug macros when CPPCHECK is defined
+// NOLINTNEXTLINE
+#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) T
+#define MIGRAPHX_ASSUME(cond) assert(cond)
+#define MIGRAPHX_UNREACHABLE assert(false)
+#define MIGRAPHX_ASSERT(cond) assert(cond)
+#define MIGRAPHX_WARN(cond, ...) assert(cond)
+#else
 #ifdef MIGRAPHX_DEBUG
 // NOLINTNEXTLINE
 #define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) source_location_capture<T>
@@ -221,6 +229,7 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l
 #define MIGRAPHX_ASSERT(cond)
 #define MIGRAPHX_WARN(...)
 #endif
+#endif
 
 #define MIGRAPHX_STATIC_ASSERT_FOR(...) \
     static_assert(__VA_ARGS__);         \
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
index 43ee2ca5d87..527a0c7915b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
@@ -59,7 +59,7 @@ class numeric_limits;
 template <migraphx::fp8::f8_type T = migraphx::fp8::f8_type::fp8, bool FNUZ = true>
 struct float8
 {
-    uint8_t data;
+    uint8_t data = 0;
     // default constructor
     __device__ constexpr float8() = default;
     // default copy constructor
@@ -140,7 +140,7 @@ struct float8
            migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
            uint32_t rng                    = 0)
     {
-        if(__builtin_is_constant_evaluated() or !FNUZ)
+        if(__builtin_is_constant_evaluated() or not FNUZ)
         {
             if constexpr(T == migraphx::fp8::f8_type::fp8)
             {
@@ -249,7 +249,7 @@ struct float8
     // upcast using device specific intrinsic
     constexpr __device__ operator float() const
     {
-        if(__builtin_is_constant_evaluated() or !FNUZ)
+        if(__builtin_is_constant_evaluated() or not FNUZ)
         {
             if constexpr(T == migraphx::fp8::f8_type::fp8)
             {
@@ -261,7 +261,7 @@ struct float8
         else
         {
             float fval      = 0;
-            uint32_t i32val = static_cast<uint32_t>(data);
+            uint32_t i32val = data;
 
             // upcast
             if constexpr(T == migraphx::fp8::f8_type::fp8)
@@ -312,7 +312,7 @@ struct float8
         }
         else
         {
-            if(T == migraphx::fp8::f8_type::bf8)
+            if constexpr(T == migraphx::fp8::f8_type::bf8)
             {
                 return (data == 0x7D) or (data == 0x7E) or (data == 0x7F) or (data == 0xFD) or
                        (data == 0xFE) or (data == 0xFF);
@@ -333,7 +333,7 @@ struct float8
         }
         else
         {
-            if(T == migraphx::fp8::f8_type::bf8)
+            if constexpr(T == migraphx::fp8::f8_type::bf8)
             {
                 return (data == 0x7C) or (data == 0xFC);
             }
@@ -370,16 +370,12 @@ struct float8
 
     __device__ constexpr bool operator<(const float8& rhs) const
     {
-        const auto we   = static_cast<float>(*this);
-        const auto them = static_cast<float>(rhs);
-        return we < them;
+        return static_cast<float>(*this) < static_cast<float>(rhs);
     }
 
     __device__ constexpr bool operator>(const float8& rhs) const
     {
-        const auto we   = static_cast<float>(*this);
-        const auto them = static_cast<float>(rhs);
-        return we > them;
+        return static_cast<float>(*this) > static_cast<float>(rhs);
     }
 };
 
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
index 09ab146fbed..9fbe5e6f740 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
@@ -118,6 +118,7 @@ __device__ constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng
     if(x == 0)
         return 0;
     // handle negative zero
+    // cppcheck-suppress compareValueOutOfTypeRangeError
     else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
     {
         return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
index 89f1a4a615e..f7adee4eec5 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp
@@ -98,13 +98,12 @@ template <index_int N>
 constexpr auto slice_group()
 {
     return slice_size_transform{[](auto input, auto s) {
-        auto r = return_array_c([] {
+        return return_array_c([] {
             auto lens = decltype(s){}.lens.base();
             lens.back() *= N;
             lens -= 1;
             return decltype(input){}.lens.carry(lens) + index_int{1};
         });
-        return r;
     }};
 }
 
diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp
index 6ce858b83f3..76875ec7047 100644
--- a/test/gpu/kernels/spatial_tiler.cpp
+++ b/test/gpu/kernels/spatial_tiler.cpp
@@ -22,9 +22,11 @@
  * THE SOFTWARE.
  *
  */
+// cppcheck-suppress-file constStatement
 #include <migraphx/kernels/spatial_tiler.hpp>
 #include <migraphx/kernels/test.hpp>
 
+
 // Helper: create a standard 4D shape from lens
 template <migraphx::index_int N,
           migraphx::index_int C,

From 1457b47f45d128c8cac7142937f9ecf32b0ae477 Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 22 Apr 2026 12:39:55 -0500
Subject: [PATCH 81/84] Format

---
 test/gpu/kernels/spatial_tiler.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp
index 76875ec7047..058eb8e4274 100644
--- a/test/gpu/kernels/spatial_tiler.cpp
+++ b/test/gpu/kernels/spatial_tiler.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/kernels/spatial_tiler.hpp>
 #include <migraphx/kernels/test.hpp>
 
-
 // Helper: create a standard 4D shape from lens
 template <migraphx::index_int N,
           migraphx::index_int C,

From 7a5abf9b02d8877c5462be6952364e0ab1a5e20f Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Wed, 22 Apr 2026 12:40:14 -0500
Subject: [PATCH 82/84] Update year

---
 src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp      | 2 +-
 .../gpu/kernels/include/migraphx/kernels/integral_constant.hpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
index 3e7fffaa2f5..aa46782ce58 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
index 9d48717bbd8..e8f16b9d5e0 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From 7d9e8766415f9e3235b8ee830ddb2a265030466b Mon Sep 17 00:00:00 2001
From: Paul <pfultz2@yahoo.com>
Date: Fri, 15 May 2026 14:41:50 -0500
Subject: [PATCH 83/84] Fix tile miscompilation

---
 .../gpu/kernels/include/migraphx/kernels/tile.hpp        | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
index 1f11b214fd1..f183a91d369 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
@@ -28,6 +28,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/uninitialized_buffer.hpp>
 #include <migraphx/kernels/copy.hpp>
 
 namespace migraphx {
@@ -61,8 +62,8 @@ struct tile
                 using type          = typename T::type;
                 constexpr auto s    = pad_shape(make_packed_shape(get_shape_c<T>{}));
                 constexpr auto size = s.element_space();
-                __shared__ type buffer[size];
-                auto b = make_tensor_view(buffer, s);
+                __shared__ uninitialized_buffer<type, size> buffer;
+                auto b = make_tensor_view(buffer.data(), s);
                 local_tensor_copy(idx, x, b);
                 f(b);
             };
@@ -77,8 +78,8 @@ struct tile
                 using type          = typename T::type;
                 constexpr auto s    = pad_shape(make_packed_shape(get_shape_c<T>{}));
                 constexpr auto size = s.element_space();
-                __shared__ type buffer[size];
-                auto b = make_tensor_view(buffer, s);
+                __shared__ uninitialized_buffer<type, size> buffer;
+                auto b = make_tensor_view(buffer.data(), s);
                 f(b);
                 local_tensor_copy(idx, b, x);
             };

From c19c8b537315b0c94dbe0449ed6bc02091d05586 Mon Sep 17 00:00:00 2001
From: kahmed10 <15948690+kahmed10@users.noreply.github.com>
Date: Wed, 20 May 2026 23:40:33 -0500
Subject: [PATCH 84/84] update license

---
 src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp | 2 +-
 src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
index 527a0c7915b..08640e9e07b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
@@ -2,7 +2,7 @@
  *
  * The MIT License (MIT)
  *
- * Copyright (C) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
index f183a91d369..6ccbd0ba17f 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal