From 4efdd1a5e405087b2924620629e9cef1053d64e1 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 15:55:29 +0000 Subject: [PATCH 01/84] Add channelwise conv --- src/targets/gpu/jit/channelwise_conv.cpp | 103 ++++++++++++++++ .../migraphx/kernels/channelwise_conv.hpp | 61 ++++++++++ src/targets/gpu/prefuse_ops.cpp | 110 ++++++++++++++++++ test/verify/test_channelwise_conv.cpp | 104 +++++++++++++++++ 4 files changed, 378 insertions(+) create mode 100644 src/targets/gpu/jit/channelwise_conv.cpp create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp create mode 100644 test/verify/test_channelwise_conv.cpp diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp new file mode 100644 index 00000000000..88218c2a03d --- /dev/null +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -0,0 +1,103 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// NOLINTNEXTLINE +static const char* const channelwise_conv_kernel = R"__migraphx__( +#include +#include +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) +{ + transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { + channelwise_conv<${algo}>(index_ints<${kernel}>{}, output, x, w); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +struct channelwise_conv_compiler : compiler +{ + std::vector names() const + { + return {"gpu::channelwise_conv", "channelwise_conv"}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + hip_compile_options options; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; + options.virtual_inputs = inputs; + + auto x_lens = x_s.lens(); + std::vector kernel_sizes(x_lens.begin() + 2, + x_lens.begin() + 2 + num_spatial); + std::size_t kernel_total = 1; + for(auto k : kernel_sizes) + kernel_total *= k; + + std::string algo = "reduce::lane"; + std::size_t block_size = 256; + + options.set_launch_params( + v, compute_global_for(ctx, out_s.elements(), 256), block_size); + + auto src = interpolate_string(channelwise_conv_kernel, + {{"algo", algo}, + {"kernel", to_string_range(kernel_sizes)}}); + + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp new file mode 100644 index 00000000000..5f3a454546a --- /dev/null +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP +#define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP + +#include +#include +#include +#include + +namespace migraphx { + +template +__device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x, Input2 w) +{ + constexpr index_int NS = array_size(KernelLens{}); + constexpr index_int NDIM = 2 + 2 * NS; + constexpr index_int kernel_total = KernelLens{}.product(); + + pooling_reduce(output, [&](auto out_idx, auto r) { + auto result = r.reduce(op::sum{}, 0, [&](auto ki) { + auto kmulti = kernel_lens.multi(ki); + auto bcast_idx = generate_array( + _c, [&](auto d) -> index_int { + if constexpr(d < 2) + return out_idx[d]; + else if constexpr(d < 2 + NS) + return kmulti[d - _c<2>]; + else + return out_idx[d - _c] + kmulti[d - _c<2 + NS>]; + }); + return x[bcast_idx] * w[bcast_idx]; + })(reduce::make_indices(_c)); + return result; + }); +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 7d3cd43b8db..6153d0a3811 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -27,9 +27,11 @@ #include #include #include +#include #include #include #include +#include #ifdef MIGRAPHX_USE_COMPOSABLEKERNEL #include #endif @@ -237,6 +239,113 @@ struct find_gemm_softmax_gemm } }; +struct channelwise_conv +{ + std::size_t num_spatial = 2; + + std::string name() const { return "gpu::channelwise_conv"; } + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.num_spatial, "num_spatial")); + } + + shape compute_shape(std::vector inputs) const + { + check_shapes{inputs, *this}.has(2); + auto lens = inputs.front().lens(); + std::vector out_lens; + out_lens.push_back(lens[0]); + out_lens.push_back(lens[1]); + for(std::size_t d = 0; d < num_spatial; ++d) + { + auto kernel_size = lens[2 + d]; + auto spatial_size = lens[2 + num_spatial + d]; + out_lens.push_back(spatial_size - kernel_size + 1); + } + return {inputs.front().type(), out_lens}; + } +}; +MIGRAPHX_REGISTER_OP(channelwise_conv); + +MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins) +{ + if(ins->name() != "convolution") + return false; + auto v = ins->get_operator().to_value(); + if(not all_of(v.at("stride"), [](const value& x) { return x.to() == 1; })) + return false; + if(not all_of(v.at("padding"), [](const value& x) { return x.to() == 0; })) + return false; + if(not all_of(v.at("dilation"), [](const value& x) { return x.to() == 1; })) + return false; + auto w_lens = ins->inputs().back()->get_shape().lens(); + if(w_lens[1] != 1) + return false; + auto x_lens = ins->inputs().front()->get_shape().lens(); + auto c_in = x_lens[1]; + auto group = v.at("group").to(); + if(group != 1 and group != static_cast(c_in)) + return false; + return true; +} + +struct find_channelwise_convolution +{ + auto matcher() const { return conv_channelwise(); } + + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + + auto input = ins->inputs().front(); + auto weights = ins->inputs().back(); + + auto w_lens = weights->get_shape().lens(); + auto x_lens = input->get_shape().lens(); + auto ndim = ins->get_shape().ndim(); + auto num_spatial = ndim - 2; + + // Build product shape: [N, C, k_0, ..., k_{ns-1}, s_0, ..., s_{ns-1}] + std::vector prod_lens; + prod_lens.push_back(x_lens[0]); + prod_lens.push_back(w_lens[0]); + for(std::size_t d = 2; d < ndim; ++d) + prod_lens.push_back(w_lens[d]); + for(std::size_t d = 2; d < ndim; ++d) + prod_lens.push_back(x_lens[d]); + + // Unsqueeze input: [N, C_in, H, W] -> [N, C_in, 1, ..., 1, H, W] + std::vector input_unsq_axes(num_spatial); + std::iota(input_unsq_axes.begin(), input_unsq_axes.end(), 2); + auto unsq_input = + m.insert_instruction(ins, make_op("unsqueeze", {{"axes", input_unsq_axes}}), input); + + // Broadcast input to product shape + auto bcast_input = m.insert_instruction( + ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_input); + + // Squeeze weight axis 1: [C_out, 1, k_0, ...] -> [C_out, k_0, ...] + auto sq_weights = m.insert_instruction(ins, make_op("squeeze", {{"axes", {1}}}), weights); + + // Unsqueeze weight: [C_out, k_0, ...] -> [1, C_out, k_0, ..., 1, ..., 1] + std::vector w_unsq_axes; + w_unsq_axes.push_back(0); + for(std::size_t d = 0; d < num_spatial; ++d) + w_unsq_axes.push_back(static_cast(2 + num_spatial + d)); + auto unsq_weights = + m.insert_instruction(ins, make_op("unsqueeze", {{"axes", w_unsq_axes}}), sq_weights); + + // Broadcast weight to product shape + auto bcast_weights = m.insert_instruction( + ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights); + + m.replace_instruction( + ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights); + } +}; + void inline_group_sub_module(module_pass_manager& mpm) { auto& m = mpm.get_module(); @@ -262,6 +371,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const match::find_matches(mpm.get_module(), find_add_layernorm{}); } match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention}); + match::find_matches(mpm.get_module(), find_channelwise_convolution{}); if(enabled(MIGRAPHX_DISABLE_MLIR{})) { diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp new file mode 100644 index 00000000000..1367fd85076 --- /dev/null +++ b/test/verify/test_channelwise_conv.cpp @@ -0,0 +1,104 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_depthwise + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_depthwise; +template struct test_channelwise_conv_depthwise; + +template +struct test_channelwise_conv_single_channel + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution"), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_single_channel; +template struct test_channelwise_conv_single_channel; + +template +struct test_channelwise_conv_depthwise_5x5 + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_depthwise_5x5; +template struct test_channelwise_conv_depthwise_5x5; + +template +struct test_channelwise_conv_1d + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}}); + mm->add_instruction( + migraphx::make_op("convolution", + {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}), + input, + weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_1d; +template struct test_channelwise_conv_1d; From a0c6b07963d5686b6f4a81a0dddd0ef613001091 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 15:55:32 +0000 Subject: [PATCH 02/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 26 +++++++------------ .../migraphx/kernels/channelwise_conv.hpp | 17 ++++++------ src/targets/gpu/prefuse_ops.cpp | 3 +-- test/verify/test_channelwise_conv.cpp | 6 ++--- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 88218c2a03d..e0101186e5b 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,25 +56,21 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const - { - return {"gpu::channelwise_conv", "channelwise_conv"}; - } + std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); - std::vector kernel_sizes(x_lens.begin() + 2, - x_lens.begin() + 2 + num_spatial); + std::vector kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); std::size_t kernel_total = 1; for(auto k : kernel_sizes) kernel_total *= k; @@ -82,12 +78,10 @@ struct channelwise_conv_compiler : compiler std::string algo = "reduce::lane"; std::size_t block_size = 256; - options.set_launch_params( - v, compute_global_for(ctx, out_s.elements(), 256), block_size); + options.set_launch_params(v, compute_global_for(ctx, out_s.elements(), 256), block_size); auto src = interpolate_string(channelwise_conv_kernel, - {{"algo", algo}, - {"kernel", to_string_range(kernel_sizes)}}); + {{"algo", algo}, {"kernel", to_string_range(kernel_sizes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 5f3a454546a..0ade178111a 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -42,15 +42,14 @@ __device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x pooling_reduce(output, [&](auto out_idx, auto r) { auto result = r.reduce(op::sum{}, 0, [&](auto ki) { auto kmulti = kernel_lens.multi(ki); - auto bcast_idx = generate_array( - _c, [&](auto d) -> index_int { - if constexpr(d < 2) - return out_idx[d]; - else if constexpr(d < 2 + NS) - return kmulti[d - _c<2>]; - else - return out_idx[d - _c] + kmulti[d - _c<2 + NS>]; - }); + auto bcast_idx = generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return out_idx[d]; + else if constexpr(d < 2 + NS) + return kmulti[d - _c<2>]; + else + return out_idx[d - _c] + kmulti[d - _c<2 + NS>]; + }); return x[bcast_idx] * w[bcast_idx]; })(reduce::make_indices(_c)); return result; diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 6153d0a3811..e3a39d05fb7 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -341,8 +341,7 @@ struct find_channelwise_convolution auto bcast_weights = m.insert_instruction( ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights); - m.replace_instruction( - ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights); + m.replace_instruction(ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights); } }; diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp index 1367fd85076..e3483480d8e 100644 --- a/test/verify/test_channelwise_conv.cpp +++ b/test/verify/test_channelwise_conv.cpp @@ -28,8 +28,7 @@ #include template -struct test_channelwise_conv_depthwise - : verify_program> +struct test_channelwise_conv_depthwise : verify_program> { migraphx::program create_program() const { @@ -82,8 +81,7 @@ template struct test_channelwise_conv_depthwise_5x5 template struct test_channelwise_conv_depthwise_5x5; template -struct test_channelwise_conv_1d - : verify_program> +struct test_channelwise_conv_1d : verify_program> { migraphx::program create_program() const { From efeafca404d5b07be2dbcf5ec7face4b6831ad65 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:20:50 +0000 Subject: [PATCH 03/84] Use shared memory --- src/targets/gpu/jit/channelwise_conv.cpp | 33 +++--- .../migraphx/kernels/channelwise_conv.hpp | 112 ++++++++++++++---- 2 files changed, 108 insertions(+), 37 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index e0101186e5b..7cbe07938ae 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -44,7 +44,7 @@ extern "C" { MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) { transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { - channelwise_conv<${algo}>(index_ints<${kernel}>{}, output, x, w); + channelwise_conv(index_ints<${kernel}>{}, index_ints<${spatial}>{}, output, x, w); }); } @@ -56,32 +56,35 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } + std::vector names() const + { + return {"gpu::channelwise_conv", "channelwise_conv"}; + } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); - std::vector kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); - std::size_t kernel_total = 1; - for(auto k : kernel_sizes) - kernel_total *= k; + std::vector kernel_sizes(x_lens.begin() + 2, + x_lens.begin() + 2 + num_spatial); + std::vector spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end()); - std::string algo = "reduce::lane"; + auto num_channels = out_s.lens()[0] * out_s.lens()[1]; std::size_t block_size = 256; - options.set_launch_params(v, compute_global_for(ctx, out_s.elements(), 256), block_size); + options.set_launch_params(v, num_channels * block_size, block_size); auto src = interpolate_string(channelwise_conv_kernel, - {{"algo", algo}, {"kernel", to_string_range(kernel_sizes)}}); + {{"kernel", to_string_range(kernel_sizes)}, + {"spatial", to_string_range(spatial_sizes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 0ade178111a..b72056ba15c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -26,34 +26,102 @@ #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP #include -#include -#include -#include +#include namespace migraphx { -template -__device__ void channelwise_conv(KernelLens kernel_lens, Output output, Input1 x, Input2 w) +template +__device__ void channelwise_conv(KernelLens kernel_lens, + SpatialLens, + Output output, + Input1 x, + Input2 w) { - constexpr index_int NS = array_size(KernelLens{}); - constexpr index_int NDIM = 2 + 2 * NS; - constexpr index_int kernel_total = KernelLens{}.product(); - - pooling_reduce(output, [&](auto out_idx, auto r) { - auto result = r.reduce(op::sum{}, 0, [&](auto ki) { - auto kmulti = kernel_lens.multi(ki); - auto bcast_idx = generate_array(_c, [&](auto d) -> index_int { - if constexpr(d < 2) - return out_idx[d]; - else if constexpr(d < 2 + NS) - return kmulti[d - _c<2>]; - else - return out_idx[d - _c] + kmulti[d - _c<2 + NS>]; - }); - return x[bcast_idx] * w[bcast_idx]; - })(reduce::make_indices(_c)); + constexpr index_int NS = array_size(KernelLens{}); + constexpr index_int kernel_total = KernelLens{}.product(); + constexpr index_int spatial_total = SpatialLens{}.product(); + constexpr index_int product_total = kernel_total * spatial_total; + + constexpr auto out_spatial_lens = return_array_c([] { + constexpr auto kl = KernelLens{}; + constexpr auto sl = SpatialLens{}; + constexpr index_int ns = array_size(KernelLens{}); + array result; + for(index_int i = 0; i < ns; i++) + result[i] = sl[i] - kl[i] + 1; return result; }); + constexpr index_int out_spatial_total = out_spatial_lens.product(); + + constexpr auto prod_lens = return_array_c([] { + constexpr auto kl = KernelLens{}; + constexpr auto sl = SpatialLens{}; + constexpr index_int ns = array_size(KernelLens{}); + array result; + for(index_int i = 0; i < ns; i++) + result[i] = kl[i]; + for(index_int i = 0; i < ns; i++) + result[ns + i] = sl[i]; + return result; + }); + constexpr auto smem_shape = make_shape(prod_lens); + + using T = typename Output::type; + __shared__ T smem[product_total]; + + auto idx = make_index(); + + index_int C = output.get_shape().lens[1]; + auto n = idx.group / C; + auto c = idx.group % C; + + // Phase 1: elementwise multiply into shared memory + for(index_int i = idx.local; i < product_total; i += idx.nlocal()) + { + auto prod_multi = prod_lens.multi(i); + auto bcast_idx = + generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { + if constexpr(d == 0) + return n; + else if constexpr(d == 1) + return c; + else + return prod_multi[d - _c<2>]; + }); + smem[i] = x[bcast_idx] * w[bcast_idx]; + } + + __syncthreads(); + + auto smem_view = make_tensor_view(&smem[0], smem_shape); + + // Phase 2: sliding window reduce from shared memory + for(index_int j = idx.local; j < out_spatial_total; j += idx.nlocal()) + { + auto out_spatial = out_spatial_lens.multi(j); + T acc = 0; + for(index_int ki = 0; ki < kernel_total; ki++) + { + auto k_multi = kernel_lens.multi(ki); + auto smem_idx = generate_array(_c<2 * NS>, [&](auto d) -> index_int { + if constexpr(d < NS) + return k_multi[d]; + else + return out_spatial[d - _c] + k_multi[d - _c]; + }); + acc += smem_view[smem_idx]; + } + + auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { + if constexpr(d == 0) + return n; + else if constexpr(d == 1) + return c; + else + return out_spatial[d - _c<2>]; + }); + output[out_idx] = acc; + } } } // namespace migraphx From 44989349ef9677d45fe22281f219b9afd0f9a487 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:20:56 +0000 Subject: [PATCH 04/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 22 ++++------ .../migraphx/kernels/channelwise_conv.hpp | 44 +++++++++---------- 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 7cbe07938ae..ad535485512 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,28 +56,24 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const - { - return {"gpu::channelwise_conv", "channelwise_conv"}; - } + std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); - std::vector kernel_sizes(x_lens.begin() + 2, - x_lens.begin() + 2 + num_spatial); + std::vector kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); std::vector spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end()); - auto num_channels = out_s.lens()[0] * out_s.lens()[1]; + auto num_channels = out_s.lens()[0] * out_s.lens()[1]; std::size_t block_size = 256; options.set_launch_params(v, num_channels * block_size, block_size); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index b72056ba15c..fef75d5573e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -31,21 +31,18 @@ namespace migraphx { template -__device__ void channelwise_conv(KernelLens kernel_lens, - SpatialLens, - Output output, - Input1 x, - Input2 w) +__device__ void +channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, Input2 w) { constexpr index_int NS = array_size(KernelLens{}); constexpr index_int kernel_total = KernelLens{}.product(); constexpr index_int spatial_total = SpatialLens{}.product(); constexpr index_int product_total = kernel_total * spatial_total; - constexpr auto out_spatial_lens = return_array_c([] { - constexpr auto kl = KernelLens{}; - constexpr auto sl = SpatialLens{}; - constexpr index_int ns = array_size(KernelLens{}); + constexpr auto out_spatial_lens = return_array_c([] { + constexpr auto kl = KernelLens{}; + constexpr auto sl = SpatialLens{}; + constexpr index_int ns = array_size(KernelLens{}); array result; for(index_int i = 0; i < ns; i++) result[i] = sl[i] - kl[i] + 1; @@ -53,10 +50,10 @@ __device__ void channelwise_conv(KernelLens kernel_lens, }); constexpr index_int out_spatial_total = out_spatial_lens.product(); - constexpr auto prod_lens = return_array_c([] { - constexpr auto kl = KernelLens{}; - constexpr auto sl = SpatialLens{}; - constexpr index_int ns = array_size(KernelLens{}); + constexpr auto prod_lens = return_array_c([] { + constexpr auto kl = KernelLens{}; + constexpr auto sl = SpatialLens{}; + constexpr index_int ns = array_size(KernelLens{}); array result; for(index_int i = 0; i < ns; i++) result[i] = kl[i]; @@ -79,16 +76,15 @@ __device__ void channelwise_conv(KernelLens kernel_lens, for(index_int i = idx.local; i < product_total; i += idx.nlocal()) { auto prod_multi = prod_lens.multi(i); - auto bcast_idx = - generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { - if constexpr(d == 0) - return n; - else if constexpr(d == 1) - return c; - else - return prod_multi[d - _c<2>]; - }); - smem[i] = x[bcast_idx] * w[bcast_idx]; + auto bcast_idx = generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { + if constexpr(d == 0) + return n; + else if constexpr(d == 1) + return c; + else + return prod_multi[d - _c<2>]; + }); + smem[i] = x[bcast_idx] * w[bcast_idx]; } __syncthreads(); @@ -112,7 +108,7 @@ __device__ void channelwise_conv(KernelLens kernel_lens, acc += smem_view[smem_idx]; } - auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { + auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { if constexpr(d == 0) return n; else if constexpr(d == 1) From 1792edbb745ae0465e9e0024fb6a8eb996c16334 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:23:08 +0000 Subject: [PATCH 05/84] Update slice functions --- .../include/migraphx/kernels/slice.hpp | 261 +++++++++--------- 1 file changed, 135 insertions(+), 126 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp index 90c8b6a7dd6..00db73aebd0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp @@ -21,129 +21,138 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP -#define MIGRAPHX_GUARD_KERNELS_SLICE_HPP - -#include -#include -#include - -namespace migraphx { - -template -constexpr auto slice_make_multi_lens(Shape, Size) -{ - return return_array_c([] { - auto n = Size{} - _c<1>; - auto i = Shape{}.multi(n); - using type = typename decltype(i)::value_type; - return i + type{1}; - }); -} - -template -constexpr auto slice_make_multi_lens(Shape, integral_const_array x) -{ - return x; -} - -template -constexpr auto make_slice(Shape, Select select) -{ - auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int { - if(select(x, ii, Shape{}.lens.size())) - return x; - return 1; - }); - return make_shape(inner_lens, Shape{}.strides); -} - -template -constexpr auto make_slice(Shape input, Select select, Size size) -{ - auto as = make_slice(input, select); - auto lens = slice_make_multi_lens(as, size); - return make_shape(lens, Shape{}.strides); -} - -template -struct slice_size_transform -{ - F f; - - template - constexpr auto operator()(Ts... xs) const - { - return f(xs...); - } -}; -MIGRAPHX_AUTO_DEDUCE(slice_size_transform); - -template -constexpr auto make_slice(Shape input, Select select, slice_size_transform t) -{ - auto as = make_slice(input, select); - auto lens = slice_make_multi_lens(as, decltype(t(input, as)){}); - return make_shape(lens, Shape{}.strides); -} - -template -constexpr auto nslices(Shape input, Ss... ss) -{ - auto as = make_slice(input, ss...); - return input.elements() / as.elements(); -} - -template -constexpr auto slice_group() -{ - return slice_size_transform{[](auto input, auto s) { - auto r = return_array_c([] { - auto lens = decltype(s){}.lens.base(); - lens.back() *= N; - lens -= 1; - return decltype(input){}.lens.carry(lens) + index_int{1}; - }); - return r; - }}; -} - -template -constexpr auto slice_split() -{ - return slice_size_transform{[](auto, auto s) { return s.elements() / _c; }}; -} - -template -constexpr auto slice_axes() -{ - return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); }; -} - -template -constexpr auto slice_tensor(Input input, T start, Ss... ss) -{ - constexpr auto inner_shape = make_slice(get_shape_c{}, ss...); - auto outer_lens = transform(get_shape_c{}.lens, - inner_shape.lens, - [=](auto x, auto inner) { return 1 + x - inner; }); - auto outer_shape = make_shape(outer_lens, get_shape_c{}.strides); - auto offset = outer_shape.index(start); - MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c{}.element_space()); - return make_tensor_view(input.data() + offset, inner_shape); -} - -template -constexpr auto slice_schedule(index idx, Ss... ss) -{ - return [=](auto... xs) { - return [=](auto f) { - // TODO: Assert nslices is the same for all xs - constexpr auto n = nslices(get_shape_c()(xs...))>{}, ss...); - Schedule{idx}.group_stride(n, [&](auto i) { f(slice_tensor(xs, i, ss...)...); }); - }; - }; -} - -} // namespace migraphx -#endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP + #ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP + #define MIGRAPHX_GUARD_KERNELS_SLICE_HPP + + #include + #include + #include + + namespace migraphx { + + template + constexpr auto slice_make_multi_lens(Shape, Size) + { + return return_array_c([] { + auto n = Size{} - _c<1>; + auto i = Shape{}.multi(n); + using type = typename decltype(i)::value_type; + return i + type{1}; + }); + } + + template + constexpr auto slice_make_multi_lens(Shape, integral_const_array x) + { + return x; + } + + template + constexpr auto make_slice(Shape, Select select) + { + auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int { + if(select(x, ii, Shape{}.lens.size())) + return x; + return 1; + }); + return make_shape(inner_lens, Shape{}.strides); + } + + template + constexpr auto make_slice(Shape input, Select select, Size size) + { + auto as = make_slice(input, select); + auto lens = slice_make_multi_lens(as, size); + return make_shape(lens, Shape{}.strides); + } + + template + struct slice_size_transform + { + F f; + + template + constexpr auto operator()(Ts... xs) const + { + return f(xs...); + } + }; + MIGRAPHX_AUTO_DEDUCE(slice_size_transform); + + template + constexpr auto make_slice(Shape input, Select select, slice_size_transform t) + { + auto as = make_slice(input, select); + auto lens = slice_make_multi_lens(as, decltype(t(input, as)){}); + return make_shape(lens, Shape{}.strides); + } + + template + constexpr auto nslices(Shape input, Ss... ss) + { + auto as = make_slice(input, ss...); + return input.elements() / as.elements(); + } + + template + constexpr auto slice_group() + { + return slice_size_transform{[](auto input, auto s) { + auto r = return_array_c([] { + auto lens = decltype(s){}.lens.base(); + lens.back() *= N; + lens -= 1; + return decltype(input){}.lens.carry(lens) + index_int{1}; + }); + return r; + }}; + } + + template + constexpr auto slice_split() + { + return slice_size_transform{[](auto, auto s) { return s.elements() / _c; }}; + } + + template + constexpr auto slice_axes() + { + return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); }; + } + + template + constexpr auto slice_tensor(Input input, T start, Ss... ss) + { + constexpr auto inner_shape = make_slice(get_shape_c{}, ss...); + auto outer_lens = transform( + get_shape_c{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; }); + // TODO: Handle non-divisble dimensions + auto outer_shape = make_shape(outer_lens, get_shape_c{}.strides * inner_shape.lens); + auto offset = outer_shape.index(start); + MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() == + input.get_shape().elements()); + MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c{}.element_space()); + return make_tensor_view(input.data() + offset, inner_shape); + } + + template + constexpr auto slice_schedule(index idx, Ss... ss) + { + return [=](auto... xs) { + return [=](auto f) { + constexpr auto first = get_shape_c()(xs...))>{}; + constexpr auto n = nslices(first, ss...); + MIGRAPHX_ASSERT(((n == nslices(get_shape_c{}, ss...)) and ...)); + Schedule{idx}.group_stride(n, [&](auto i) { + MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n == + xs.get_shape().elements()) and + ...)); + f(slice_tensor(xs, i, ss...)...); + }); + }; + }; + } + + } // namespace migraphx + #endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP + \ No newline at end of file From 030497288acaa717b113619159a78b809b3f61c2 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:23:11 +0000 Subject: [PATCH 06/84] Format --- .../include/migraphx/kernels/slice.hpp | 269 +++++++++--------- 1 file changed, 134 insertions(+), 135 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp index 00db73aebd0..4bc4d6354b6 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp @@ -21,138 +21,137 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ - #ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP - #define MIGRAPHX_GUARD_KERNELS_SLICE_HPP - - #include - #include - #include - - namespace migraphx { - - template - constexpr auto slice_make_multi_lens(Shape, Size) - { - return return_array_c([] { - auto n = Size{} - _c<1>; - auto i = Shape{}.multi(n); - using type = typename decltype(i)::value_type; - return i + type{1}; - }); - } - - template - constexpr auto slice_make_multi_lens(Shape, integral_const_array x) - { - return x; - } - - template - constexpr auto make_slice(Shape, Select select) - { - auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int { - if(select(x, ii, Shape{}.lens.size())) - return x; - return 1; - }); - return make_shape(inner_lens, Shape{}.strides); - } - - template - constexpr auto make_slice(Shape input, Select select, Size size) - { - auto as = make_slice(input, select); - auto lens = slice_make_multi_lens(as, size); - return make_shape(lens, Shape{}.strides); - } - - template - struct slice_size_transform - { - F f; - - template - constexpr auto operator()(Ts... xs) const - { - return f(xs...); - } - }; - MIGRAPHX_AUTO_DEDUCE(slice_size_transform); - - template - constexpr auto make_slice(Shape input, Select select, slice_size_transform t) - { - auto as = make_slice(input, select); - auto lens = slice_make_multi_lens(as, decltype(t(input, as)){}); - return make_shape(lens, Shape{}.strides); - } - - template - constexpr auto nslices(Shape input, Ss... ss) - { - auto as = make_slice(input, ss...); - return input.elements() / as.elements(); - } - - template - constexpr auto slice_group() - { - return slice_size_transform{[](auto input, auto s) { - auto r = return_array_c([] { - auto lens = decltype(s){}.lens.base(); - lens.back() *= N; - lens -= 1; - return decltype(input){}.lens.carry(lens) + index_int{1}; - }); - return r; - }}; - } - - template - constexpr auto slice_split() - { - return slice_size_transform{[](auto, auto s) { return s.elements() / _c; }}; - } - - template - constexpr auto slice_axes() - { - return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); }; - } - - template - constexpr auto slice_tensor(Input input, T start, Ss... ss) - { - constexpr auto inner_shape = make_slice(get_shape_c{}, ss...); - auto outer_lens = transform( - get_shape_c{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; }); - // TODO: Handle non-divisble dimensions - auto outer_shape = make_shape(outer_lens, get_shape_c{}.strides * inner_shape.lens); - auto offset = outer_shape.index(start); - MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() == - input.get_shape().elements()); - MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c{}.element_space()); - return make_tensor_view(input.data() + offset, inner_shape); - } - - template - constexpr auto slice_schedule(index idx, Ss... ss) - { - return [=](auto... xs) { - return [=](auto f) { - constexpr auto first = get_shape_c()(xs...))>{}; - constexpr auto n = nslices(first, ss...); - MIGRAPHX_ASSERT(((n == nslices(get_shape_c{}, ss...)) and ...)); - Schedule{idx}.group_stride(n, [&](auto i) { - MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n == - xs.get_shape().elements()) and - ...)); - f(slice_tensor(xs, i, ss...)...); - }); - }; - }; - } - - } // namespace migraphx - #endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP - \ No newline at end of file +#ifndef MIGRAPHX_GUARD_KERNELS_SLICE_HPP +#define MIGRAPHX_GUARD_KERNELS_SLICE_HPP + +#include +#include +#include + +namespace migraphx { + +template +constexpr auto slice_make_multi_lens(Shape, Size) +{ + return return_array_c([] { + auto n = Size{} - _c<1>; + auto i = Shape{}.multi(n); + using type = typename decltype(i)::value_type; + return i + type{1}; + }); +} + +template +constexpr auto slice_make_multi_lens(Shape, integral_const_array x) +{ + return x; +} + +template +constexpr auto make_slice(Shape, Select select) +{ + auto inner_lens = transform_i(Shape{}.lens, [=](index_int x, index_int ii) -> index_int { + if(select(x, ii, Shape{}.lens.size())) + return x; + return 1; + }); + return make_shape(inner_lens, Shape{}.strides); +} + +template +constexpr auto make_slice(Shape input, Select select, Size size) +{ + auto as = make_slice(input, select); + auto lens = slice_make_multi_lens(as, size); + return make_shape(lens, Shape{}.strides); +} + +template +struct slice_size_transform +{ + F f; + + template + constexpr auto operator()(Ts... xs) const + { + return f(xs...); + } +}; +MIGRAPHX_AUTO_DEDUCE(slice_size_transform); + +template +constexpr auto make_slice(Shape input, Select select, slice_size_transform t) +{ + auto as = make_slice(input, select); + auto lens = slice_make_multi_lens(as, decltype(t(input, as)){}); + return make_shape(lens, Shape{}.strides); +} + +template +constexpr auto nslices(Shape input, Ss... ss) +{ + auto as = make_slice(input, ss...); + return input.elements() / as.elements(); +} + +template +constexpr auto slice_group() +{ + return slice_size_transform{[](auto input, auto s) { + auto r = return_array_c([] { + auto lens = decltype(s){}.lens.base(); + lens.back() *= N; + lens -= 1; + return decltype(input){}.lens.carry(lens) + index_int{1}; + }); + return r; + }}; +} + +template +constexpr auto slice_split() +{ + return slice_size_transform{[](auto, auto s) { return s.elements() / _c; }}; +} + +template +constexpr auto slice_axes() +{ + return [](auto, auto i, auto n) { return ((Axes < 0 ? i == (n + Axes) : i == Axes) or ...); }; +} + +template +constexpr auto slice_tensor(Input input, T start, Ss... ss) +{ + constexpr auto inner_shape = make_slice(get_shape_c{}, ss...); + auto outer_lens = transform( + get_shape_c{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; }); + // TODO: Handle non-divisble dimensions + auto outer_shape = make_shape(outer_lens, get_shape_c{}.strides * inner_shape.lens); + auto offset = outer_shape.index(start); + MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() == + input.get_shape().elements()); + MIGRAPHX_ASSERT((offset + inner_shape.element_space()) <= get_shape_c{}.element_space()); + return make_tensor_view(input.data() + offset, inner_shape); +} + +template +constexpr auto slice_schedule(index idx, Ss... ss) +{ + return [=](auto... xs) { + return [=](auto f) { + constexpr auto first = get_shape_c()(xs...))>{}; + constexpr auto n = nslices(first, ss...); + MIGRAPHX_ASSERT(((n == nslices(get_shape_c{}, ss...)) and ...)); + Schedule{idx}.group_stride(n, [&](auto i) { + MIGRAPHX_ASSERT(((slice_tensor(xs, i, ss...).get_shape().elements() * n == + xs.get_shape().elements()) and + ...)); + f(slice_tensor(xs, i, ss...)...); + }); + }; + }; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_SLICE_HPP From 1389ae5182e13f83d2363cc5a2f99ffa1158bb51 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:36:27 +0000 Subject: [PATCH 07/84] Update to use slices instead --- .../migraphx/kernels/channelwise_conv.hpp | 97 +++++++++---------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index fef75d5573e..8aac289c2d7 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -27,6 +27,8 @@ #include #include +#include +#include namespace migraphx { @@ -39,7 +41,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I constexpr index_int spatial_total = SpatialLens{}.product(); constexpr index_int product_total = kernel_total * spatial_total; - constexpr auto out_spatial_lens = return_array_c([] { + constexpr auto out_spatial_lens = return_array_c([] { constexpr auto kl = KernelLens{}; constexpr auto sl = SpatialLens{}; constexpr index_int ns = array_size(KernelLens{}); @@ -50,7 +52,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I }); constexpr index_int out_spatial_total = out_spatial_lens.product(); - constexpr auto prod_lens = return_array_c([] { + constexpr auto prod_lens = return_array_c([] { constexpr auto kl = KernelLens{}; constexpr auto sl = SpatialLens{}; constexpr index_int ns = array_size(KernelLens{}); @@ -61,63 +63,58 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I result[ns + i] = sl[i]; return result; }); - constexpr auto smem_shape = make_shape(prod_lens); + constexpr auto prod_strides = calculate_strides(prod_lens); using T = typename Output::type; - __shared__ T smem[product_total]; + __shared__ uninitialized_buffer smem; - auto idx = make_index(); + auto idx = make_index(); + auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; }; - index_int C = output.get_shape().lens[1]; - auto n = idx.group / C; - auto c = idx.group % C; - - // Phase 1: elementwise multiply into shared memory - for(index_int i = idx.local; i < product_total; i += idx.nlocal()) - { - auto prod_multi = prod_lens.multi(i); - auto bcast_idx = generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { - if constexpr(d == 0) - return n; - else if constexpr(d == 1) - return c; - else - return prod_multi[d - _c<2>]; - }); - smem[i] = x[bcast_idx] * w[bcast_idx]; - } + slice_schedule(idx, keep_non_batch)(x, w, output)( + [&](auto x_ch, auto w_ch, auto out_ch) { + // Phase 1: elementwise multiply into shared memory + idx.local_stride(_c, [&](auto i) { + auto prod_multi = prod_lens.multi(i); + auto ch_idx = + generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return prod_multi[d - _c<2>]; + }); + smem[i] = x_ch[ch_idx] * w_ch[ch_idx]; + }); - __syncthreads(); + __syncthreads(); - auto smem_view = make_tensor_view(&smem[0], smem_shape); + // Phase 2: sliding window reduce from shared memory + idx.local_stride(_c, [&](auto j) { + auto out_spatial = out_spatial_lens.multi(j); + T acc = 0; + for(index_int ki = 0; ki < kernel_total; ki++) + { + auto k_multi = kernel_lens.multi(ki); + auto smem_idx = generate_array( + _c<2 * NS>, [&](auto d) -> index_int { + if constexpr(d < NS) + return k_multi[d]; + else + return out_spatial[d - _c] + k_multi[d - _c]; + }); + acc += smem[smem_idx.dot(prod_strides)]; + } - // Phase 2: sliding window reduce from shared memory - for(index_int j = idx.local; j < out_spatial_total; j += idx.nlocal()) - { - auto out_spatial = out_spatial_lens.multi(j); - T acc = 0; - for(index_int ki = 0; ki < kernel_total; ki++) - { - auto k_multi = kernel_lens.multi(ki); - auto smem_idx = generate_array(_c<2 * NS>, [&](auto d) -> index_int { - if constexpr(d < NS) - return k_multi[d]; - else - return out_spatial[d - _c] + k_multi[d - _c]; + auto out_idx = + generate_array(_c<2 + NS>, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return out_spatial[d - _c<2>]; + }); + out_ch[out_idx] = acc; }); - acc += smem_view[smem_idx]; - } - - auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { - if constexpr(d == 0) - return n; - else if constexpr(d == 1) - return c; - else - return out_spatial[d - _c<2>]; }); - output[out_idx] = acc; - } } } // namespace migraphx From 9c9b9a54ca93ffe4cb84e41979e24118c74cce02 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:36:30 +0000 Subject: [PATCH 08/84] Format --- .../migraphx/kernels/channelwise_conv.hpp | 77 +++++++++---------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 8aac289c2d7..ac50e18aeaa 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -41,7 +41,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I constexpr index_int spatial_total = SpatialLens{}.product(); constexpr index_int product_total = kernel_total * spatial_total; - constexpr auto out_spatial_lens = return_array_c([] { + constexpr auto out_spatial_lens = return_array_c([] { constexpr auto kl = KernelLens{}; constexpr auto sl = SpatialLens{}; constexpr index_int ns = array_size(KernelLens{}); @@ -52,7 +52,7 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I }); constexpr index_int out_spatial_total = out_spatial_lens.product(); - constexpr auto prod_lens = return_array_c([] { + constexpr auto prod_lens = return_array_c([] { constexpr auto kl = KernelLens{}; constexpr auto sl = SpatialLens{}; constexpr index_int ns = array_size(KernelLens{}); @@ -71,50 +71,47 @@ channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, I auto idx = make_index(); auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; }; - slice_schedule(idx, keep_non_batch)(x, w, output)( - [&](auto x_ch, auto w_ch, auto out_ch) { - // Phase 1: elementwise multiply into shared memory - idx.local_stride(_c, [&](auto i) { - auto prod_multi = prod_lens.multi(i); - auto ch_idx = - generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return prod_multi[d - _c<2>]; - }); - smem[i] = x_ch[ch_idx] * w_ch[ch_idx]; + slice_schedule(idx, + keep_non_batch)(x, w, output)([&](auto x_ch, auto w_ch, auto out_ch) { + // Phase 1: elementwise multiply into shared memory + idx.local_stride(_c, [&](auto i) { + auto prod_multi = prod_lens.multi(i); + auto ch_idx = generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return prod_multi[d - _c<2>]; }); + smem[i] = x_ch[ch_idx] * w_ch[ch_idx]; + }); - __syncthreads(); + __syncthreads(); - // Phase 2: sliding window reduce from shared memory - idx.local_stride(_c, [&](auto j) { - auto out_spatial = out_spatial_lens.multi(j); - T acc = 0; - for(index_int ki = 0; ki < kernel_total; ki++) - { - auto k_multi = kernel_lens.multi(ki); - auto smem_idx = generate_array( - _c<2 * NS>, [&](auto d) -> index_int { - if constexpr(d < NS) - return k_multi[d]; - else - return out_spatial[d - _c] + k_multi[d - _c]; - }); - acc += smem[smem_idx.dot(prod_strides)]; - } + // Phase 2: sliding window reduce from shared memory + idx.local_stride(_c, [&](auto j) { + auto out_spatial = out_spatial_lens.multi(j); + T acc = 0; + for(index_int ki = 0; ki < kernel_total; ki++) + { + auto k_multi = kernel_lens.multi(ki); + auto smem_idx = generate_array(_c<2 * NS>, [&](auto d) -> index_int { + if constexpr(d < NS) + return k_multi[d]; + else + return out_spatial[d - _c] + k_multi[d - _c]; + }); + acc += smem[smem_idx.dot(prod_strides)]; + } - auto out_idx = - generate_array(_c<2 + NS>, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return out_spatial[d - _c<2>]; - }); - out_ch[out_idx] = acc; + auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return out_spatial[d - _c<2>]; }); + out_ch[out_idx] = acc; }); + }); } } // namespace migraphx From 207e5d6d85d87fd673ac38040983e05ab5c550c6 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 11:53:43 -0600 Subject: [PATCH 09/84] Add reduce_schedule for outer batches --- src/targets/gpu/jit/channelwise_conv.cpp | 23 ++-- .../migraphx/kernels/channelwise_conv.hpp | 128 ++++++++---------- .../include/migraphx/kernels/index.hpp | 29 ++++ .../include/migraphx/kernels/reduce.hpp | 20 +-- src/targets/gpu/prefuse_ops.cpp | 64 ++------- 5 files changed, 123 insertions(+), 141 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index ad535485512..2d7ce51a853 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,22 +56,27 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } + std::vector names() const + { + return {"gpu::channelwise_conv", "channelwise_conv"}; + } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& w_s = inputs.at(1); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); - std::vector kernel_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); - std::vector spatial_sizes(x_lens.begin() + 2 + num_spatial, x_lens.end()); + auto w_lens = w_s.lens(); + std::vector kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial); + std::vector spatial_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); auto num_channels = out_s.lens()[0] * out_s.lens()[1]; std::size_t block_size = 256; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index ac50e18aeaa..2a9795682a5 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -26,90 +26,78 @@ #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP #include -#include +#include #include +#include +#include #include namespace migraphx { -template +template +__device__ void per_block_pooling_reduce(index idx, Output output, F f) +{ + constexpr auto nelements = get_shape_c{}.elements(); + idx.local_stride(nelements, [&](auto i) { + auto out_idx = get_shape_c{}.multi(i); + auto slicer = [](auto input) { return reduce_slice(input, 0); }; + auto r = reduce::lane::make(idx, slicer); + r.outer([&] { output[out_idx] = f(out_idx, r); }); + }); +} + +template __device__ void -channelwise_conv(KernelLens kernel_lens, SpatialLens, Output output, Input1 x, Input2 w) +channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) { - constexpr index_int NS = array_size(KernelLens{}); constexpr index_int kernel_total = KernelLens{}.product(); constexpr index_int spatial_total = SpatialLens{}.product(); - constexpr index_int product_total = kernel_total * spatial_total; - - constexpr auto out_spatial_lens = return_array_c([] { - constexpr auto kl = KernelLens{}; - constexpr auto sl = SpatialLens{}; - constexpr index_int ns = array_size(KernelLens{}); - array result; - for(index_int i = 0; i < ns; i++) - result[i] = sl[i] - kl[i] + 1; - return result; - }); - constexpr index_int out_spatial_total = out_spatial_lens.product(); - - constexpr auto prod_lens = return_array_c([] { - constexpr auto kl = KernelLens{}; - constexpr auto sl = SpatialLens{}; - constexpr index_int ns = array_size(KernelLens{}); - array result; - for(index_int i = 0; i < ns; i++) - result[i] = kl[i]; - for(index_int i = 0; i < ns; i++) - result[ns + i] = sl[i]; - return result; - }); - constexpr auto prod_strides = calculate_strides(prod_lens); + + constexpr index_int N = get_shape_c{}.lens[0]; + constexpr index_int C_out = get_shape_c{}.lens[1]; + constexpr index_int C_in = get_shape_c{}.lens[1]; + + constexpr auto smem_shape = make_packed_shape(make_slice(get_shape_c{}, + [](auto, auto i, auto) { return i >= 2; })); + constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c{}, + [](auto, auto i, auto) { return i >= 2; })); + + constexpr auto out_nc = make_shape(index_ints{}); + constexpr auto co_cin = make_shape(index_ints{}); + constexpr auto in_nc = make_shape(index_ints{}); using T = typename Output::type; - __shared__ uninitialized_buffer smem; - - auto idx = make_index(); - auto keep_non_batch = [](auto, auto i, auto) { return i >= 2; }; - - slice_schedule(idx, - keep_non_batch)(x, w, output)([&](auto x_ch, auto w_ch, auto out_ch) { - // Phase 1: elementwise multiply into shared memory - idx.local_stride(_c, [&](auto i) { - auto prod_multi = prod_lens.multi(i); - auto ch_idx = generate_array(_c<2 + 2 * NS>, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return prod_multi[d - _c<2>]; - }); - smem[i] = x_ch[ch_idx] * w_ch[ch_idx]; - }); + __shared__ uninitialized_buffer smem; + + auto idx = make_index(); + auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; + + slice_schedule(idx, keep_spatial)(output)([&](auto out_ch) { + auto nc_multi = out_nc.multi(idx.group); + auto n = nc_multi[0]; + auto co = nc_multi[1]; + auto c_in = co_cin.multi(co)[1]; + + auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial); + auto w_ch = slice_tensor(w, co, keep_spatial); + + // Phase 1: copy input channel into shared memory + auto smem_input = make_tensor_view(smem.data(), smem_shape); + local_tensor_copy(idx, x_ch, smem_input); + + // Phase 2: copy weights into registers + array wregs_arr; + auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape); + copy(w_ch.begin(), w_ch.end(), wregs.begin()); __syncthreads(); - // Phase 2: sliding window reduce from shared memory - idx.local_stride(_c, [&](auto j) { - auto out_spatial = out_spatial_lens.multi(j); - T acc = 0; - for(index_int ki = 0; ki < kernel_total; ki++) - { - auto k_multi = kernel_lens.multi(ki); - auto smem_idx = generate_array(_c<2 * NS>, [&](auto d) -> index_int { - if constexpr(d < NS) - return k_multi[d]; - else - return out_spatial[d - _c] + k_multi[d - _c]; - }); - acc += smem[smem_idx.dot(prod_strides)]; - } - - auto out_idx = generate_array(_c<2 + NS>, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return out_spatial[d - _c<2>]; - }); - out_ch[out_idx] = acc; + // Phase 3: sliding window multiply-reduce + per_block_pooling_reduce(idx, out_ch, [&](auto out_idx, auto r) { + return r.reduce(op::sum{}, T{0}, [&](auto ki) { + auto k_multi = wregs_shape.multi(ki); + return smem_input[out_idx + k_multi] * wregs[k_multi]; + })(reduce::make_indices(_c)); }); }); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp index 77da7283190..4b9de7ae7ce 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp @@ -267,6 +267,12 @@ struct index } } + template + __device__ void device_stride(N n, F f) const + { + for_stride(_c<0>, n, _c<1>, f); + } + template __device__ void global_stride(N n, F f) const { @@ -333,5 +339,28 @@ struct per_block } }; +struct per_device +{ + index idx; + + constexpr auto local() const { return idx.global; } + + constexpr auto nlocal() const { return idx.nglobal(); } + + constexpr auto size() const { return _c<1>; } + + template + constexpr void group_stride(N n, F f) const + { + return idx.device_stride(n, f); + } + + template + constexpr void local_stride(N n, F f) const + { + return idx.global_stride(n, f); + } +}; + } // namespace migraphx #endif // MIGRAPHX_GUARD_KERNELS_INDEX_HPP diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp index 59bf17b3eda..ac29a3174c1 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp @@ -516,12 +516,13 @@ struct block return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - idx.global_stride(nelements * idx.nlocal(), [&](auto i) { + schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -570,12 +571,13 @@ struct block_large return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - idx.global_stride(nelements * idx.nlocal(), [&](auto i) { + schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -648,12 +650,13 @@ struct subwave return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - idx.global_stride(nelements * idx.nlocal_subwave(), [&](auto i) { + schedule.local_stride(nelements * idx.nlocal_subwave(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal_subwave()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -709,12 +712,13 @@ struct lane return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - idx.global_stride(nelements, [&](auto i) { + schedule.local_stride(nelements, [&](auto i) { const auto out_idx = get_shape_c{}.multi(i); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index e3a39d05fb7..34f19869660 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -27,11 +27,9 @@ #include #include #include -#include #include #include #include -#include #ifdef MIGRAPHX_USE_COMPOSABLEKERNEL #include #endif @@ -254,16 +252,13 @@ struct channelwise_conv shape compute_shape(std::vector inputs) const { check_shapes{inputs, *this}.has(2); - auto lens = inputs.front().lens(); + auto x_lens = inputs[0].lens(); + auto w_lens = inputs[1].lens(); std::vector out_lens; - out_lens.push_back(lens[0]); - out_lens.push_back(lens[1]); + out_lens.push_back(x_lens[0]); + out_lens.push_back(w_lens[0]); for(std::size_t d = 0; d < num_spatial; ++d) - { - auto kernel_size = lens[2 + d]; - auto spatial_size = lens[2 + num_spatial + d]; - out_lens.push_back(spatial_size - kernel_size + 1); - } + out_lens.push_back(x_lens[2 + d] - w_lens[2 + d] + 1); return {inputs.front().type(), out_lens}; } }; @@ -297,51 +292,12 @@ struct find_channelwise_convolution void apply(module& m, const match::matcher_result& r) const { - auto ins = r.result; - - auto input = ins->inputs().front(); - auto weights = ins->inputs().back(); - - auto w_lens = weights->get_shape().lens(); - auto x_lens = input->get_shape().lens(); - auto ndim = ins->get_shape().ndim(); - auto num_spatial = ndim - 2; - - // Build product shape: [N, C, k_0, ..., k_{ns-1}, s_0, ..., s_{ns-1}] - std::vector prod_lens; - prod_lens.push_back(x_lens[0]); - prod_lens.push_back(w_lens[0]); - for(std::size_t d = 2; d < ndim; ++d) - prod_lens.push_back(w_lens[d]); - for(std::size_t d = 2; d < ndim; ++d) - prod_lens.push_back(x_lens[d]); - - // Unsqueeze input: [N, C_in, H, W] -> [N, C_in, 1, ..., 1, H, W] - std::vector input_unsq_axes(num_spatial); - std::iota(input_unsq_axes.begin(), input_unsq_axes.end(), 2); - auto unsq_input = - m.insert_instruction(ins, make_op("unsqueeze", {{"axes", input_unsq_axes}}), input); - - // Broadcast input to product shape - auto bcast_input = m.insert_instruction( - ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_input); - - // Squeeze weight axis 1: [C_out, 1, k_0, ...] -> [C_out, k_0, ...] - auto sq_weights = m.insert_instruction(ins, make_op("squeeze", {{"axes", {1}}}), weights); - - // Unsqueeze weight: [C_out, k_0, ...] -> [1, C_out, k_0, ..., 1, ..., 1] - std::vector w_unsq_axes; - w_unsq_axes.push_back(0); - for(std::size_t d = 0; d < num_spatial; ++d) - w_unsq_axes.push_back(static_cast(2 + num_spatial + d)); - auto unsq_weights = - m.insert_instruction(ins, make_op("unsqueeze", {{"axes", w_unsq_axes}}), sq_weights); - - // Broadcast weight to product shape - auto bcast_weights = m.insert_instruction( - ins, make_op("multibroadcast", {{"out_lens", prod_lens}}), unsq_weights); + auto ins = r.result; + auto input = ins->inputs().front(); + auto weights = ins->inputs().back(); + auto num_spatial = ins->get_shape().ndim() - 2; - m.replace_instruction(ins, channelwise_conv{num_spatial}, bcast_input, bcast_weights); + m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights); } }; From cdae8f459993bb67518b96aec2037d1fd478126b Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 11:53:46 -0600 Subject: [PATCH 10/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 22 +++++++++---------- .../migraphx/kernels/channelwise_conv.hpp | 11 +++++----- .../include/migraphx/kernels/reduce.hpp | 16 +++++++------- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 2d7ce51a853..e79116d9003 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,27 +56,25 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const - { - return {"gpu::channelwise_conv", "channelwise_conv"}; - } + std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& w_s = inputs.at(1); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& w_s = inputs.at(1); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); auto w_lens = w_s.lens(); std::vector kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial); - std::vector spatial_sizes(x_lens.begin() + 2, x_lens.begin() + 2 + num_spatial); + std::vector spatial_sizes(x_lens.begin() + 2, + x_lens.begin() + 2 + num_spatial); auto num_channels = out_s.lens()[0] * out_s.lens()[1]; std::size_t block_size = 256; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 2a9795682a5..4b3f8e2d9a7 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -47,8 +47,7 @@ __device__ void per_block_pooling_reduce(index idx, Output output, F f) } template -__device__ void -channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) +__device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) { constexpr index_int kernel_total = KernelLens{}.product(); constexpr index_int spatial_total = SpatialLens{}.product(); @@ -57,10 +56,10 @@ channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) constexpr index_int C_out = get_shape_c{}.lens[1]; constexpr index_int C_in = get_shape_c{}.lens[1]; - constexpr auto smem_shape = make_packed_shape(make_slice(get_shape_c{}, - [](auto, auto i, auto) { return i >= 2; })); - constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c{}, - [](auto, auto i, auto) { return i >= 2; })); + constexpr auto smem_shape = make_packed_shape( + make_slice(get_shape_c{}, [](auto, auto i, auto) { return i >= 2; })); + constexpr auto wregs_shape = make_packed_shape( + make_slice(get_shape_c{}, [](auto, auto i, auto) { return i >= 2; })); constexpr auto out_nc = make_shape(index_ints{}); constexpr auto co_cin = make_shape(index_ints{}); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp index ac29a3174c1..0abae0363d7 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp @@ -516,11 +516,11 @@ struct block return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); @@ -571,11 +571,11 @@ struct block_large return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); @@ -650,11 +650,11 @@ struct subwave return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); schedule.local_stride(nelements * idx.nlocal_subwave(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal_subwave()); @@ -712,11 +712,11 @@ struct lane return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; + auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); schedule.local_stride(nelements, [&](auto i) { const auto out_idx = get_shape_c{}.multi(i); From b51b82fe47261fbe1838ee63ae21c802e5500c37 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 12:08:36 -0600 Subject: [PATCH 11/84] Use pooling_reduce --- .../include/migraphx/kernels/channelwise_conv.hpp | 15 ++------------- .../kernels/include/migraphx/kernels/pooling.hpp | 6 +++--- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 4b3f8e2d9a7..fb070c6279d 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -31,21 +31,10 @@ #include #include #include +#include namespace migraphx { -template -__device__ void per_block_pooling_reduce(index idx, Output output, F f) -{ - constexpr auto nelements = get_shape_c{}.elements(); - idx.local_stride(nelements, [&](auto i) { - auto out_idx = get_shape_c{}.multi(i); - auto slicer = [](auto input) { return reduce_slice(input, 0); }; - auto r = reduce::lane::make(idx, slicer); - r.outer([&] { output[out_idx] = f(out_idx, r); }); - }); -} - template __device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) { @@ -92,7 +81,7 @@ __device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x __syncthreads(); // Phase 3: sliding window multiply-reduce - per_block_pooling_reduce(idx, out_ch, [&](auto out_idx, auto r) { + pooling_reduce(out_ch, [&](auto out_idx, auto r) { return r.reduce(op::sum{}, T{0}, [&](auto ki) { auto k_multi = wregs_shape.multi(ki); return smem_input[out_idx + k_multi] * wregs[k_multi]; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp index 76bb7c3cb6b..5a236084c47 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp @@ -186,18 +186,18 @@ constexpr window make_window(Window w, Stride s, Paddin return {w, s, p}; } -template +template __device__ void pooling_reduce(Output output, F f) { if constexpr(GroupSize < 2) { - Algo::template run( + Algo::template run( [&](auto out_idx, auto r) { r.outer([&] { output[out_idx] = f(out_idx, r); }); }); } else { auto goutput = as_vec(output, output.get_shape().lens.size() - _c<1>); - Algo::template run([&](auto out_idx, auto r) { + Algo::template run([&](auto out_idx, auto r) { auto i = out_idx; i.back() *= GroupSize; auto result = vec_generate([&](auto) { From b5f4f0f47c5fcb6062a9eb863a029b8c63724cf2 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 12:08:40 -0600 Subject: [PATCH 12/84] Format --- src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp index 5a236084c47..410deefb9be 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp @@ -186,7 +186,7 @@ constexpr window make_window(Window w, Stride s, Paddin return {w, s, p}; } -template +template __device__ void pooling_reduce(Output output, F f) { if constexpr(GroupSize < 2) From 15fd39f27bcb7fd929c2357c174fdeeefe73b15c Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 21:23:51 +0000 Subject: [PATCH 13/84] Some refactoring to use tiling --- src/targets/gpu/jit/channelwise_conv.cpp | 65 +++++--- .../migraphx/kernels/channelwise_conv.hpp | 147 +++++++++++++----- test/verify/test_channelwise_conv.cpp | 35 +++++ 3 files changed, 188 insertions(+), 59 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index e79116d9003..32aae6c2c2a 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -44,7 +44,7 @@ extern "C" { MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) { transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { - channelwise_conv(index_ints<${kernel}>{}, index_ints<${spatial}>{}, output, x, w); + channelwise_conv(index_ints<${tile}>{}, output, x, w); }); } @@ -56,34 +56,57 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } + std::vector names() const + { + return {"gpu::channelwise_conv", "channelwise_conv"}; + } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& w_s = inputs.at(1); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& w_s = inputs.at(1); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; - auto x_lens = x_s.lens(); - auto w_lens = w_s.lens(); - std::vector kernel_sizes(w_lens.begin() + 2, w_lens.begin() + 2 + num_spatial); - std::vector spatial_sizes(x_lens.begin() + 2, - x_lens.begin() + 2 + num_spatial); - - auto num_channels = out_s.lens()[0] * out_s.lens()[1]; - std::size_t block_size = 256; - - options.set_launch_params(v, num_channels * block_size, block_size); + auto x_lens = x_s.lens(); + auto w_lens = w_s.lens(); + auto out_lens = out_s.lens(); + + // Tile dimensions: for 2D use 8xH, 32xW; for 1D use 256 + std::vector tile_sizes(num_spatial); + if(num_spatial == 1) + { + tile_sizes[0] = 256; + } + else + { + tile_sizes[0] = 8; + tile_sizes[num_spatial - 1] = 32; + for(std::size_t d = 1; d + 1 < num_spatial; ++d) + tile_sizes[d] = 1; + } + + std::size_t block_size = 1; + for(auto t : tile_sizes) + block_size *= t; + + // Compute number of tiles per spatial dim: ceil(out_spatial / tile) + std::size_t num_blocks = out_lens[0] * out_lens[1]; + for(std::size_t d = 0; d < num_spatial; ++d) + { + auto out_spatial = out_lens[2 + d]; + num_blocks *= (out_spatial + tile_sizes[d] - 1) / tile_sizes[d]; + } + + options.set_launch_params(v, num_blocks * block_size, block_size); auto src = interpolate_string(channelwise_conv_kernel, - {{"kernel", to_string_range(kernel_sizes)}, - {"spatial", to_string_range(spatial_sizes)}}); + {{"tile", to_string_range(tile_sizes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index fb070c6279d..cf88dcd8ae0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -35,58 +35,129 @@ namespace migraphx { -template -__device__ void channelwise_conv(KernelLens, SpatialLens, Output output, Input x, Weights w) +template +constexpr bool in_bounds(Pos pos, Lens lens) { - constexpr index_int kernel_total = KernelLens{}.product(); - constexpr index_int spatial_total = SpatialLens{}.product(); + for(index_int d = 0; d < pos.size(); d++) + { + if(pos[d] >= lens[d]) + return false; + } + return true; +} + +template +__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) +{ + auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; constexpr index_int N = get_shape_c{}.lens[0]; constexpr index_int C_out = get_shape_c{}.lens[1]; constexpr index_int C_in = get_shape_c{}.lens[1]; - constexpr auto smem_shape = make_packed_shape( - make_slice(get_shape_c{}, [](auto, auto i, auto) { return i >= 2; })); - constexpr auto wregs_shape = make_packed_shape( - make_slice(get_shape_c{}, [](auto, auto i, auto) { return i >= 2; })); + // Derive spatial and kernel lens from input shapes (already full-rank) + constexpr auto spatial_lens = make_slice(get_shape_c{}, keep_spatial).lens; + constexpr auto kernel_lens = make_slice(get_shape_c{}, keep_spatial).lens; + constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c{}, keep_spatial)); + + constexpr index_int kernel_total = kernel_lens.product(); constexpr auto out_nc = make_shape(index_ints{}); constexpr auto co_cin = make_shape(index_ints{}); constexpr auto in_nc = make_shape(index_ints{}); + // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix + constexpr auto tile_lens = return_array_c([] { + constexpr auto sl = decltype(spatial_lens){}; + constexpr auto tl = TileLens{}; + constexpr index_int nd = sl.size(); + constexpr index_int ns = array_size(TileLens{}); + array result; + result[0] = 1; + result[1] = 1; + for(index_int i = 0; i < ns; i++) + result[2 + i] = tl[i]; + return result; + }); + constexpr auto halo_lens = transform(tile_lens, kernel_lens, + [](auto t, auto k) { return t + k - 1; }); + constexpr auto out_spatial_lens = transform(spatial_lens, kernel_lens, + [](auto s, auto k) { return s - k + 1; }); + constexpr auto tiles_per_dim = transform(out_spatial_lens, tile_lens, + [](auto o, auto t) { return (o + t - 1) / t; }); + + constexpr auto tile_shape = make_shape(tile_lens); + constexpr auto halo_shape = make_shape(halo_lens); + constexpr index_int halo_total = halo_lens.product(); + constexpr index_int tile_total = tile_lens.product(); + + // Block shape: [N, C_out, tiles_h, tiles_w] + constexpr auto block_lens = return_array_c([] { + constexpr auto tpd = decltype(tiles_per_dim){}; + constexpr index_int nd = tpd.size(); + array result; + for(index_int i = 0; i < nd; i++) + result[i] = tpd[i]; + result[0] = N; + result[1] = C_out; + return result; + }); + constexpr auto block_shape = make_shape(block_lens); + using T = typename Output::type; - __shared__ uninitialized_buffer smem; + __shared__ uninitialized_buffer smem; - auto idx = make_index(); - auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; + auto idx = make_index(); + + // Decompose block index + auto block_multi = block_shape.multi(idx.group); + auto n = block_multi[0]; + auto co = block_multi[1]; + auto c_in = co_cin.multi(co)[1]; + + auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial); + auto w_ch = slice_tensor(w, co, keep_spatial); + auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial); + + // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW] + constexpr index_int NDIM = spatial_lens.size(); + auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tile_lens[d]; + }); + + // Phase 1: load halo tile into shared memory with bounds checking + auto smem_view = make_tensor_view(smem.data(), halo_shape); + idx.local_stride(_c, [&](auto i) { + auto halo_multi = halo_shape.multi(index_int{i}); + auto src_pos = tile_origin + halo_multi; + smem.data()[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; + }); + + // Phase 2: copy weights into registers + array wregs_arr; + auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape); + copy(w_ch.begin(), w_ch.end(), wregs.begin()); + + __syncthreads(); + + // Phase 3: compute output tile with bounds checking + idx.local_stride(_c, [&](auto j) { + auto tile_multi = tile_shape.multi(index_int{j}); + auto out_pos = tile_origin + tile_multi; + if(not in_bounds(out_pos, out_spatial_lens)) + return; + + T acc = 0; + for(index_int ki = 0; ki < kernel_total; ki++) + { + auto k_multi = wregs_shape.multi(ki); + acc += smem_view[tile_multi + k_multi] * wregs[k_multi]; + } - slice_schedule(idx, keep_spatial)(output)([&](auto out_ch) { - auto nc_multi = out_nc.multi(idx.group); - auto n = nc_multi[0]; - auto co = nc_multi[1]; - auto c_in = co_cin.multi(co)[1]; - - auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial); - auto w_ch = slice_tensor(w, co, keep_spatial); - - // Phase 1: copy input channel into shared memory - auto smem_input = make_tensor_view(smem.data(), smem_shape); - local_tensor_copy(idx, x_ch, smem_input); - - // Phase 2: copy weights into registers - array wregs_arr; - auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape); - copy(w_ch.begin(), w_ch.end(), wregs.begin()); - - __syncthreads(); - - // Phase 3: sliding window multiply-reduce - pooling_reduce(out_ch, [&](auto out_idx, auto r) { - return r.reduce(op::sum{}, T{0}, [&](auto ki) { - auto k_multi = wregs_shape.multi(ki); - return smem_input[out_idx + k_multi] * wregs[k_multi]; - })(reduce::make_indices(_c)); - }); + out_ch[out_pos] = acc; }); } diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp index e3483480d8e..91731d1d2f2 100644 --- a/test/verify/test_channelwise_conv.cpp +++ b/test/verify/test_channelwise_conv.cpp @@ -100,3 +100,38 @@ struct test_channelwise_conv_1d : verify_program }; template struct test_channelwise_conv_1d; template struct test_channelwise_conv_1d; + +template +struct test_channelwise_conv_large : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_large; +template struct test_channelwise_conv_large; + +template +struct test_channelwise_conv_non_divisible + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_non_divisible; +template struct test_channelwise_conv_non_divisible; From b61daa34fcb0a465770137034d9728ff1cbfa70d Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 21:23:56 +0000 Subject: [PATCH 14/84] FOrmat --- src/targets/gpu/jit/channelwise_conv.cpp | 25 +++++++--------- .../migraphx/kernels/channelwise_conv.hpp | 29 ++++++++++--------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 32aae6c2c2a..5d27c6038a1 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,21 +56,18 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const - { - return {"gpu::channelwise_conv", "channelwise_conv"}; - } + std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& w_s = inputs.at(1); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& x_s = inputs.at(0); + const auto& w_s = inputs.at(1); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto x_lens = x_s.lens(); @@ -85,7 +82,7 @@ struct channelwise_conv_compiler : compiler } else { - tile_sizes[0] = 8; + tile_sizes[0] = 8; tile_sizes[num_spatial - 1] = 32; for(std::size_t d = 1; d + 1 < num_spatial; ++d) tile_sizes[d] = 1; @@ -105,8 +102,8 @@ struct channelwise_conv_compiler : compiler options.set_launch_params(v, num_blocks * block_size, block_size); - auto src = interpolate_string(channelwise_conv_kernel, - {{"tile", to_string_range(tile_sizes)}}); + auto src = + interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index cf88dcd8ae0..3c5715b6433 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -58,7 +58,8 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) // Derive spatial and kernel lens from input shapes (already full-rank) constexpr auto spatial_lens = make_slice(get_shape_c{}, keep_spatial).lens; constexpr auto kernel_lens = make_slice(get_shape_c{}, keep_spatial).lens; - constexpr auto wregs_shape = make_packed_shape(make_slice(get_shape_c{}, keep_spatial)); + constexpr auto wregs_shape = + make_packed_shape(make_slice(get_shape_c{}, keep_spatial)); constexpr index_int kernel_total = kernel_lens.product(); @@ -67,11 +68,11 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) constexpr auto in_nc = make_shape(index_ints{}); // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix - constexpr auto tile_lens = return_array_c([] { - constexpr auto sl = decltype(spatial_lens){}; - constexpr auto tl = TileLens{}; - constexpr index_int nd = sl.size(); - constexpr index_int ns = array_size(TileLens{}); + constexpr auto tile_lens = return_array_c([] { + constexpr auto sl = decltype(spatial_lens){}; + constexpr auto tl = TileLens{}; + constexpr index_int nd = sl.size(); + constexpr index_int ns = array_size(TileLens{}); array result; result[0] = 1; result[1] = 1; @@ -79,12 +80,12 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) result[2 + i] = tl[i]; return result; }); - constexpr auto halo_lens = transform(tile_lens, kernel_lens, - [](auto t, auto k) { return t + k - 1; }); - constexpr auto out_spatial_lens = transform(spatial_lens, kernel_lens, - [](auto s, auto k) { return s - k + 1; }); - constexpr auto tiles_per_dim = transform(out_spatial_lens, tile_lens, - [](auto o, auto t) { return (o + t - 1) / t; }); + constexpr auto halo_lens = + transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; }); + constexpr auto out_spatial_lens = + transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; }); + constexpr auto tiles_per_dim = + transform(out_spatial_lens, tile_lens, [](auto o, auto t) { return (o + t - 1) / t; }); constexpr auto tile_shape = make_shape(tile_lens); constexpr auto halo_shape = make_shape(halo_lens); @@ -92,7 +93,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) constexpr index_int tile_total = tile_lens.product(); // Block shape: [N, C_out, tiles_h, tiles_w] - constexpr auto block_lens = return_array_c([] { + constexpr auto block_lens = return_array_c([] { constexpr auto tpd = decltype(tiles_per_dim){}; constexpr index_int nd = tpd.size(); array result; @@ -121,7 +122,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW] constexpr index_int NDIM = spatial_lens.size(); - auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + auto tile_origin = generate_array(_c, [&](auto d) -> index_int { if constexpr(d < 2) return 0; else From c9d258f69ab2f32d00faa2aa1b4c872550f00a69 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 22:06:25 +0000 Subject: [PATCH 15/84] Access directly --- .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 3c5715b6433..e36fd52d352 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -134,7 +134,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; - smem.data()[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; + smem[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; }); // Phase 2: copy weights into registers From 6d979f5ecdd34d1b6ead3e2572624c6f4d81be9e Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 22:06:34 +0000 Subject: [PATCH 16/84] Format --- .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index e36fd52d352..c6c7f82b51a 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -134,7 +134,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; - smem[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; + smem[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; }); // Phase 2: copy weights into registers From ecbce52bcdf701f09f95bb1d8705e8ed2ef391ee Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 16:23:54 -0600 Subject: [PATCH 17/84] Add join --- .../gpu/kernels/include/migraphx/kernels/array.hpp | 12 ++++++++++++ .../include/migraphx/kernels/channelwise_conv.hpp | 14 ++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp index 9c2684f90ac..10270d20c2c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp @@ -452,6 +452,18 @@ constexpr auto transform(integral_const_array, integral_const_array{}; } +template +constexpr auto join(integral_const_array, integral_const_array) +{ + return integral_const_array{}; +} + +template +constexpr auto join(integral_const_array, integral_const_array, Arrays...) +{ + return join(integral_const_array{}, Arrays{}...); +} + template constexpr auto return_array_c(F f) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index c6c7f82b51a..ecdec9466e1 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -68,18 +68,8 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) constexpr auto in_nc = make_shape(index_ints{}); // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix - constexpr auto tile_lens = return_array_c([] { - constexpr auto sl = decltype(spatial_lens){}; - constexpr auto tl = TileLens{}; - constexpr index_int nd = sl.size(); - constexpr index_int ns = array_size(TileLens{}); - array result; - result[0] = 1; - result[1] = 1; - for(index_int i = 0; i < ns; i++) - result[2 + i] = tl[i]; - return result; - }); + constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{}); + constexpr auto halo_lens = transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; }); constexpr auto out_spatial_lens = From 4bd655655782f477477a7cd9606341d306afcde2 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 17:31:17 -0600 Subject: [PATCH 18/84] Update tuning --- src/targets/gpu/jit/channelwise_conv.cpp | 54 ++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 5d27c6038a1..58cf4532edb 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -82,8 +82,8 @@ struct channelwise_conv_compiler : compiler } else { - tile_sizes[0] = 8; - tile_sizes[num_spatial - 1] = 32; + tile_sizes[0] = v.get("tile_h", 8); + tile_sizes[num_spatial - 1] = v.get("tile_w", 32); for(std::size_t d = 1; d + 1 < num_spatial; ++d) tile_sizes[d] = 1; } @@ -108,9 +108,55 @@ struct channelwise_conv_compiler : compiler return compile_hip_code_object(ctx, src, options); } - compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const { - return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + auto v = op.to_value(); + for(const auto& x : solution) + v.insert(x); + return compile_op(ctx, to_shapes(ins->inputs()), v); + } + + optional get_tuning_config(const context& ctx, + instruction_ref ins, + const operation& op, + bool exhaustive) const + { + tuning_config tc; + auto shapes = to_shapes(ins->inputs()); + tc.problem = to_value(shapes); + if(exhaustive) + { + std::vector sizes; + for(auto i:range(1, 64)) + sizes.push_back(i*4); + for(auto tile_h:sizes) + { + for(auto tile_w:sizes) + { + auto block_size = tile_h * tile_w; + if(block_size > 1024) + continue; + if(block_size < ctx.get_current_device().get_wavefront_size()) + continue; + if((block_size % ctx.get_current_device().get_wavefront_size()) != 0) + continue; + tc.solutions.push_back({{"tile_h", tile_h}, {"tile_w", tile_w}}); + } + } + } + else + { + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}}); + tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}}); + tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}}); + tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}}); + // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}}); + tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}}); + + // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}}); + // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}}); + } + return tc; } }; From d1da33357292ce8cb35a794281c57e1a38e73164 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 16 Feb 2026 17:31:20 -0600 Subject: [PATCH 19/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 58cf4532edb..76071ba81a3 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -108,9 +108,10 @@ struct channelwise_conv_compiler : compiler return compile_hip_code_object(ctx, src, options); } - compiler_replace compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const + compiler_replace + compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const { - auto v = op.to_value(); + auto v = op.to_value(); for(const auto& x : solution) v.insert(x); return compile_op(ctx, to_shapes(ins->inputs()), v); @@ -122,16 +123,16 @@ struct channelwise_conv_compiler : compiler bool exhaustive) const { tuning_config tc; - auto shapes = to_shapes(ins->inputs()); - tc.problem = to_value(shapes); + auto shapes = to_shapes(ins->inputs()); + tc.problem = to_value(shapes); if(exhaustive) { std::vector sizes; - for(auto i:range(1, 64)) - sizes.push_back(i*4); - for(auto tile_h:sizes) + for(auto i : range(1, 64)) + sizes.push_back(i * 4); + for(auto tile_h : sizes) { - for(auto tile_w:sizes) + for(auto tile_w : sizes) { auto block_size = tile_h * tile_w; if(block_size > 1024) From 9cc6906b90812f0a529fd2ee188b05f4b1cf2e9c Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 14:33:02 +0000 Subject: [PATCH 20/84] Add multi-output --- src/targets/gpu/jit/channelwise_conv.cpp | 71 +++++++++++-------- .../migraphx/kernels/channelwise_conv.hpp | 39 +++++----- 2 files changed, 61 insertions(+), 49 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 76071ba81a3..e95efbea4e3 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -44,7 +44,7 @@ extern "C" { MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) { transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { - channelwise_conv(index_ints<${tile}>{}, output, x, w); + channelwise_conv(index_ints<${tile}>{}, index_ints<${output_tile}>{}, output, x, w); }); } @@ -56,54 +56,61 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } + std::vector names() const + { + return {"gpu::channelwise_conv", "channelwise_conv"}; + } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& x_s = inputs.at(0); - const auto& w_s = inputs.at(1); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; - auto x_lens = x_s.lens(); - auto w_lens = w_s.lens(); auto out_lens = out_s.lens(); - // Tile dimensions: for 2D use 8xH, 32xW; for 1D use 256 + // Thread block tile dimensions std::vector tile_sizes(num_spatial); if(num_spatial == 1) { - tile_sizes[0] = 256; + tile_sizes[0] = v.get("tile_w", std::size_t{256}); } else { - tile_sizes[0] = v.get("tile_h", 8); - tile_sizes[num_spatial - 1] = v.get("tile_w", 32); + tile_sizes[0] = v.get("tile_h", std::size_t{8}); + tile_sizes[num_spatial - 1] = v.get("tile_w", std::size_t{32}); for(std::size_t d = 1; d + 1 < num_spatial; ++d) tile_sizes[d] = 1; } + // Outputs per thread along W (last spatial dim) + auto outputs_per_thread = v.get("outputs_per_thread", std::size_t{4}); + + // Output tile = thread tile with last dim scaled by outputs_per_thread + std::vector output_tile_sizes = tile_sizes; + output_tile_sizes.back() *= outputs_per_thread; + std::size_t block_size = 1; for(auto t : tile_sizes) block_size *= t; - // Compute number of tiles per spatial dim: ceil(out_spatial / tile) + // Blocks: N * C_out * prod(ceil(out_spatial / output_tile)) std::size_t num_blocks = out_lens[0] * out_lens[1]; for(std::size_t d = 0; d < num_spatial; ++d) { auto out_spatial = out_lens[2 + d]; - num_blocks *= (out_spatial + tile_sizes[d] - 1) / tile_sizes[d]; + num_blocks *= (out_spatial + output_tile_sizes[d] - 1) / output_tile_sizes[d]; } options.set_launch_params(v, num_blocks * block_size, block_size); - auto src = - interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}}); + auto src = interpolate_string(channelwise_conv_kernel, + {{"tile", to_string_range(tile_sizes)}, + {"output_tile", to_string_range(output_tile_sizes)}}); return compile_hip_code_object(ctx, src, options); } @@ -141,21 +148,27 @@ struct channelwise_conv_compiler : compiler continue; if((block_size % ctx.get_current_device().get_wavefront_size()) != 0) continue; - tc.solutions.push_back({{"tile_h", tile_h}, {"tile_w", tile_w}}); + for(auto opt : {1, 2, 4, 8}) + tc.solutions.push_back( + {{"tile_h", tile_h}, {"tile_w", tile_w}, {"outputs_per_thread", opt}}); } } } else { - tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}}); - tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}}); - tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}}); - tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}}); - // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}}); - tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}}); - - // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}}); - // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); + // for(auto opt : {1, 2}) + // { + // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + // tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + // tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + // tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}}); + // // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}}); + // tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}}); + + // // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + // // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}}); + // } } return tc; } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index ecdec9466e1..1e8d2bb0093 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -46,8 +46,10 @@ constexpr bool in_bounds(Pos pos, Lens lens) return true; } -template -__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) +// TileLens: thread block tile (determines block_size = product) +// OutputLens: output region per block (>= TileLens, multiple along W) +template +__device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, Weights w) { auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; @@ -55,7 +57,6 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) constexpr index_int C_out = get_shape_c{}.lens[1]; constexpr index_int C_in = get_shape_c{}.lens[1]; - // Derive spatial and kernel lens from input shapes (already full-rank) constexpr auto spatial_lens = make_slice(get_shape_c{}, keep_spatial).lens; constexpr auto kernel_lens = make_slice(get_shape_c{}, keep_spatial).lens; constexpr auto wregs_shape = @@ -67,23 +68,22 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) constexpr auto co_cin = make_shape(index_ints{}); constexpr auto in_nc = make_shape(index_ints{}); - // All full-rank (2+NS)-dim with [1, 1, ...] batch/channel prefix - constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{}); - + // Full-rank output region per block + constexpr auto output_lens = join(index_ints<1, 1>{}, OutputLens{}); constexpr auto halo_lens = - transform(tile_lens, kernel_lens, [](auto t, auto k) { return t + k - 1; }); + transform(output_lens, kernel_lens, [](auto o, auto k) { return o + k - 1; }); constexpr auto out_spatial_lens = transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; }); constexpr auto tiles_per_dim = - transform(out_spatial_lens, tile_lens, [](auto o, auto t) { return (o + t - 1) / t; }); + transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); - constexpr auto tile_shape = make_shape(tile_lens); - constexpr auto halo_shape = make_shape(halo_lens); - constexpr index_int halo_total = halo_lens.product(); - constexpr index_int tile_total = tile_lens.product(); + constexpr auto output_shape = make_shape(output_lens); + constexpr auto halo_shape = make_shape(halo_lens); + constexpr index_int halo_total = halo_lens.product(); + constexpr index_int output_total = output_lens.product(); // Block shape: [N, C_out, tiles_h, tiles_w] - constexpr auto block_lens = return_array_c([] { + constexpr auto block_lens = return_array_c([] { constexpr auto tpd = decltype(tiles_per_dim){}; constexpr index_int nd = tpd.size(); array result; @@ -110,13 +110,12 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) auto w_ch = slice_tensor(w, co, keep_spatial); auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial); - // Tile origin: [0, 0, tile_row * TileH, tile_col * TileW] constexpr index_int NDIM = spatial_lens.size(); auto tile_origin = generate_array(_c, [&](auto d) -> index_int { if constexpr(d < 2) return 0; else - return block_multi[d] * tile_lens[d]; + return block_multi[d] * output_lens[d]; }); // Phase 1: load halo tile into shared memory with bounds checking @@ -134,10 +133,10 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) __syncthreads(); - // Phase 3: compute output tile with bounds checking - idx.local_stride(_c, [&](auto j) { - auto tile_multi = tile_shape.multi(index_int{j}); - auto out_pos = tile_origin + tile_multi; + // Phase 3: compute output region (each thread handles output_total / block_size elements) + idx.local_stride(_c, [&](auto j) { + auto out_multi = output_shape.multi(index_int{j}); + auto out_pos = tile_origin + out_multi; if(not in_bounds(out_pos, out_spatial_lens)) return; @@ -145,7 +144,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) for(index_int ki = 0; ki < kernel_total; ki++) { auto k_multi = wregs_shape.multi(ki); - acc += smem_view[tile_multi + k_multi] * wregs[k_multi]; + acc += smem_view[out_multi + k_multi] * wregs[k_multi]; } out_ch[out_pos] = acc; From 0942c87c093c3ec48c184a13372d9fca12bbbef1 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 14:33:07 +0000 Subject: [PATCH 21/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 37 ++++++++++--------- .../migraphx/kernels/channelwise_conv.hpp | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index e95efbea4e3..c16cb3422ee 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -56,19 +56,16 @@ MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) struct channelwise_conv_compiler : compiler { - std::vector names() const - { - return {"gpu::channelwise_conv", "channelwise_conv"}; - } + std::vector names() const { return {"gpu::channelwise_conv", "channelwise_conv"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - auto num_spatial = v.at("num_spatial").to(); - const auto& out_s = inputs.back(); - options.inputs = inputs; - options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + auto num_spatial = v.at("num_spatial").to(); + const auto& out_s = inputs.back(); + options.inputs = inputs; + options.output = out_s; + options.kernel_name = "channelwise_conv_kernel"; options.virtual_inputs = inputs; auto out_lens = out_s.lens(); @@ -159,15 +156,19 @@ struct channelwise_conv_compiler : compiler tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); // for(auto opt : {1, 2}) // { - // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - // tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - // tc.solutions.push_back({{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - // tc.solutions.push_back({{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}}); - // // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}}); - // tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}}); - - // // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - // // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}}); + // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", + // opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, + // {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w", + // 32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24}, + // {"tile_w", 16}, {"outputs_per_thread", opt}}); + // // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", + // opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, + // {"outputs_per_thread", opt}}); + + // // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", + // opt}}); + // // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", + // opt}}); // } } return tc; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 1e8d2bb0093..1b5641fbc62 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -83,7 +83,7 @@ __device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, W constexpr index_int output_total = output_lens.product(); // Block shape: [N, C_out, tiles_h, tiles_w] - constexpr auto block_lens = return_array_c([] { + constexpr auto block_lens = return_array_c([] { constexpr auto tpd = decltype(tiles_per_dim){}; constexpr index_int nd = tpd.size(); array result; From ca147d2c921f42dce453f238f4403aac308bfccc Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:34:53 +0000 Subject: [PATCH 22/84] Add spatial tiler --- src/targets/gpu/jit/channelwise_conv.cpp | 4 +- .../migraphx/kernels/channelwise_conv.hpp | 125 ++---------- .../migraphx/kernels/spatial_tiler.hpp | 178 ++++++++++++++++++ 3 files changed, 197 insertions(+), 110 deletions(-) create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index c16cb3422ee..8290ffe6114 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -44,7 +44,7 @@ extern "C" { MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) { transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { - channelwise_conv(index_ints<${tile}>{}, index_ints<${output_tile}>{}, output, x, w); + channelwise_conv, ${ntiles}>(index_ints<${tile}>{}, output, x, w); }); } @@ -107,7 +107,7 @@ struct channelwise_conv_compiler : compiler auto src = interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}, - {"output_tile", to_string_range(output_tile_sizes)}}); + {"ntiles", std::to_string(outputs_per_thread)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 1b5641fbc62..ecb39860789 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -25,128 +25,37 @@ #ifndef MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP #define MIGRAPHX_GUARD_KERNELS_CHANNELWISE_CONV_HPP -#include +#include #include -#include #include -#include -#include -#include namespace migraphx { -template -constexpr bool in_bounds(Pos pos, Lens lens) +template +__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) { - for(index_int d = 0; d < pos.size(); d++) - { - if(pos[d] >= lens[d]) - return false; - } - return true; -} - -// TileLens: thread block tile (determines block_size = product) -// OutputLens: output region per block (>= TileLens, multiple along W) -template -__device__ void channelwise_conv(TileLens, OutputLens, Output output, Input x, Weights w) -{ - auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; - - constexpr index_int N = get_shape_c{}.lens[0]; - constexpr index_int C_out = get_shape_c{}.lens[1]; - constexpr index_int C_in = get_shape_c{}.lens[1]; - - constexpr auto spatial_lens = make_slice(get_shape_c{}, keep_spatial).lens; - constexpr auto kernel_lens = make_slice(get_shape_c{}, keep_spatial).lens; - constexpr auto wregs_shape = - make_packed_shape(make_slice(get_shape_c{}, keep_spatial)); + auto idx = make_index(); + auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}); - constexpr index_int kernel_total = kernel_lens.product(); - - constexpr auto out_nc = make_shape(index_ints{}); - constexpr auto co_cin = make_shape(index_ints{}); - constexpr auto in_nc = make_shape(index_ints{}); - - // Full-rank output region per block - constexpr auto output_lens = join(index_ints<1, 1>{}, OutputLens{}); - constexpr auto halo_lens = - transform(output_lens, kernel_lens, [](auto o, auto k) { return o + k - 1; }); - constexpr auto out_spatial_lens = - transform(spatial_lens, kernel_lens, [](auto s, auto k) { return s - k + 1; }); - constexpr auto tiles_per_dim = - transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); - - constexpr auto output_shape = make_shape(output_lens); - constexpr auto halo_shape = make_shape(halo_lens); - constexpr index_int halo_total = halo_lens.product(); - constexpr index_int output_total = output_lens.product(); - - // Block shape: [N, C_out, tiles_h, tiles_w] - constexpr auto block_lens = return_array_c([] { - constexpr auto tpd = decltype(tiles_per_dim){}; - constexpr index_int nd = tpd.size(); - array result; - for(index_int i = 0; i < nd; i++) - result[i] = tpd[i]; - result[0] = N; - result[1] = C_out; - return result; - }); - constexpr auto block_shape = make_shape(block_lens); + __shared__ decltype(tiler.template shared_allocate()) smem; - using T = typename Output::type; - __shared__ uninitialized_buffer smem; + auto x_ch = tiler.copy(x, smem); + auto w_ch = tiler.slice(w); + auto out_ch = tiler.slice(output); - auto idx = make_index(); - - // Decompose block index - auto block_multi = block_shape.multi(idx.group); - auto n = block_multi[0]; - auto co = block_multi[1]; - auto c_in = co_cin.multi(co)[1]; - - auto x_ch = slice_tensor(x, in_nc.index(make_array(n, c_in)), keep_spatial); - auto w_ch = slice_tensor(w, co, keep_spatial); - auto out_ch = slice_tensor(output, out_nc.index(make_array(n, co)), keep_spatial); - - constexpr index_int NDIM = spatial_lens.size(); - auto tile_origin = generate_array(_c, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return block_multi[d] * output_lens[d]; - }); - - // Phase 1: load halo tile into shared memory with bounds checking - auto smem_view = make_tensor_view(smem.data(), halo_shape); - idx.local_stride(_c, [&](auto i) { - auto halo_multi = halo_shape.multi(index_int{i}); - auto src_pos = tile_origin + halo_multi; - smem[i] = in_bounds(src_pos, spatial_lens) ? T{x_ch[src_pos]} : T{0}; - }); - - // Phase 2: copy weights into registers - array wregs_arr; - auto wregs = make_tensor_view(wregs_arr.begin(), wregs_shape); + using T = typename Output::type; + array wregs_arr; + auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape())); copy(w_ch.begin(), w_ch.end(), wregs.begin()); __syncthreads(); - // Phase 3: compute output region (each thread handles output_total / block_size elements) - idx.local_stride(_c, [&](auto j) { - auto out_multi = output_shape.multi(index_int{j}); - auto out_pos = tile_origin + out_multi; - if(not in_bounds(out_pos, out_spatial_lens)) - return; - + tiler.for_each([&](auto out_pos, auto out_multi) { T acc = 0; - for(index_int ki = 0; ki < kernel_total; ki++) - { - auto k_multi = wregs_shape.multi(ki); - acc += smem_view[out_multi + k_multi] * wregs[k_multi]; - } - + repeat(wregs.get_shape().elements(), [&](auto ki) { + auto k_multi = wregs.get_shape().multi(ki); + acc += x_ch[out_multi + k_multi] * wregs[k_multi]; + }); out_ch[out_pos] = acc; }); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp new file mode 100644 index 00000000000..132f29ac347 --- /dev/null +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -0,0 +1,178 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#ifndef MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP +#define MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP + +#include +#include +#include +#include +#include + +namespace migraphx { + +template +constexpr bool in_bounds(Pos pos, Lens lens) +{ + for(index_int d = 0; d < pos.size(); d++) + { + if(pos[d] >= lens[d]) + return false; + } + return true; +} + +template +struct spatial_tiler +{ + static constexpr auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; + + // Full-rank tile lens: [1, 1, TileH, TileW] + static constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{}); + + // Output region per block: tile with last dim scaled by NTiles + static constexpr auto output_lens = return_array_c([] { + auto result = decltype(tile_lens){}; + constexpr auto nd = result.size(); + array r; + for(index_int i = 0; i < nd; i++) + r[i] = result[i]; + r[nd - 1] *= NTiles; + return r; + }); + + static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens; + + static constexpr auto tiles_per_dim = transform( + out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); + + static constexpr auto block_lens = return_array_c([] { + constexpr auto tpd = decltype(tiles_per_dim){}; + constexpr index_int nd = tpd.size(); + constexpr auto olens = OutputShape{}.lens; + array result; + for(index_int i = 0; i < nd; i++) + result[i] = tpd[i]; + result[0] = olens[0]; + result[1] = olens[1]; + return result; + }); + static constexpr auto block_shape = make_shape(block_lens); + + static constexpr auto output_shape = make_shape(output_lens); + static constexpr index_int output_total = output_lens.product(); + static constexpr index_int tiles_total = tiles_per_dim.product(); + static constexpr index_int NDIM = out_spatial_lens.size(); + + index idx; + array tile_origin; + + // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial) + template + static constexpr auto halo_lens_for() + { + constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens; + constexpr auto halo_extra = transform( + input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; }); + return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; }); + } + + // Type for shared memory allocation + template + __device__ auto shared_allocate() const + { + using T = typename Input::type; + constexpr auto hl = halo_lens_for>(); + constexpr index_int halo_total_v = hl.product(); + return uninitialized_buffer{}; + } + + // Slice a tensor to per-channel spatial view + template + __device__ auto slice(Tensor t) const + { + constexpr auto n_ch = nslices(get_shape_c{}, keep_spatial); + return slice_tensor(t, (idx.group / tiles_total) % index_int{n_ch}, keep_spatial); + } + + // Copy input halo tile into shared memory, return tensor_view over smem + template + __device__ auto copy(Input input, Smem& smem) const + { + using T = typename Input::type; + constexpr auto hl = halo_lens_for>(); + constexpr auto halo_shape = make_shape(hl); + constexpr index_int halo_total_v = hl.product(); + constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial).lens; + + constexpr auto n_out = nslices(OutputShape{}, keep_spatial); + constexpr auto n_in = nslices(get_shape_c{}, keep_spatial); + constexpr auto groups = n_out / n_in; + auto channel_idx = idx.group / tiles_total; + auto input_ch = slice_tensor( + input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial); + + idx.local_stride(_c, [&](auto i) { + auto halo_multi = halo_shape.multi(index_int{i}); + auto src_pos = tile_origin + halo_multi; + smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0}; + }); + + return make_tensor_view(smem.data(), halo_shape); + } + + // Iterate over output tile positions with bounds checking + template + __device__ void for_each(F f) const + { + idx.local_stride(_c, [&](auto j) { + auto out_multi = output_shape.multi(index_int{j}); + auto out_pos = tile_origin + out_multi; + if(not in_bounds(out_pos, out_spatial_lens)) + return; + f(out_pos, out_multi); + }); + } +}; + +template +__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) +{ + using tiler_type = spatial_tiler; + + auto block_multi = tiler_type::block_shape.multi(idx.group); + auto tile_origin = + generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tiler_type::output_lens[d]; + }); + + return tiler_type{idx, tile_origin}; +} + +} // namespace migraphx +#endif // MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP From 3b17a09e31bdef122125344247360791d97940a7 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:34:56 +0000 Subject: [PATCH 23/84] Format --- .../migraphx/kernels/channelwise_conv.hpp | 2 +- .../migraphx/kernels/spatial_tiler.hpp | 37 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index ecb39860789..fadf92159c0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -43,7 +43,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) auto w_ch = tiler.slice(w); auto out_ch = tiler.slice(output); - using T = typename Output::type; + using T = typename Output::type; array wregs_arr; auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape())); copy(w_ch.begin(), w_ch.end(), wregs.begin()); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 132f29ac347..1bd43e2e8c0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -54,7 +54,7 @@ struct spatial_tiler // Output region per block: tile with last dim scaled by NTiles static constexpr auto output_lens = return_array_c([] { - auto result = decltype(tile_lens){}; + auto result = decltype(tile_lens){}; constexpr auto nd = result.size(); array r; for(index_int i = 0; i < nd; i++) @@ -65,10 +65,10 @@ struct spatial_tiler static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens; - static constexpr auto tiles_per_dim = transform( - out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); + static constexpr auto tiles_per_dim = + transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); - static constexpr auto block_lens = return_array_c([] { + static constexpr auto block_lens = return_array_c([] { constexpr auto tpd = decltype(tiles_per_dim){}; constexpr index_int nd = tpd.size(); constexpr auto olens = OutputShape{}.lens; @@ -81,10 +81,10 @@ struct spatial_tiler }); static constexpr auto block_shape = make_shape(block_lens); - static constexpr auto output_shape = make_shape(output_lens); - static constexpr index_int output_total = output_lens.product(); - static constexpr index_int tiles_total = tiles_per_dim.product(); - static constexpr index_int NDIM = out_spatial_lens.size(); + static constexpr auto output_shape = make_shape(output_lens); + static constexpr index_int output_total = output_lens.product(); + static constexpr index_int tiles_total = tiles_per_dim.product(); + static constexpr index_int NDIM = out_spatial_lens.size(); index idx; array tile_origin; @@ -94,8 +94,8 @@ struct spatial_tiler static constexpr auto halo_lens_for() { constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens; - constexpr auto halo_extra = transform( - input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; }); + constexpr auto halo_extra = + transform(input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; }); return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; }); } @@ -131,8 +131,8 @@ struct spatial_tiler constexpr auto n_in = nslices(get_shape_c{}, keep_spatial); constexpr auto groups = n_out / n_in; auto channel_idx = idx.group / tiles_total; - auto input_ch = slice_tensor( - input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial); + auto input_ch = + slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial); idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); @@ -163,13 +163,12 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) using tiler_type = spatial_tiler; auto block_multi = tiler_type::block_shape.multi(idx.group); - auto tile_origin = - generate_array(_c, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return block_multi[d] * tiler_type::output_lens[d]; - }); + auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tiler_type::output_lens[d]; + }); return tiler_type{idx, tile_origin}; } From 037d10f2e5e5a6666ef0fa997e6338a5e2be9de3 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:45:42 +0000 Subject: [PATCH 24/84] Avoid bounds check when there is no padding --- .../migraphx/kernels/spatial_tiler.hpp | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 1bd43e2e8c0..366e80aa4d2 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -86,6 +86,18 @@ struct spatial_tiler static constexpr index_int tiles_total = tiles_per_dim.product(); static constexpr index_int NDIM = out_spatial_lens.size(); + static constexpr bool is_padded = [] { + return (out_spatial_lens != tiles_per_dim * output_lens); + // constexpr auto osl = decltype(out_spatial_lens){}; + // constexpr auto tpd = decltype(tiles_per_dim){}; + // constexpr auto ol = decltype(output_lens){}; + // constexpr index_int nd = osl.size(); + // for(index_int i = 0; i < nd; i++) + // if(tpd[i] * ol[i] != osl[i]) + // return true; + // return false; + }(); + index idx; array tile_origin; @@ -137,7 +149,10 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; - smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0}; + if constexpr(is_padded) + smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0}; + else + smem[i] = input_ch[src_pos]; }); return make_tensor_view(smem.data(), halo_shape); @@ -150,8 +165,11 @@ struct spatial_tiler idx.local_stride(_c, [&](auto j) { auto out_multi = output_shape.multi(index_int{j}); auto out_pos = tile_origin + out_multi; - if(not in_bounds(out_pos, out_spatial_lens)) - return; + if constexpr(is_padded) + { + if(not in_bounds(out_pos, out_spatial_lens)) + return; + } f(out_pos, out_multi); }); } From 7bc6d7842c5c0ddbe01d83100b5bc9477bb917cf Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:47:39 +0000 Subject: [PATCH 25/84] Remove lines --- .../kernels/include/migraphx/kernels/spatial_tiler.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 366e80aa4d2..0db1f1847da 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -88,14 +88,6 @@ struct spatial_tiler static constexpr bool is_padded = [] { return (out_spatial_lens != tiles_per_dim * output_lens); - // constexpr auto osl = decltype(out_spatial_lens){}; - // constexpr auto tpd = decltype(tiles_per_dim){}; - // constexpr auto ol = decltype(output_lens){}; - // constexpr index_int nd = osl.size(); - // for(index_int i = 0; i < nd; i++) - // if(tpd[i] * ol[i] != osl[i]) - // return true; - // return false; }(); index idx; From e3077b8cb8ecbcd92000e86796c341ba5eaf9b7b Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 22:46:21 +0000 Subject: [PATCH 26/84] Use functions instead of variables --- src/targets/gpu/jit/channelwise_conv.cpp | 3 +- .../migraphx/kernels/spatial_tiler.hpp | 137 ++++++++++-------- 2 files changed, 78 insertions(+), 62 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 8290ffe6114..e2ff0c31ade 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -153,7 +153,8 @@ struct channelwise_conv_compiler : compiler } else { - tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); + // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}}); // for(auto opt : {1, 2}) // { // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 0db1f1847da..7f3d081cf29 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -47,60 +47,74 @@ constexpr bool in_bounds(Pos pos, Lens lens) template struct spatial_tiler { - static constexpr auto keep_spatial = [](auto, auto i, auto) { return i >= 2; }; + static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; } // Full-rank tile lens: [1, 1, TileH, TileW] - static constexpr auto tile_lens = join(index_ints<1, 1>{}, TileLens{}); + static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); } // Output region per block: tile with last dim scaled by NTiles - static constexpr auto output_lens = return_array_c([] { - auto result = decltype(tile_lens){}; - constexpr auto nd = result.size(); - array r; - for(index_int i = 0; i < nd; i++) - r[i] = result[i]; - r[nd - 1] *= NTiles; - return r; - }); - - static constexpr auto out_spatial_lens = make_slice(OutputShape{}, keep_spatial).lens; - - static constexpr auto tiles_per_dim = - transform(out_spatial_lens, output_lens, [](auto o, auto t) { return (o + t - 1) / t; }); - - static constexpr auto block_lens = return_array_c([] { - constexpr auto tpd = decltype(tiles_per_dim){}; - constexpr index_int nd = tpd.size(); - constexpr auto olens = OutputShape{}.lens; - array result; - for(index_int i = 0; i < nd; i++) - result[i] = tpd[i]; - result[0] = olens[0]; - result[1] = olens[1]; - return result; - }); - static constexpr auto block_shape = make_shape(block_lens); - - static constexpr auto output_shape = make_shape(output_lens); - static constexpr index_int output_total = output_lens.product(); - static constexpr index_int tiles_total = tiles_per_dim.product(); - static constexpr index_int NDIM = out_spatial_lens.size(); - - static constexpr bool is_padded = [] { - return (out_spatial_lens != tiles_per_dim * output_lens); - }(); + static constexpr auto output_lens() + { + return return_array_c([] { + auto result = decltype(tile_lens()){}; + constexpr auto nd = result.size(); + array r; + for(index_int i = 0; i < nd; i++) + r[i] = result[i]; + r[nd - 1] *= NTiles; + return r; + }); + } + + static constexpr auto out_spatial_lens() + { + return make_slice(OutputShape{}, keep_spatial()).lens; + } + + static constexpr auto tiles_per_dim() + { + return transform( + out_spatial_lens(), output_lens(), [](auto o, auto t) { return (o + t - 1) / t; }); + } + + static constexpr auto block_lens() + { + return return_array_c([] { + constexpr auto tpd = decltype(tiles_per_dim()){}; + constexpr index_int nd = tpd.size(); + constexpr auto olens = OutputShape{}.lens; + array result; + for(index_int i = 0; i < nd; i++) + result[i] = tpd[i]; + result[0] = olens[0]; + result[1] = olens[1]; + return result; + }); + } + + static constexpr auto block_shape() { return make_shape(block_lens()); } + + static constexpr auto output_shape() { return make_shape(output_lens()); } + static constexpr index_int output_total() { return output_lens().product(); } + static constexpr index_int tiles_total() { return tiles_per_dim().product(); } + static constexpr index_int NDIM() { return out_spatial_lens().size(); } + + static constexpr bool is_padded() + { + return (out_spatial_lens() != tiles_per_dim() * output_lens()); + } index idx; - array tile_origin; + array tile_origin; // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial) template static constexpr auto halo_lens_for() { - constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial).lens; + constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; constexpr auto halo_extra = - transform(input_spatial, out_spatial_lens, [](auto is, auto os) { return is - os; }); - return transform(output_lens, halo_extra, [](auto o, auto h) { return o + h; }); + transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); + return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } // Type for shared memory allocation @@ -117,8 +131,8 @@ struct spatial_tiler template __device__ auto slice(Tensor t) const { - constexpr auto n_ch = nslices(get_shape_c{}, keep_spatial); - return slice_tensor(t, (idx.group / tiles_total) % index_int{n_ch}, keep_spatial); + constexpr auto n_ch = nslices(get_shape_c{}, keep_spatial()); + return slice_tensor(t, (idx.group / tiles_total()) % index_int{n_ch}, keep_spatial()); } // Copy input halo tile into shared memory, return tensor_view over smem @@ -129,19 +143,19 @@ struct spatial_tiler constexpr auto hl = halo_lens_for>(); constexpr auto halo_shape = make_shape(hl); constexpr index_int halo_total_v = hl.product(); - constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial).lens; + constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial()).lens; - constexpr auto n_out = nslices(OutputShape{}, keep_spatial); - constexpr auto n_in = nslices(get_shape_c{}, keep_spatial); + constexpr auto n_out = nslices(OutputShape{}, keep_spatial()); + constexpr auto n_in = nslices(get_shape_c{}, keep_spatial()); constexpr auto groups = n_out / n_in; - auto channel_idx = idx.group / tiles_total; + auto channel_idx = idx.group / tiles_total(); auto input_ch = - slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial); + slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial()); idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; - if constexpr(is_padded) + if constexpr(is_padded()) smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0}; else smem[i] = input_ch[src_pos]; @@ -154,12 +168,12 @@ struct spatial_tiler template __device__ void for_each(F f) const { - idx.local_stride(_c, [&](auto j) { - auto out_multi = output_shape.multi(index_int{j}); + idx.local_stride(_c, [&](auto j) { + auto out_multi = output_shape().multi(index_int{j}); auto out_pos = tile_origin + out_multi; - if constexpr(is_padded) + if constexpr(is_padded()) { - if(not in_bounds(out_pos, out_spatial_lens)) + if(not in_bounds(out_pos, out_spatial_lens())) return; } f(out_pos, out_multi); @@ -172,13 +186,14 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) { using tiler_type = spatial_tiler; - auto block_multi = tiler_type::block_shape.multi(idx.group); - auto tile_origin = generate_array(_c, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return block_multi[d] * tiler_type::output_lens[d]; - }); + auto block_multi = tiler_type::block_shape().multi(idx.group); + auto tile_origin = + generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tiler_type::output_lens()[d]; + }); return tiler_type{idx, tile_origin}; } From 414aab469ccf5ce40632c0186e0f83c32c3e5625 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 22:46:25 +0000 Subject: [PATCH 27/84] Format --- .../migraphx/kernels/spatial_tiler.hpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 7f3d081cf29..043d391c835 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -47,7 +47,10 @@ constexpr bool in_bounds(Pos pos, Lens lens) template struct spatial_tiler { - static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; } + static constexpr auto keep_spatial() + { + return [](auto, auto i, auto) { return i >= 2; }; + } // Full-rank tile lens: [1, 1, TileH, TileW] static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); } @@ -149,8 +152,8 @@ struct spatial_tiler constexpr auto n_in = nslices(get_shape_c{}, keep_spatial()); constexpr auto groups = n_out / n_in; auto channel_idx = idx.group / tiles_total(); - auto input_ch = - slice_tensor(input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial()); + auto input_ch = slice_tensor( + input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial()); idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); @@ -187,13 +190,12 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) using tiler_type = spatial_tiler; auto block_multi = tiler_type::block_shape().multi(idx.group); - auto tile_origin = - generate_array(_c, [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return block_multi[d] * tiler_type::output_lens()[d]; - }); + auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tiler_type::output_lens()[d]; + }); return tiler_type{idx, tile_origin}; } From e56c4f16b02bd3b487112bb723e843626d7e9d4c Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 22:59:15 +0000 Subject: [PATCH 28/84] Inine methods --- src/targets/gpu/jit/channelwise_conv.cpp | 34 +++++++------- .../migraphx/kernels/spatial_tiler.hpp | 46 +++++++------------ 2 files changed, 33 insertions(+), 47 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index e2ff0c31ade..383b812e671 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -154,23 +154,23 @@ struct channelwise_conv_compiler : compiler else { // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); - tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}}); - // for(auto opt : {1, 2}) - // { - // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", - // opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, - // {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w", - // 32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24}, - // {"tile_w", 16}, {"outputs_per_thread", opt}}); - // // tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", - // opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, - // {"outputs_per_thread", opt}}); - - // // tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", - // opt}}); - // // tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", - // opt}}); - // } + // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}}); + for(auto opt : {1, 2}) + { + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", + opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, + {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w", + 32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24}, + {"tile_w", 16}, {"outputs_per_thread", opt}}); + tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", + opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, + {"outputs_per_thread", opt}}); + + tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", + opt}}); + tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", + opt}}); + } } return tc; } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 043d391c835..08e44df231e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -47,19 +47,13 @@ constexpr bool in_bounds(Pos pos, Lens lens) template struct spatial_tiler { - static constexpr auto keep_spatial() - { - return [](auto, auto i, auto) { return i >= 2; }; - } - - // Full-rank tile lens: [1, 1, TileH, TileW] - static constexpr auto tile_lens() { return join(index_ints<1, 1>{}, TileLens{}); } + static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; } // Output region per block: tile with last dim scaled by NTiles static constexpr auto output_lens() { return return_array_c([] { - auto result = decltype(tile_lens()){}; + auto result = join(index_ints<1, 1>{}, TileLens{}); constexpr auto nd = result.size(); array r; for(index_int i = 0; i < nd; i++) @@ -80,25 +74,6 @@ struct spatial_tiler out_spatial_lens(), output_lens(), [](auto o, auto t) { return (o + t - 1) / t; }); } - static constexpr auto block_lens() - { - return return_array_c([] { - constexpr auto tpd = decltype(tiles_per_dim()){}; - constexpr index_int nd = tpd.size(); - constexpr auto olens = OutputShape{}.lens; - array result; - for(index_int i = 0; i < nd; i++) - result[i] = tpd[i]; - result[0] = olens[0]; - result[1] = olens[1]; - return result; - }); - } - - static constexpr auto block_shape() { return make_shape(block_lens()); } - - static constexpr auto output_shape() { return make_shape(output_lens()); } - static constexpr index_int output_total() { return output_lens().product(); } static constexpr index_int tiles_total() { return tiles_per_dim().product(); } static constexpr index_int NDIM() { return out_spatial_lens().size(); } @@ -171,8 +146,8 @@ struct spatial_tiler template __device__ void for_each(F f) const { - idx.local_stride(_c, [&](auto j) { - auto out_multi = output_shape().multi(index_int{j}); + idx.local_stride(_c, [&](auto j) { + auto out_multi = make_shape(output_lens()).multi(index_int{j}); auto out_pos = tile_origin + out_multi; if constexpr(is_padded()) { @@ -189,7 +164,18 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) { using tiler_type = spatial_tiler; - auto block_multi = tiler_type::block_shape().multi(idx.group); + constexpr auto block_shape = make_shape(return_array_c([] { + constexpr auto tpd = decltype(tiler_type::tiles_per_dim()){}; + constexpr index_int nd = tpd.size(); + constexpr auto olens = OutputShape{}.lens; + array result; + for(index_int i = 0; i < nd; i++) + result[i] = tpd[i]; + result[0] = olens[0]; + result[1] = olens[1]; + return result; + })); + auto block_multi = block_shape.multi(idx.group); auto tile_origin = generate_array(_c, [&](auto d) -> index_int { if constexpr(d < 2) return 0; From b51c74f46d7a26ae31dbe53b10918f878a02919b Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 22:59:18 +0000 Subject: [PATCH 29/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 30 +++++++++++-------- .../migraphx/kernels/spatial_tiler.hpp | 7 +++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 383b812e671..b0eb76cb76e 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -157,19 +157,23 @@ struct channelwise_conv_compiler : compiler // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}}); for(auto opt : {1, 2}) { - tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", - opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, - {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 12}, {"tile_w", - 32}, {"outputs_per_thread", opt}}); tc.solutions.push_back({{"tile_h", 24}, - {"tile_w", 16}, {"outputs_per_thread", opt}}); - tc.solutions.push_back({{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", - opt}}); tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 4}, - {"outputs_per_thread", opt}}); - - tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", - opt}}); - tc.solutions.push_back({{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", - opt}}); + tc.solutions.push_back( + {{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}}); + + tc.solutions.push_back( + {{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}}); + tc.solutions.push_back( + {{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}}); } } return tc; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 08e44df231e..df4c7c22fbc 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -47,7 +47,10 @@ constexpr bool in_bounds(Pos pos, Lens lens) template struct spatial_tiler { - static constexpr auto keep_spatial() { return [](auto, auto i, auto) { return i >= 2; }; } + static constexpr auto keep_spatial() + { + return [](auto, auto i, auto) { return i >= 2; }; + } // Output region per block: tile with last dim scaled by NTiles static constexpr auto output_lens() @@ -175,7 +178,7 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) result[1] = olens[1]; return result; })); - auto block_multi = block_shape.multi(idx.group); + auto block_multi = block_shape.multi(idx.group); auto tile_origin = generate_array(_c, [&](auto d) -> index_int { if constexpr(d < 2) return 0; From 3d4bfe448ad3b43a43df5a7d833a76e6dd5c2c9b Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:45:46 -0600 Subject: [PATCH 30/84] Update quick tuning list --- src/targets/gpu/jit/channelwise_conv.cpp | 41 +++++++----------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index b0eb76cb76e..a1e2939c722 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -84,12 +84,12 @@ struct channelwise_conv_compiler : compiler tile_sizes[d] = 1; } - // Outputs per thread along W (last spatial dim) - auto outputs_per_thread = v.get("outputs_per_thread", std::size_t{4}); + // Outputs per lane along W (last spatial dim) + auto noutputs = v.get("noutputs", std::size_t{4}); - // Output tile = thread tile with last dim scaled by outputs_per_thread + // Output tile = lane tile with last dim scaled by noutputs std::vector output_tile_sizes = tile_sizes; - output_tile_sizes.back() *= outputs_per_thread; + output_tile_sizes.back() *= noutputs; std::size_t block_size = 1; for(auto t : tile_sizes) @@ -107,7 +107,7 @@ struct channelwise_conv_compiler : compiler auto src = interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}, - {"ntiles", std::to_string(outputs_per_thread)}}); + {"ntiles", std::to_string(noutputs)}}); return compile_hip_code_object(ctx, src, options); } @@ -123,7 +123,7 @@ struct channelwise_conv_compiler : compiler optional get_tuning_config(const context& ctx, instruction_ref ins, - const operation& op, + const operation&, bool exhaustive) const { tuning_config tc; @@ -147,34 +147,17 @@ struct channelwise_conv_compiler : compiler continue; for(auto opt : {1, 2, 4, 8}) tc.solutions.push_back( - {{"tile_h", tile_h}, {"tile_w", tile_w}, {"outputs_per_thread", opt}}); + {{"tile_h", tile_h}, {"tile_w", tile_w}, {"noutputs", opt}}); } } } else { - // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", 1}}); - // tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"outputs_per_thread", 4}}); - for(auto opt : {1, 2}) - { - tc.solutions.push_back( - {{"tile_h", 8}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 32}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 12}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 24}, {"tile_w", 16}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 20}, {"tile_w", 8}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 32}, {"tile_w", 4}, {"outputs_per_thread", opt}}); - - tc.solutions.push_back( - {{"tile_h", 16}, {"tile_w", 32}, {"outputs_per_thread", opt}}); - tc.solutions.push_back( - {{"tile_h", 64}, {"tile_w", 16}, {"outputs_per_thread", opt}}); - } + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}}); + tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}}); + tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}}); } return tc; } From a362a19e879debe0e88d529f42633188db6bb534 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Feb 2026 17:45:50 -0600 Subject: [PATCH 31/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index a1e2939c722..f8b6d3ef190 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -105,9 +105,9 @@ struct channelwise_conv_compiler : compiler options.set_launch_params(v, num_blocks * block_size, block_size); - auto src = interpolate_string(channelwise_conv_kernel, - {{"tile", to_string_range(tile_sizes)}, - {"ntiles", std::to_string(noutputs)}}); + auto src = interpolate_string( + channelwise_conv_kernel, + {{"tile", to_string_range(tile_sizes)}, {"ntiles", std::to_string(noutputs)}}); return compile_hip_code_object(ctx, src, options); } @@ -157,7 +157,7 @@ struct channelwise_conv_compiler : compiler tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}}); tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}}); tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}}); - tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}}); + tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}}); } return tc; } From 208c7ada24ac964bba6a9162e12c9005511d53cd Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 18 Feb 2026 12:38:28 -0600 Subject: [PATCH 32/84] Add another config --- src/targets/gpu/jit/channelwise_conv.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index f8b6d3ef190..d41a1e4d13d 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -158,6 +158,7 @@ struct channelwise_conv_compiler : compiler tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}}); tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}}); tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}}); + tc.solutions.push_back({{"tile_h", 56}, {"tile_w", 4}, {"noutputs", 1}}); } return tc; } From f2daa29d77310d393b38b2b680dc0ff121d6b6c8 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 18 Feb 2026 17:33:48 -0600 Subject: [PATCH 33/84] Add more configs --- src/targets/gpu/jit/channelwise_conv.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index d41a1e4d13d..cfe905a40f6 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -154,11 +154,21 @@ struct channelwise_conv_compiler : compiler else { tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}}); - tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}}); + + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 8}, {"noutputs", 8}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 16}, {"noutputs", 2}}); tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}}); + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 8}}); + tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 8}, {"noutputs", 4}}); + tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 16}, {"noutputs", 2}}); tc.solutions.push_back({{"tile_h", 16}, {"tile_w", 64}, {"noutputs", 4}}); + tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 16}, {"noutputs", 8}}); + tc.solutions.push_back({{"tile_h", 32}, {"tile_w", 32}, {"noutputs", 1}}); + tc.solutions.push_back({{"tile_h", 40}, {"tile_w", 12}, {"noutputs", 1}}); tc.solutions.push_back({{"tile_h", 48}, {"tile_w", 16}, {"noutputs", 1}}); tc.solutions.push_back({{"tile_h", 56}, {"tile_w", 4}, {"noutputs", 1}}); + tc.solutions.push_back({{"tile_h", 76}, {"tile_w", 8}, {"noutputs", 8}}); + tc.solutions.push_back({{"tile_h", 128}, {"tile_w", 8}, {"noutputs", 8}}); } return tc; } From 36110cf5e735ac1ba34a6eebdd9df2e6dc0b8729 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 18 Feb 2026 17:33:53 -0600 Subject: [PATCH 34/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index cfe905a40f6..bc84ceabf78 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -154,7 +154,7 @@ struct channelwise_conv_compiler : compiler else { tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 32}, {"noutputs", 1}}); - + tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 8}, {"noutputs", 8}}); tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 16}, {"noutputs", 2}}); tc.solutions.push_back({{"tile_h", 8}, {"tile_w", 64}, {"noutputs", 4}}); From 882fe3b685e1d88b1146cf57e0324773fb29db50 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 13:33:19 -0600 Subject: [PATCH 35/84] Add pointwise fusion --- src/targets/gpu/fuse_ops.cpp | 34 ++++++ src/targets/gpu/jit/channelwise_conv.cpp | 29 ++++- .../migraphx/kernels/channelwise_conv.hpp | 13 ++- test/gpu/fuse_ops.cpp | 108 +++++++++++++++++- test/verify/test_channelwise_conv_add.cpp | 50 ++++++++ .../verify/test_channelwise_conv_add_relu.cpp | 51 +++++++++ test/verify/test_channelwise_conv_relu.cpp | 47 ++++++++ 7 files changed, 323 insertions(+), 9 deletions(-) create mode 100644 test/verify/test_channelwise_conv_add.cpp create mode 100644 test/verify/test_channelwise_conv_add_relu.cpp create mode 100644 test/verify/test_channelwise_conv_relu.cpp diff --git a/src/targets/gpu/fuse_ops.cpp b/src/targets/gpu/fuse_ops.cpp index ac925269fbc..e102588c901 100644 --- a/src/targets/gpu/fuse_ops.cpp +++ b/src/targets/gpu/fuse_ops.cpp @@ -983,6 +983,39 @@ struct find_layernorm_pointwise } }; +struct find_channelwise_conv_pointwise +{ + auto matcher() const + { + return precompile_name("pointwise")( + match::not_tuple(), + match::arg(0)( + precompile_name("gpu::channelwise_conv").bind("channelwise_conv"))); + } + + void apply(module& m, const match::matcher_result& r) const + { + auto pw_ins = r.result; + auto channelwise_ins = r.instructions["channelwise_conv"]; + if(not channelwise_ins->module_inputs().empty()) + return; + auto* pm = pw_ins->module_inputs().front(); + auto pw_inputs = pw_ins->inputs(); + auto cw_pos = std::find(pw_inputs.begin(), pw_inputs.end(), channelwise_ins); + assert(cw_pos != pw_inputs.end()); + pw_inputs.erase(cw_pos); + auto inputs = channelwise_ins->inputs(); + inputs.pop_back(); + inputs.insert(inputs.end(), pw_inputs.begin(), pw_inputs.end()); + + auto cw_op_val = channelwise_ins->get_operator().to_value(); + cw_op_val["output_shape"] = to_value(pw_ins->get_shape()); + + m.replace_instruction( + pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm}); + } +}; + struct find_concat_pointwise { auto matcher() const @@ -1032,6 +1065,7 @@ void fuse_ops::apply(module& m) const #endif match::find_matches(m, find_layernorm_pointwise{}, + find_channelwise_conv_pointwise{}, find_concat_pointwise{}, find_contiguous_transpose_rocblas_gemm{}, #if MIGRAPHX_USE_HIPBLASLT diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index bc84ceabf78..e308dc1bab9 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -25,26 +25,32 @@ #include #include #include +#include namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { +using namespace migraphx::gpu::gen; // NOLINT + // NOLINTNEXTLINE static const char* const channelwise_conv_kernel = R"__migraphx__( #include #include #include +#include #include namespace migraphx { +${preamble} + extern "C" { -MIGRAPHX_GLOBAL void channelwise_conv_kernel(void* x_p, void* w_p, void* y_p) +MIGRAPHX_GLOBAL void ${kernel}(${params}) { - transform_args(make_tensors(), rotate_last())(x_p, w_p, y_p)([](auto output, auto x, auto w) { - channelwise_conv, ${ntiles}>(index_ints<${tile}>{}, output, x, w); + transform_args(make_tensors(), rotate_last())(${args})([](auto output, auto x, auto w, auto... inputs) { + channelwise_conv, ${ntiles}>(index_ints<${tile}>{}, ${post}, output, x, w, inputs...); }); } @@ -65,7 +71,7 @@ struct channelwise_conv_compiler : compiler const auto& out_s = inputs.back(); options.inputs = inputs; options.output = out_s; - options.kernel_name = "channelwise_conv_kernel"; + options.kernel_name = v.get("kernel", std::string{"channelwise_conv_kernel"}); options.virtual_inputs = inputs; auto out_lens = out_s.lens(); @@ -107,7 +113,13 @@ struct channelwise_conv_compiler : compiler auto src = interpolate_string( channelwise_conv_kernel, - {{"tile", to_string_range(tile_sizes)}, {"ntiles", std::to_string(noutputs)}}); + {{"tile", to_string_range(tile_sizes)}, + {"ntiles", std::to_string(noutputs)}, + {"kernel", options.kernel_name}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"post", v.get("post", std::string{"op::id{}"})}, + {"preamble", v.get("preamble", std::string{})}}); return compile_hip_code_object(ctx, src, options); } @@ -118,6 +130,13 @@ struct channelwise_conv_compiler : compiler auto v = op.to_value(); for(const auto& x : solution) v.insert(x); + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + v["preamble"] = generate_pointwise(*pm, "post_channelwise_conv"); + v["post"] = "MIGRAPHX_LIFT(post_channelwise_conv)"; + v["kernel"] = "channelwise_conv_" + generate_name_from_ops(*pm) + "_kernel"; + } return compile_op(ctx, to_shapes(ins->inputs()), v); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index fadf92159c0..103d8c074cf 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -31,8 +31,15 @@ namespace migraphx { -template -__device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) +template +__device__ void +channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs) { auto idx = make_index(); auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}); @@ -56,7 +63,7 @@ __device__ void channelwise_conv(TileLens, Output output, Input x, Weights w) auto k_multi = wregs.get_shape().multi(ki); acc += x_ch[out_multi + k_multi] * wregs[k_multi]; }); - out_ch[out_pos] = acc; + out_ch[out_pos] = f(acc, tiler.slice(inputs)[out_pos]...); }); } diff --git a/test/gpu/fuse_ops.cpp b/test/gpu/fuse_ops.cpp index 9867515e6a8..4377afdafb0 100644 --- a/test/gpu/fuse_ops.cpp +++ b/test/gpu/fuse_ops.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -421,4 +421,110 @@ TEST_CASE(concat_pointwise_contiguous) EXPECT(p1 == p2); } +TEST_CASE(channelwise_conv_pointwise) +{ + migraphx::shape sx{migraphx::shape::float_type, {2, 4, 8, 8}}; + migraphx::shape sw{migraphx::shape::float_type, {4, 1, 3, 3}}; + migraphx::shape sout{migraphx::shape::float_type, {2, 4, 6, 6}}; + + auto create_program = [=](bool first_arg_conv) { + migraphx::program p; + auto* mm = p.get_main_module(); + auto x = mm->add_parameter("x", sx); + auto w = mm->add_parameter("w", sw); + auto z = mm->add_parameter("z", sout); + auto alloc = migraphx::make_op("allocate", {{"shape", to_value(sout)}}); + auto alloc_ins = mm->add_instruction(alloc); + auto conv_ins = + mm->add_instruction(make_precompile_op("gpu::channelwise_conv"), x, w, alloc_ins); + std::vector pw_inputs = {conv_ins, z}; + if(not first_arg_conv) + { + pw_inputs = {z, conv_ins}; + } + auto* pw_add = + create_pointwise_module(p, "main:pointwise0", pw_inputs, single_pointwise("add")); + auto alloc_ins2 = mm->add_instruction(alloc); + pw_inputs.push_back(alloc_ins2); + auto add_ins = + mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add}); + mm->add_return({add_ins}); + return p; + }; + + auto create_fused_program = [=]() { + migraphx::program p; + auto* mm = p.get_main_module(); + auto x = mm->add_parameter("x", sx); + auto w = mm->add_parameter("w", sw); + auto z = mm->add_parameter("z", sout); + auto alloc = migraphx::make_op("allocate", {{"shape", to_value(sout)}}); + auto alloc_ins = mm->add_instruction(alloc); + auto* pw_add = + create_pointwise_module(p, "main:pointwise0", {x, z}, single_pointwise("add")); + auto conv_op = migraphx::make_op("gpu::channelwise_conv"); + auto pre_comp_op = migraphx::make_op( + "gpu::precompile_op", + {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}}); + auto fused_ins = + mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add}); + mm->add_return({fused_ins}); + return p; + }; + + { + migraphx::program p1 = create_program(true); + run_pass(p1); + migraphx::program p2 = create_fused_program(); + EXPECT(p1 == p2); + } + { + // conv is not arg(0), should not fuse + migraphx::program p1 = create_program(false); + run_pass(p1); + EXPECT(p1 == create_program(false)); + } +} + +TEST_CASE(channelwise_conv_pointwise_already_fused) +{ + migraphx::shape sx{migraphx::shape::float_type, {2, 4, 8, 8}}; + migraphx::shape sw{migraphx::shape::float_type, {4, 1, 3, 3}}; + migraphx::shape sout{migraphx::shape::float_type, {2, 4, 6, 6}}; + + auto create_program = [=]() { + migraphx::program p; + auto* mm = p.get_main_module(); + auto x = mm->add_parameter("x", sx); + auto w = mm->add_parameter("w", sw); + auto z = mm->add_parameter("z", sout); + auto y = mm->add_parameter("y", sout); + auto alloc = migraphx::make_op("allocate", {{"shape", to_value(sout)}}); + auto alloc_ins = mm->add_instruction(alloc); + // channelwise_conv already has a module (already fused) + auto* pw_relu = + create_pointwise_module(p, "main:pointwise0", {x}, [](auto* pm, const auto& inputs) { + return pm->add_instruction(migraphx::make_op("relu"), inputs[0]); + }); + auto conv_op = migraphx::make_op("gpu::channelwise_conv"); + auto pre_comp_op = migraphx::make_op( + "gpu::precompile_op", + {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}}); + auto conv_ins = + mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu}); + auto* pw_add = + create_pointwise_module(p, "main:pointwise1", {conv_ins, y}, single_pointwise("add")); + auto alloc_ins2 = mm->add_instruction(alloc); + auto add_ins = mm->add_instruction( + make_precompile_op("pointwise"), {conv_ins, y, alloc_ins2}, {pw_add}); + mm->add_return({add_ins}); + return p; + }; + + // Should not fuse since channelwise_conv already has a module + migraphx::program p1 = create_program(); + run_pass(p1); + EXPECT(p1 == create_program()); +} + int main(int argc, const char* argv[]) { test::run(argc, argv); } diff --git a/test/verify/test_channelwise_conv_add.cpp b/test/verify/test_channelwise_conv_add.cpp new file mode 100644 index 00000000000..7354a4616cb --- /dev/null +++ b/test/verify/test_channelwise_conv_add.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_add : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + auto bias = mm->add_parameter("b", migraphx::shape{DType, {4}}); + auto conv = + mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights); + auto bcast_bias = mm->add_instruction( + migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {2, 4, 6, 6}}}), bias); + mm->add_instruction(migraphx::make_op("add"), conv, bcast_bias); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_add; +template struct test_channelwise_conv_add; diff --git a/test/verify/test_channelwise_conv_add_relu.cpp b/test/verify/test_channelwise_conv_add_relu.cpp new file mode 100644 index 00000000000..1665b9b5b63 --- /dev/null +++ b/test/verify/test_channelwise_conv_add_relu.cpp @@ -0,0 +1,51 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_add_relu : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); + auto bias = mm->add_parameter("b", migraphx::shape{DType, {8}}); + auto conv = + mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); + auto bcast_bias = mm->add_instruction( + migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 8, 10, 10}}}), bias); + auto add = mm->add_instruction(migraphx::make_op("add"), conv, bcast_bias); + mm->add_instruction(migraphx::make_op("relu"), add); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_add_relu; +template struct test_channelwise_conv_add_relu; diff --git a/test/verify/test_channelwise_conv_relu.cpp b/test/verify/test_channelwise_conv_relu.cpp new file mode 100644 index 00000000000..ac1510e89f5 --- /dev/null +++ b/test/verify/test_channelwise_conv_relu.cpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_relu : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + auto conv = + mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights); + mm->add_instruction(migraphx::make_op("relu"), conv); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_relu; +template struct test_channelwise_conv_relu; From 24a2645701f9c14adcf0e2b7cf57eec97cc1c636 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 13:33:22 -0600 Subject: [PATCH 36/84] Format --- src/targets/gpu/fuse_ops.cpp | 6 ++---- src/targets/gpu/jit/channelwise_conv.cpp | 17 ++++++++--------- .../migraphx/kernels/channelwise_conv.hpp | 3 +-- test/gpu/fuse_ops.cpp | 9 +++------ 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/targets/gpu/fuse_ops.cpp b/src/targets/gpu/fuse_ops.cpp index e102588c901..d2e9503a993 100644 --- a/src/targets/gpu/fuse_ops.cpp +++ b/src/targets/gpu/fuse_ops.cpp @@ -989,8 +989,7 @@ struct find_channelwise_conv_pointwise { return precompile_name("pointwise")( match::not_tuple(), - match::arg(0)( - precompile_name("gpu::channelwise_conv").bind("channelwise_conv"))); + match::arg(0)(precompile_name("gpu::channelwise_conv").bind("channelwise_conv"))); } void apply(module& m, const match::matcher_result& r) const @@ -1011,8 +1010,7 @@ struct find_channelwise_conv_pointwise auto cw_op_val = channelwise_ins->get_operator().to_value(); cw_op_val["output_shape"] = to_value(pw_ins->get_shape()); - m.replace_instruction( - pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm}); + m.replace_instruction(pw_ins, make_op(channelwise_ins->name(), cw_op_val), inputs, {pm}); } }; diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index e308dc1bab9..608c33bd63f 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -111,15 +111,14 @@ struct channelwise_conv_compiler : compiler options.set_launch_params(v, num_blocks * block_size, block_size); - auto src = interpolate_string( - channelwise_conv_kernel, - {{"tile", to_string_range(tile_sizes)}, - {"ntiles", std::to_string(noutputs)}, - {"kernel", options.kernel_name}, - {"params", enum_params(inputs.size(), "void * private_p")}, - {"args", enum_params(inputs.size(), "private_p")}, - {"post", v.get("post", std::string{"op::id{}"})}, - {"preamble", v.get("preamble", std::string{})}}); + auto src = interpolate_string(channelwise_conv_kernel, + {{"tile", to_string_range(tile_sizes)}, + {"ntiles", std::to_string(noutputs)}, + {"kernel", options.kernel_name}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"post", v.get("post", std::string{"op::id{}"})}, + {"preamble", v.get("preamble", std::string{})}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 103d8c074cf..f7be9bf6a66 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -38,8 +38,7 @@ template -__device__ void -channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs) +__device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs) { auto idx = make_index(); auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}); diff --git a/test/gpu/fuse_ops.cpp b/test/gpu/fuse_ops.cpp index 4377afdafb0..f7c8c389862 100644 --- a/test/gpu/fuse_ops.cpp +++ b/test/gpu/fuse_ops.cpp @@ -446,8 +446,7 @@ TEST_CASE(channelwise_conv_pointwise) create_pointwise_module(p, "main:pointwise0", pw_inputs, single_pointwise("add")); auto alloc_ins2 = mm->add_instruction(alloc); pw_inputs.push_back(alloc_ins2); - auto add_ins = - mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add}); + auto add_ins = mm->add_instruction(make_precompile_op("pointwise"), pw_inputs, {pw_add}); mm->add_return({add_ins}); return p; }; @@ -466,8 +465,7 @@ TEST_CASE(channelwise_conv_pointwise) auto pre_comp_op = migraphx::make_op( "gpu::precompile_op", {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}}); - auto fused_ins = - mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add}); + auto fused_ins = mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_add}); mm->add_return({fused_ins}); return p; }; @@ -510,8 +508,7 @@ TEST_CASE(channelwise_conv_pointwise_already_fused) auto pre_comp_op = migraphx::make_op( "gpu::precompile_op", {{"op", migraphx::to_value(conv_op)}, {"output_shape", migraphx::to_value(sout)}}); - auto conv_ins = - mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu}); + auto conv_ins = mm->add_instruction(pre_comp_op, {x, w, z, alloc_ins}, {pw_relu}); auto* pw_add = create_pointwise_module(p, "main:pointwise1", {conv_ins, y}, single_pointwise("add")); auto alloc_ins2 = mm->add_instruction(alloc); From 28e32af6c3f947a703ee4f119542062c3878c481 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 13:45:34 -0600 Subject: [PATCH 37/84] Only enable for float and navi --- src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp | 3 +++ src/targets/gpu/prefuse_ops.cpp | 7 +++++-- src/targets/gpu/target.cpp | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp index bed64052009..ddb30a7f18e 100644 --- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp +++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp @@ -34,8 +34,11 @@ struct module_pass_manager; namespace gpu { +struct context; + struct MIGRAPHX_GPU_EXPORT prefuse_ops { + context* ctx = nullptr; bool enable_attention = false; std::string name() const { return "gpu::prefuse_ops"; } void apply(module_pass_manager& mpm) const; diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 34f19869660..df948754d6b 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -297,6 +297,9 @@ struct find_channelwise_convolution auto weights = ins->inputs().back(); auto num_spatial = ins->get_shape().ndim() - 2; + if(input->get_shape().type() != shape::float_type) + return; + m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights); } }; @@ -326,8 +329,8 @@ void prefuse_ops::apply(module_pass_manager& mpm) const match::find_matches(mpm.get_module(), find_add_layernorm{}); } match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention}); - match::find_matches(mpm.get_module(), find_channelwise_convolution{}); - + if(ctx != nullptr and starts_with(ctx->get_current_device().get_gfx_name(), "gfx1")) + match::find_matches(mpm.get_module(), find_channelwise_convolution{}); if(enabled(MIGRAPHX_DISABLE_MLIR{})) { inline_group_sub_module(mpm); diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index ad8d0a36f2b..348f1b66495 100644 --- a/src/targets/gpu/target.cpp +++ b/src/targets/gpu/target.cpp @@ -129,7 +129,7 @@ std::vector target::get_passes(migraphx::context& gctx, const compile_opti optimize_module{}, layout_convolution{.channels_last = enabled(MIGRAPHX_ENABLE_NHWC{})}, dead_code_elimination{}, - prefuse_ops{}, + prefuse_ops{.ctx = &ctx}, dead_code_elimination{}, dead_code_elimination{}, rewrite_reduce{}, From e35373cd6aa365a4b5eca45e1644d40ddfadace3 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 13:45:38 -0600 Subject: [PATCH 38/84] Format --- src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp index ddb30a7f18e..e559132cd3d 100644 --- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp +++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp @@ -38,7 +38,7 @@ struct context; struct MIGRAPHX_GPU_EXPORT prefuse_ops { - context* ctx = nullptr; + context* ctx = nullptr; bool enable_attention = false; std::string name() const { return "gpu::prefuse_ops"; } void apply(module_pass_manager& mpm) const; From f69d9bb21a08fc0539879c03588485f3332e7da0 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:45:00 -0600 Subject: [PATCH 39/84] Fix tidy --- .../include/migraphx/kernels/channelwise_conv.hpp | 6 +++--- .../include/migraphx/kernels/spatial_tiler.hpp | 12 ++++++------ test/gpu/prefuse_ops.cpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index f7be9bf6a66..4837e99b719 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -49,15 +49,15 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights auto w_ch = tiler.slice(w); auto out_ch = tiler.slice(output); - using T = typename Output::type; - array wregs_arr; + using t = typename Output::type; + array wregs_arr; auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape())); copy(w_ch.begin(), w_ch.end(), wregs.begin()); __syncthreads(); tiler.for_each([&](auto out_pos, auto out_multi) { - T acc = 0; + t acc = 0; repeat(wregs.get_shape().elements(), [&](auto ki) { auto k_multi = wregs.get_shape().multi(ki); acc += x_ch[out_multi + k_multi] * wregs[k_multi]; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index df4c7c22fbc..6adcff59fb4 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -78,7 +78,7 @@ struct spatial_tiler } static constexpr index_int tiles_total() { return tiles_per_dim().product(); } - static constexpr index_int NDIM() { return out_spatial_lens().size(); } + static constexpr index_int ndim() { return out_spatial_lens().size(); } static constexpr bool is_padded() { @@ -86,7 +86,7 @@ struct spatial_tiler } index idx; - array tile_origin; + array tile_origin; // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial) template @@ -102,10 +102,10 @@ struct spatial_tiler template __device__ auto shared_allocate() const { - using T = typename Input::type; + using t = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr index_int halo_total_v = hl.product(); - return uninitialized_buffer{}; + return uninitialized_buffer{}; } // Slice a tensor to per-channel spatial view @@ -120,7 +120,7 @@ struct spatial_tiler template __device__ auto copy(Input input, Smem& smem) const { - using T = typename Input::type; + using t = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr auto halo_shape = make_shape(hl); constexpr index_int halo_total_v = hl.product(); @@ -137,7 +137,7 @@ struct spatial_tiler auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; if constexpr(is_padded()) - smem[i] = in_bounds(src_pos, input_spatial) ? T{input_ch[src_pos]} : T{0}; + smem[i] = in_bounds(src_pos, input_spatial) ? t{input_ch[src_pos]} : t{0}; else smem[i] = input_ch[src_pos]; }); diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp index bdb99e4097b..425fce6d038 100644 --- a/test/gpu/prefuse_ops.cpp +++ b/test/gpu/prefuse_ops.cpp @@ -41,7 +41,7 @@ struct pre_gemm_softmax_gemm : migraphx::gpu::gemm_softmax_gemm static void run_pass(migraphx::module& m) { - migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{true}, migraphx::dead_code_elimination{}}); + migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{.enable_attention=true}, migraphx::dead_code_elimination{}}); } TEST_CASE(find_gemm_softmax_gemm) From fb48be7caf1b59f70020c86282048748d4967b47 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:45:04 -0600 Subject: [PATCH 40/84] Format --- test/gpu/prefuse_ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp index 425fce6d038..9499c6a5ec5 100644 --- a/test/gpu/prefuse_ops.cpp +++ b/test/gpu/prefuse_ops.cpp @@ -41,7 +41,9 @@ struct pre_gemm_softmax_gemm : migraphx::gpu::gemm_softmax_gemm static void run_pass(migraphx::module& m) { - migraphx::run_passes(m, {migraphx::gpu::prefuse_ops{.enable_attention=true}, migraphx::dead_code_elimination{}}); + migraphx::run_passes( + m, + {migraphx::gpu::prefuse_ops{.enable_attention = true}, migraphx::dead_code_elimination{}}); } TEST_CASE(find_gemm_softmax_gemm) From ef923a8741e8a92bd909214905bdc9468913cbc1 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:49:41 -0600 Subject: [PATCH 41/84] Fix tidy --- src/targets/gpu/jit/channelwise_conv.cpp | 2 +- src/targets/gpu/prefuse_ops.cpp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 608c33bd63f..2d7a8f3d28c 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -74,7 +74,7 @@ struct channelwise_conv_compiler : compiler options.kernel_name = v.get("kernel", std::string{"channelwise_conv_kernel"}); options.virtual_inputs = inputs; - auto out_lens = out_s.lens(); + const auto& out_lens = out_s.lens(); // Thread block tile dimensions std::vector tile_sizes(num_spatial); diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index df948754d6b..2f399de9a4e 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -280,10 +280,8 @@ MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins) return false; auto x_lens = ins->inputs().front()->get_shape().lens(); auto c_in = x_lens[1]; - auto group = v.at("group").to(); - if(group != 1 and group != static_cast(c_in)) - return false; - return true; + auto group = v.at("group").to(); + return group == 1 or group == c_in; } struct find_channelwise_convolution From 513fafc85524ca79479d61ed222efbf8eb03e54f Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:51:53 -0600 Subject: [PATCH 42/84] Update year --- src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/array.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/index.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp | 2 +- src/targets/gpu/prefuse_ops.cpp | 2 +- test/gpu/prefuse_ops.cpp | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp index e559132cd3d..a1afd7ab087 100644 --- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp +++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp index 10270d20c2c..e7977dd4676 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp index 4b9de7ae7ce..1994d0c16c0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp index 410deefb9be..71641fd498d 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp index 0abae0363d7..4578feed9ea 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp index 4bc4d6354b6..e2f7393c32c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 2f399de9a4e..1d8dd6d01bc 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/gpu/prefuse_ops.cpp b/test/gpu/prefuse_ops.cpp index 9499c6a5ec5..90a70c37830 100644 --- a/test/gpu/prefuse_ops.cpp +++ b/test/gpu/prefuse_ops.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal From ec3c657ae4719743bd4538d68e7c562330712773 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:55:28 -0600 Subject: [PATCH 43/84] Fix cppcheck --- src/targets/gpu/jit/channelwise_conv.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 2d7a8f3d28c..a950bcfc791 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -97,9 +97,7 @@ struct channelwise_conv_compiler : compiler std::vector output_tile_sizes = tile_sizes; output_tile_sizes.back() *= noutputs; - std::size_t block_size = 1; - for(auto t : tile_sizes) - block_size *= t; + std::size_t block_size = std::accumulate(tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>()); // Blocks: N * C_out * prod(ceil(out_spatial / output_tile)) std::size_t num_blocks = out_lens[0] * out_lens[1]; @@ -150,8 +148,7 @@ struct channelwise_conv_compiler : compiler if(exhaustive) { std::vector sizes; - for(auto i : range(1, 64)) - sizes.push_back(i * 4); + transform(range(1, 64), std::back_inserter(sizes), [](auto i) { return i * 4; }); for(auto tile_h : sizes) { for(auto tile_w : sizes) From 5d8051bfa5682e82aa426c81173c8f06d2ca1d92 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 15:55:32 -0600 Subject: [PATCH 44/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index a950bcfc791..1d153c323ba 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -97,7 +97,8 @@ struct channelwise_conv_compiler : compiler std::vector output_tile_sizes = tile_sizes; output_tile_sizes.back() *= noutputs; - std::size_t block_size = std::accumulate(tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>()); + std::size_t block_size = std::accumulate( + tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>()); // Blocks: N * C_out * prod(ceil(out_spatial / output_tile)) std::size_t num_blocks = out_lens[0] * out_lens[1]; From 99c896c67ad44aef914a82f9753d4b9c67b38c3b Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:00:53 +0000 Subject: [PATCH 45/84] Use std algos --- src/targets/gpu/jit/channelwise_conv.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 1d153c323ba..cb7762a69b1 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -77,7 +77,7 @@ struct channelwise_conv_compiler : compiler const auto& out_lens = out_s.lens(); // Thread block tile dimensions - std::vector tile_sizes(num_spatial); + std::vector tile_sizes(num_spatial, 1); if(num_spatial == 1) { tile_sizes[0] = v.get("tile_w", std::size_t{256}); @@ -86,8 +86,6 @@ struct channelwise_conv_compiler : compiler { tile_sizes[0] = v.get("tile_h", std::size_t{8}); tile_sizes[num_spatial - 1] = v.get("tile_w", std::size_t{32}); - for(std::size_t d = 1; d + 1 < num_spatial; ++d) - tile_sizes[d] = 1; } // Outputs per lane along W (last spatial dim) @@ -101,12 +99,15 @@ struct channelwise_conv_compiler : compiler tile_sizes.begin(), tile_sizes.end(), std::size_t{1}, std::multiplies<>()); // Blocks: N * C_out * prod(ceil(out_spatial / output_tile)) - std::size_t num_blocks = out_lens[0] * out_lens[1]; - for(std::size_t d = 0; d < num_spatial; ++d) - { - auto out_spatial = out_lens[2 + d]; - num_blocks *= (out_spatial + output_tile_sizes[d] - 1) / output_tile_sizes[d]; - } + auto num_blocks = std::inner_product( + out_lens.begin() + 2, + out_lens.end(), + output_tile_sizes.begin(), + out_lens[0] * out_lens[1], + std::multiplies<>{}, + [](auto out_spatial, auto tile) { + return (out_spatial + tile - 1) / tile; + }); options.set_launch_params(v, num_blocks * block_size, block_size); From 9f0903d29cce8e77b9a527e25f6bd1641efe80c4 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:00:56 +0000 Subject: [PATCH 46/84] Format --- src/targets/gpu/jit/channelwise_conv.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index cb7762a69b1..60d02fab30d 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -105,9 +105,7 @@ struct channelwise_conv_compiler : compiler output_tile_sizes.begin(), out_lens[0] * out_lens[1], std::multiplies<>{}, - [](auto out_spatial, auto tile) { - return (out_spatial + tile - 1) / tile; - }); + [](auto out_spatial, auto tile) { return (out_spatial + tile - 1) / tile; }); options.set_launch_params(v, num_blocks * block_size, block_size); From 680328bc022e79e74d2a1dc8b2ca62d70d01005b Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:03:34 +0000 Subject: [PATCH 47/84] Move in_bounds function --- .../gpu/kernels/include/migraphx/kernels/shape.hpp | 11 +++++++++++ .../include/migraphx/kernels/spatial_tiler.hpp | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp index 54da3fccd38..a2ae4f9c0dc 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp @@ -199,6 +199,17 @@ struct shape : equality_comparable> } }; +template +constexpr bool in_bounds(Pos pos, Lens lens) +{ + for(index_int d = 0; d < pos.size(); d++) + { + if(pos[d] >= lens[d]) + return false; + } + return true; +} + template constexpr auto calculate_strides(Lens) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 6adcff59fb4..b5358469398 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -33,17 +33,6 @@ namespace migraphx { -template -constexpr bool in_bounds(Pos pos, Lens lens) -{ - for(index_int d = 0; d < pos.size(); d++) - { - if(pos[d] >= lens[d]) - return false; - } - return true; -} - template struct spatial_tiler { From 11203097642fd89914ebdb613120e0ffe759778f Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:05:03 +0000 Subject: [PATCH 48/84] Rename type --- .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index b5358469398..d427b2271c9 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -91,10 +91,10 @@ struct spatial_tiler template __device__ auto shared_allocate() const { - using t = typename Input::type; + using type = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr index_int halo_total_v = hl.product(); - return uninitialized_buffer{}; + return uninitialized_buffer{}; } // Slice a tensor to per-channel spatial view From 76457922593e9eb99a5b92249803d47ceba3f7ea Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:05:06 +0000 Subject: [PATCH 49/84] Format --- .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index d427b2271c9..e874365715c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -91,7 +91,7 @@ struct spatial_tiler template __device__ auto shared_allocate() const { - using type = typename Input::type; + using type = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr index_int halo_total_v = hl.product(); return uninitialized_buffer{}; From 32b58940bf6d99a6e234350d16d22793ecf37bee Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:11:43 -0600 Subject: [PATCH 50/84] Fix compilation failure --- .../migraphx/kernels/spatial_tiler.hpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index e874365715c..dccdd0eb6c1 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -47,11 +47,8 @@ struct spatial_tiler return return_array_c([] { auto result = join(index_ints<1, 1>{}, TileLens{}); constexpr auto nd = result.size(); - array r; - for(index_int i = 0; i < nd; i++) - r[i] = result[i]; - r[nd - 1] *= NTiles; - return r; + result[nd - 1] *= NTiles; + return result; }); } @@ -93,8 +90,7 @@ struct spatial_tiler { using type = typename Input::type; constexpr auto hl = halo_lens_for>(); - constexpr index_int halo_total_v = hl.product(); - return uninitialized_buffer{}; + return uninitialized_buffer{}; } // Slice a tensor to per-channel spatial view @@ -109,10 +105,9 @@ struct spatial_tiler template __device__ auto copy(Input input, Smem& smem) const { - using t = typename Input::type; + using type = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr auto halo_shape = make_shape(hl); - constexpr index_int halo_total_v = hl.product(); constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial()).lens; constexpr auto n_out = nslices(OutputShape{}, keep_spatial()); @@ -122,11 +117,11 @@ struct spatial_tiler auto input_ch = slice_tensor( input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial()); - idx.local_stride(_c, [&](auto i) { + idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(index_int{i}); auto src_pos = tile_origin + halo_multi; if constexpr(is_padded()) - smem[i] = in_bounds(src_pos, input_spatial) ? t{input_ch[src_pos]} : t{0}; + smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; else smem[i] = input_ch[src_pos]; }); @@ -168,7 +163,7 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) return result; })); auto block_multi = block_shape.multi(idx.group); - auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + auto tile_origin = generate_array(_c, [&](auto d) -> index_int { if constexpr(d < 2) return 0; else From 214126495ae98df54338b1a86092849282fca3f6 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:11:46 -0600 Subject: [PATCH 51/84] Format --- .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index dccdd0eb6c1..6f5c112b530 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -105,7 +105,7 @@ struct spatial_tiler template __device__ auto copy(Input input, Smem& smem) const { - using type = typename Input::type; + using type = typename Input::type; constexpr auto hl = halo_lens_for>(); constexpr auto halo_shape = make_shape(hl); constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial()).lens; From 19cf17396e12cd0b3d27b210aae6b66ce6eef48c Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:24:06 -0600 Subject: [PATCH 52/84] Simplify some more --- .../include/migraphx/kernels/spatial_tiler.hpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 6f5c112b530..39aa1c0200c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -64,7 +64,7 @@ struct spatial_tiler } static constexpr index_int tiles_total() { return tiles_per_dim().product(); } - static constexpr index_int ndim() { return out_spatial_lens().size(); } + static constexpr auto ndim() { return out_spatial_lens().size(); } static constexpr bool is_padded() { @@ -118,7 +118,7 @@ struct spatial_tiler input, (channel_idx / index_int{groups}) % index_int{n_in}, keep_spatial()); idx.local_stride(_c, [&](auto i) { - auto halo_multi = halo_shape.multi(index_int{i}); + auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; if constexpr(is_padded()) smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; @@ -134,7 +134,7 @@ struct spatial_tiler __device__ void for_each(F f) const { idx.local_stride(_c, [&](auto j) { - auto out_multi = make_shape(output_lens()).multi(index_int{j}); + auto out_multi = make_shape(output_lens()).multi(j); auto out_pos = tile_origin + out_multi; if constexpr(is_padded()) { @@ -152,18 +152,14 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) using tiler_type = spatial_tiler; constexpr auto block_shape = make_shape(return_array_c([] { - constexpr auto tpd = decltype(tiler_type::tiles_per_dim()){}; - constexpr index_int nd = tpd.size(); - constexpr auto olens = OutputShape{}.lens; - array result; - for(index_int i = 0; i < nd; i++) - result[i] = tpd[i]; + auto result = tiler_type::tiles_per_dim().base(); + auto olens = OutputShape{}.lens; result[0] = olens[0]; result[1] = olens[1]; return result; })); auto block_multi = block_shape.multi(idx.group); - auto tile_origin = generate_array(_c, [&](auto d) -> index_int { + auto tile_origin = generate_array(tiler_type::ndim(), [&](auto d) -> index_int { if constexpr(d < 2) return 0; else From b39416ec3ac841315c5f40d41f13fbd90342e701 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:24:09 -0600 Subject: [PATCH 53/84] Format --- .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 39aa1c0200c..0c03a986c57 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -152,8 +152,8 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) using tiler_type = spatial_tiler; constexpr auto block_shape = make_shape(return_array_c([] { - auto result = tiler_type::tiles_per_dim().base(); - auto olens = OutputShape{}.lens; + auto result = tiler_type::tiles_per_dim().base(); + auto olens = OutputShape{}.lens; result[0] = olens[0]; result[1] = olens[1]; return result; From 6c990fd3d44ea4f725535fcb53b164b0a693b3c0 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:28:28 +0000 Subject: [PATCH 54/84] Use std::transform --- src/targets/gpu/prefuse_ops.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 1d8dd6d01bc..b2e5e2552bd 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -251,15 +251,18 @@ struct channelwise_conv shape compute_shape(std::vector inputs) const { - check_shapes{inputs, *this}.has(2); + check_shapes{inputs, *this}.has(2).same_ndims(); auto x_lens = inputs[0].lens(); auto w_lens = inputs[1].lens(); std::vector out_lens; out_lens.push_back(x_lens[0]); out_lens.push_back(w_lens[0]); - for(std::size_t d = 0; d < num_spatial; ++d) - out_lens.push_back(x_lens[2 + d] - w_lens[2 + d] + 1); - return {inputs.front().type(), out_lens}; + std::transform(x_lens.begin() + 2, + x_lens.begin() + 2 + num_spatial, + w_lens.begin() + 2, + std::back_inserter(out_lens), + [](auto x, auto w) { return x - w + 1; }); + return inputs[0].with_lens(out_lens); } }; MIGRAPHX_REGISTER_OP(channelwise_conv); From 90638f89021747627fb4950452cc192c50fe1ac7 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:31:39 -0600 Subject: [PATCH 55/84] Precompute slices --- .../kernels/include/migraphx/kernels/channelwise_conv.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 4837e99b719..30cd136443c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -48,6 +48,7 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights auto x_ch = tiler.copy(x, smem); auto w_ch = tiler.slice(w); auto out_ch = tiler.slice(output); + auto xs_pack = pack(tiler.slice(inputs)...); using t = typename Output::type; array wregs_arr; @@ -62,7 +63,9 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights auto k_multi = wregs.get_shape().multi(ki); acc += x_ch[out_multi + k_multi] * wregs[k_multi]; }); - out_ch[out_pos] = f(acc, tiler.slice(inputs)[out_pos]...); + xs_pack([&](auto... xs) { + out_ch[out_pos] = f(acc, xs[out_pos]...); + }); }); } From 053bf4fd033cbd009aa35496f9902a4a29c76d64 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:31:42 -0600 Subject: [PATCH 56/84] Format --- .../gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 30cd136443c..d75cd590c18 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -63,9 +63,7 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights auto k_multi = wregs.get_shape().multi(ki); acc += x_ch[out_multi + k_multi] * wregs[k_multi]; }); - xs_pack([&](auto... xs) { - out_ch[out_pos] = f(acc, xs[out_pos]...); - }); + xs_pack([&](auto... xs) { out_ch[out_pos] = f(acc, xs[out_pos]...); }); }); } From ffaa5c384d3a2bb24adfb725ab6fafa476f76d0e Mon Sep 17 00:00:00 2001 From: Paul Fultz II Date: Mon, 2 Mar 2026 16:32:31 -0600 Subject: [PATCH 57/84] Update src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp index e2f7393c32c..89f1a4a615e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp @@ -126,7 +126,7 @@ constexpr auto slice_tensor(Input input, T start, Ss... ss) constexpr auto inner_shape = make_slice(get_shape_c{}, ss...); auto outer_lens = transform( get_shape_c{}.lens, inner_shape.lens, [=](auto x, auto inner) { return x / inner; }); - // TODO: Handle non-divisble dimensions + // TODO: Handle non-divisible dimensions auto outer_shape = make_shape(outer_lens, get_shape_c{}.strides * inner_shape.lens); auto offset = outer_shape.index(start); MIGRAPHX_ASSERT(outer_shape.elements() * inner_shape.elements() == From 8a06baf1960d551b2096f607fa3d9cccc4bc8380 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:36:53 -0600 Subject: [PATCH 58/84] Change the navi check --- src/targets/gpu/prefuse_ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index b2e5e2552bd..8a3645ab828 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -323,6 +323,8 @@ void inline_group_sub_module(module_pass_manager& mpm) void prefuse_ops::apply(module_pass_manager& mpm) const { + const auto& device_name = ctx == nullptr ? "" : ctx->get_current_device().get_gfx_name(); + const bool is_navi = starts_with(device_name, "gfx11") or starts_with(device_name, "gfx12"); if(enabled(MIGRAPHX_ENABLE_LAYERNORM_FUSION{})) { match::find_matches(mpm.get_module(), find_layernorm{}); @@ -330,7 +332,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const match::find_matches(mpm.get_module(), find_add_layernorm{}); } match::find_matches(mpm, find_gemm_softmax_gemm{enable_attention}); - if(ctx != nullptr and starts_with(ctx->get_current_device().get_gfx_name(), "gfx1")) + if(is_navi) match::find_matches(mpm.get_module(), find_channelwise_convolution{}); if(enabled(MIGRAPHX_DISABLE_MLIR{})) { From 258af41258fd1a10e327ed708f8f109c135e40a6 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:45:04 +0000 Subject: [PATCH 59/84] Split verify classes --- test/verify/test_channelwise_conv.cpp | 137 ------------------ test/verify/test_channelwise_conv_1d.cpp | 49 +++++++ .../test_channelwise_conv_depthwise.cpp | 45 ++++++ .../test_channelwise_conv_depthwise_5x5.cpp | 46 ++++++ test/verify/test_channelwise_conv_large.cpp | 45 ++++++ .../test_channelwise_conv_non_divisible.cpp | 46 ++++++ .../test_channelwise_conv_single_channel.cpp | 46 ++++++ 7 files changed, 277 insertions(+), 137 deletions(-) delete mode 100644 test/verify/test_channelwise_conv.cpp create mode 100644 test/verify/test_channelwise_conv_1d.cpp create mode 100644 test/verify/test_channelwise_conv_depthwise.cpp create mode 100644 test/verify/test_channelwise_conv_depthwise_5x5.cpp create mode 100644 test/verify/test_channelwise_conv_large.cpp create mode 100644 test/verify/test_channelwise_conv_non_divisible.cpp create mode 100644 test/verify/test_channelwise_conv_single_channel.cpp diff --git a/test/verify/test_channelwise_conv.cpp b/test/verify/test_channelwise_conv.cpp deleted file mode 100644 index 91731d1d2f2..00000000000 --- a/test/verify/test_channelwise_conv.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "verify_program.hpp" -#include -#include -#include - -template -struct test_channelwise_conv_depthwise : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); - mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_depthwise; -template struct test_channelwise_conv_depthwise; - -template -struct test_channelwise_conv_single_channel - : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); - mm->add_instruction(migraphx::make_op("convolution"), input, weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_single_channel; -template struct test_channelwise_conv_single_channel; - -template -struct test_channelwise_conv_depthwise_5x5 - : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}}); - mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_depthwise_5x5; -template struct test_channelwise_conv_depthwise_5x5; - -template -struct test_channelwise_conv_1d : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}}); - mm->add_instruction( - migraphx::make_op("convolution", - {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}), - input, - weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_1d; -template struct test_channelwise_conv_1d; - -template -struct test_channelwise_conv_large : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}}); - mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_large; -template struct test_channelwise_conv_large; - -template -struct test_channelwise_conv_non_divisible - : verify_program> -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}}); - auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); - mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); - return p; - } - std::string section() const { return "conv"; } -}; -template struct test_channelwise_conv_non_divisible; -template struct test_channelwise_conv_non_divisible; diff --git a/test/verify/test_channelwise_conv_1d.cpp b/test/verify/test_channelwise_conv_1d.cpp new file mode 100644 index 00000000000..e78d2095ec7 --- /dev/null +++ b/test/verify/test_channelwise_conv_1d.cpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_1d : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}}); + mm->add_instruction( + migraphx::make_op("convolution", + {{"padding", {0}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}), + input, + weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_1d; +template struct test_channelwise_conv_1d; diff --git a/test/verify/test_channelwise_conv_depthwise.cpp b/test/verify/test_channelwise_conv_depthwise.cpp new file mode 100644 index 00000000000..326184e9053 --- /dev/null +++ b/test/verify/test_channelwise_conv_depthwise.cpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_depthwise : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_depthwise; +template struct test_channelwise_conv_depthwise; diff --git a/test/verify/test_channelwise_conv_depthwise_5x5.cpp b/test/verify/test_channelwise_conv_depthwise_5x5.cpp new file mode 100644 index 00000000000..425fe7187a7 --- /dev/null +++ b/test/verify/test_channelwise_conv_depthwise_5x5.cpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_depthwise_5x5 + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_depthwise_5x5; +template struct test_channelwise_conv_depthwise_5x5; diff --git a/test/verify/test_channelwise_conv_large.cpp b/test/verify/test_channelwise_conv_large.cpp new file mode 100644 index 00000000000..f736fc788f5 --- /dev/null +++ b/test/verify/test_channelwise_conv_large.cpp @@ -0,0 +1,45 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_large : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 16, 56, 56}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {16, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 16}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_large; +template struct test_channelwise_conv_large; diff --git a/test/verify/test_channelwise_conv_non_divisible.cpp b/test/verify/test_channelwise_conv_non_divisible.cpp new file mode 100644 index 00000000000..69a458c5210 --- /dev/null +++ b/test/verify/test_channelwise_conv_non_divisible.cpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_non_divisible + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution", {{"group", 8}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_non_divisible; +template struct test_channelwise_conv_non_divisible; diff --git a/test/verify/test_channelwise_conv_single_channel.cpp b/test/verify/test_channelwise_conv_single_channel.cpp new file mode 100644 index 00000000000..9d214be82ec --- /dev/null +++ b/test/verify/test_channelwise_conv_single_channel.cpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_single_channel + : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 1, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + mm->add_instruction(migraphx::make_op("convolution"), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_single_channel; +template struct test_channelwise_conv_single_channel; From bcd468d5be758b0a8b99822ed4a9987e5aa84c66 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:47:51 -0600 Subject: [PATCH 60/84] Revert the reduce and index changes --- .../include/migraphx/kernels/index.hpp | 31 +------------------ .../include/migraphx/kernels/reduce.hpp | 22 ++++++------- 2 files changed, 10 insertions(+), 43 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp index 1994d0c16c0..77da7283190 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -267,12 +267,6 @@ struct index } } - template - __device__ void device_stride(N n, F f) const - { - for_stride(_c<0>, n, _c<1>, f); - } - template __device__ void global_stride(N n, F f) const { @@ -339,28 +333,5 @@ struct per_block } }; -struct per_device -{ - index idx; - - constexpr auto local() const { return idx.global; } - - constexpr auto nlocal() const { return idx.nglobal(); } - - constexpr auto size() const { return _c<1>; } - - template - constexpr void group_stride(N n, F f) const - { - return idx.device_stride(n, f); - } - - template - constexpr void local_stride(N n, F f) const - { - return idx.global_stride(n, f); - } -}; - } // namespace migraphx #endif // MIGRAPHX_GUARD_KERNELS_INDEX_HPP diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp index 4578feed9ea..59bf17b3eda 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -516,13 +516,12 @@ struct block return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { + idx.global_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -571,13 +570,12 @@ struct block_large return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - schedule.local_stride(nelements * idx.nlocal(), [&](auto i) { + idx.global_stride(nelements * idx.nlocal(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -650,13 +648,12 @@ struct subwave return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - schedule.local_stride(nelements * idx.nlocal_subwave(), [&](auto i) { + idx.global_stride(nelements * idx.nlocal_subwave(), [&](auto i) { const auto out_idx = get_shape_c{}.multi(i / idx.nlocal_subwave()); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); @@ -712,13 +709,12 @@ struct lane return reducer{{}, idx, slicer}; } - template + template static __device__ void run(F f) { auto idx = make_index(); - auto schedule = Schedule{idx}; constexpr auto nelements = get_shape_c{}.elements(); - schedule.local_stride(nelements, [&](auto i) { + idx.global_stride(nelements, [&](auto i) { const auto out_idx = get_shape_c{}.multi(i); f(out_idx, make(idx, [&](auto input) { return reduce_slice(input, out_idx); })); }); From 7ba2ccac22b58646205f0059810e9b748cf2f8ee Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 22:49:52 +0000 Subject: [PATCH 61/84] Revert pooling changes --- .../gpu/kernels/include/migraphx/kernels/pooling.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp index 71641fd498d..76bb7c3cb6b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/pooling.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -186,18 +186,18 @@ constexpr window make_window(Window w, Stride s, Paddin return {w, s, p}; } -template +template __device__ void pooling_reduce(Output output, F f) { if constexpr(GroupSize < 2) { - Algo::template run( + Algo::template run( [&](auto out_idx, auto r) { r.outer([&] { output[out_idx] = f(out_idx, r); }); }); } else { auto goutput = as_vec(output, output.get_shape().lens.size() - _c<1>); - Algo::template run([&](auto out_idx, auto r) { + Algo::template run([&](auto out_idx, auto r) { auto i = out_idx; i.back() *= GroupSize; auto result = vec_generate([&](auto) { From 61f6ffb4503cbfbf3a8b47d6c0e9e2547924a3f3 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:53:03 -0600 Subject: [PATCH 62/84] Use signed integer --- src/targets/gpu/prefuse_ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 8a3645ab828..7dab5508394 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -261,7 +261,7 @@ struct channelwise_conv x_lens.begin() + 2 + num_spatial, w_lens.begin() + 2, std::back_inserter(out_lens), - [](auto x, auto w) { return x - w + 1; }); + [](std::ptrdiff_t x, std::ptrdiff_t w) { return x - w + 1; }); return inputs[0].with_lens(out_lens); } }; From b5cad757329b54aa3a3967b61d933034603dd64f Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:57:11 -0600 Subject: [PATCH 63/84] Update year --- src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp index a2ae4f9c0dc..0c59388e8b3 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal From 5b49459fd3a750fa04bb331b152e1173064c5f79 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 2 Mar 2026 16:57:30 -0600 Subject: [PATCH 64/84] Format --- .../migraphx/kernels/channelwise_conv.hpp | 6 +++--- .../include/migraphx/kernels/spatial_tiler.hpp | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index d75cd590c18..92e60351edd 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -45,9 +45,9 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights __shared__ decltype(tiler.template shared_allocate()) smem; - auto x_ch = tiler.copy(x, smem); - auto w_ch = tiler.slice(w); - auto out_ch = tiler.slice(output); + auto x_ch = tiler.copy(x, smem); + auto w_ch = tiler.slice(w); + auto out_ch = tiler.slice(output); auto xs_pack = pack(tiler.slice(inputs)...); using t = typename Output::type; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 0c03a986c57..9be73bc6d52 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -88,8 +88,8 @@ struct spatial_tiler template __device__ auto shared_allocate() const { - using type = typename Input::type; - constexpr auto hl = halo_lens_for>(); + using type = typename Input::type; + constexpr auto hl = halo_lens_for>(); return uninitialized_buffer{}; } @@ -105,10 +105,10 @@ struct spatial_tiler template __device__ auto copy(Input input, Smem& smem) const { - using type = typename Input::type; - constexpr auto hl = halo_lens_for>(); - constexpr auto halo_shape = make_shape(hl); - constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial()).lens; + using type = typename Input::type; + constexpr auto hl = halo_lens_for>(); + constexpr auto halo_shape = make_shape(hl); + constexpr auto input_spatial = make_slice(get_shape_c{}, keep_spatial()).lens; constexpr auto n_out = nslices(OutputShape{}, keep_spatial()); constexpr auto n_in = nslices(get_shape_c{}, keep_spatial()); @@ -154,8 +154,8 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) constexpr auto block_shape = make_shape(return_array_c([] { auto result = tiler_type::tiles_per_dim().base(); auto olens = OutputShape{}.lens; - result[0] = olens[0]; - result[1] = olens[1]; + result[0] = olens[0]; + result[1] = olens[1]; return result; })); auto block_multi = block_shape.multi(idx.group); From 18a7efa306a00f737ce1bfdf6f3665b6231cb946 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 3 Apr 2026 16:14:31 -0500 Subject: [PATCH 65/84] Support padding --- src/targets/gpu/jit/channelwise_conv.cpp | 8 ++- .../migraphx/kernels/channelwise_conv.hpp | 12 ++-- .../migraphx/kernels/spatial_tiler.hpp | 65 +++++++++++++++++-- src/targets/gpu/prefuse_ops.cpp | 30 ++++++--- 4 files changed, 96 insertions(+), 19 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 60d02fab30d..6425b2a6cfd 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -50,7 +50,7 @@ extern "C" { MIGRAPHX_GLOBAL void ${kernel}(${params}) { transform_args(make_tensors(), rotate_last())(${args})([](auto output, auto x, auto w, auto... inputs) { - channelwise_conv, ${ntiles}>(index_ints<${tile}>{}, ${post}, output, x, w, inputs...); + channelwise_conv, ${ntiles}>(index_ints<${tile}>{}, index_ints<${padding}>{}, ${post}, output, x, w, inputs...); }); } @@ -109,9 +109,15 @@ struct channelwise_conv_compiler : compiler options.set_launch_params(v, num_blocks * block_size, block_size); + auto full_padding = v.get("padding", std::vector{}); + std::vector padding(num_spatial, 0); + for(std::size_t i = 0; i < num_spatial and i < full_padding.size(); i++) + padding[i] = full_padding[i]; + auto src = interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}, {"ntiles", std::to_string(noutputs)}, + {"padding", to_string_range(padding)}, {"kernel", options.kernel_name}, {"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index 92e60351edd..c0c25e38155 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -33,15 +33,17 @@ namespace migraphx { template -__device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights w, Inputs... inputs) +__device__ void channelwise_conv( + TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs) { auto idx = make_index(); - auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}); + auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}, Padding{}); __shared__ decltype(tiler.template shared_allocate()) smem; @@ -50,15 +52,15 @@ __device__ void channelwise_conv(TileLens, F f, Output output, Input x, Weights auto out_ch = tiler.slice(output); auto xs_pack = pack(tiler.slice(inputs)...); - using t = typename Output::type; - array wregs_arr; + using type = typename Output::type; + array wregs_arr; auto wregs = make_tensor_view(wregs_arr.begin(), make_packed_shape(w_ch.get_shape())); copy(w_ch.begin(), w_ch.end(), wregs.begin()); __syncthreads(); tiler.for_each([&](auto out_pos, auto out_multi) { - t acc = 0; + type acc = 0; repeat(wregs.get_shape().elements(), [&](auto ki) { auto k_multi = wregs.get_shape().multi(ki); acc += x_ch[out_multi + k_multi] * wregs[k_multi]; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 9be73bc6d52..78c3a269304 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -33,7 +33,13 @@ namespace migraphx { -template +template +constexpr bool has_nonzero(index_ints) +{ + return ((Ps != 0) or ...); +} + +template > struct spatial_tiler { static constexpr auto keep_spatial() @@ -71,17 +77,33 @@ struct spatial_tiler return (out_spatial_lens() != tiles_per_dim() * output_lens()); } + // Full-dimensional padding: (0, 0, p_h, p_w, ...) + static constexpr auto full_padding() { return join(index_ints<0, 0>{}, Padding{}); } + + static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); } + index idx; array tile_origin; // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial) + // With padding, the output is larger so the raw difference is too small; add padding back. template static constexpr auto halo_lens_for() { constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; constexpr auto halo_extra = transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); - return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); + if constexpr(has_conv_padding()) + { + constexpr auto corrected = transform( + halo_extra, full_padding(), [](auto h, auto p) -> index_int { return h + p; }); + return transform( + output_lens(), corrected, [](auto o, auto h) -> index_int { return o + h; }); + } + else + { + return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); + } } // Type for shared memory allocation @@ -120,10 +142,22 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; - if constexpr(is_padded()) - smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; + if constexpr(has_conv_padding()) + { + constexpr auto pad = full_padding(); + auto input_pos = src_pos - pad; + smem[i] = + in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; + } + else if constexpr(is_padded()) + { + smem[i] = + in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; + } else + { smem[i] = input_ch[src_pos]; + } }); return make_tensor_view(smem.data(), halo_shape); @@ -169,5 +203,28 @@ __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) return tiler_type{idx, tile_origin}; } +template +__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding) +{ + using tiler_type = spatial_tiler; + + constexpr auto block_shape = make_shape(return_array_c([] { + auto result = tiler_type::tiles_per_dim().base(); + auto olens = OutputShape{}.lens; + result[0] = olens[0]; + result[1] = olens[1]; + return result; + })); + auto block_multi = block_shape.multi(idx.group); + auto tile_origin = generate_array(tiler_type::ndim(), [&](auto d) -> index_int { + if constexpr(d < 2) + return 0; + else + return block_multi[d] * tiler_type::output_lens()[d]; + }); + + return tiler_type{idx, tile_origin}; +} + } // namespace migraphx #endif // MIGRAPHX_GUARD_KERNELS_SPATIAL_TILER_HPP diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 7dab5508394..3db298930bd 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -240,13 +240,14 @@ struct find_gemm_softmax_gemm struct channelwise_conv { std::size_t num_spatial = 2; + std::vector padding; std::string name() const { return "gpu::channelwise_conv"; } template static auto reflect(Self& self, F f) { - return pack(f(self.num_spatial, "num_spatial")); + return pack(f(self.num_spatial, "num_spatial"), f(self.padding, "padding")); } shape compute_shape(std::vector inputs) const @@ -257,11 +258,15 @@ struct channelwise_conv std::vector out_lens; out_lens.push_back(x_lens[0]); out_lens.push_back(w_lens[0]); - std::transform(x_lens.begin() + 2, - x_lens.begin() + 2 + num_spatial, - w_lens.begin() + 2, - std::back_inserter(out_lens), - [](std::ptrdiff_t x, std::ptrdiff_t w) { return x - w + 1; }); + for(std::size_t i = 0; i < num_spatial; i++) + { + std::size_t total_pad = 0; + if(i < padding.size()) + total_pad += padding[i]; + if(i + num_spatial < padding.size()) + total_pad += padding[i + num_spatial]; + out_lens.push_back(x_lens[i + 2] + total_pad - w_lens[i + 2] + 1); + } return inputs[0].with_lens(out_lens); } }; @@ -274,8 +279,6 @@ MIGRAPHX_PRED_MATCHER(conv_channelwise, instruction_ref ins) auto v = ins->get_operator().to_value(); if(not all_of(v.at("stride"), [](const value& x) { return x.to() == 1; })) return false; - if(not all_of(v.at("padding"), [](const value& x) { return x.to() == 0; })) - return false; if(not all_of(v.at("dilation"), [](const value& x) { return x.to() == 1; })) return false; auto w_lens = ins->inputs().back()->get_shape().lens(); @@ -301,7 +304,16 @@ struct find_channelwise_convolution if(input->get_shape().type() != shape::float_type) return; - m.replace_instruction(ins, channelwise_conv{num_spatial}, input, weights); + auto v = ins->get_operator().to_value(); + auto pad_vals = v.at("padding"); + std::vector padding; + std::transform(pad_vals.begin(), + pad_vals.end(), + std::back_inserter(padding), + [](const value& x) { return x.to(); }); + + m.replace_instruction( + ins, channelwise_conv{num_spatial, std::move(padding)}, input, weights); } }; From c23a8e8f2d111b228ae998d67ea1a0c6b792d9c4 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 3 Apr 2026 16:14:36 -0500 Subject: [PATCH 66/84] Format --- .../kernels/include/migraphx/kernels/channelwise_conv.hpp | 4 ++-- .../gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp index c0c25e38155..be186ecf91e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/channelwise_conv.hpp @@ -39,8 +39,8 @@ template -__device__ void channelwise_conv( - TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs) +__device__ void +channelwise_conv(TileLens, Padding, F f, Output output, Input x, Weights w, Inputs... inputs) { auto idx = make_index(); auto tiler = make_spatial_tiler(idx, TileLens{}, get_shape_c{}, Padding{}); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 78c3a269304..dc0db118752 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -146,13 +146,11 @@ struct spatial_tiler { constexpr auto pad = full_padding(); auto input_pos = src_pos - pad; - smem[i] = - in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; + smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; } else if constexpr(is_padded()) { - smem[i] = - in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; + smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; } else { From 747292cc8cf4ba12bfe860e486232e6e786e1c0f Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 3 Apr 2026 17:59:04 -0500 Subject: [PATCH 67/84] Fix selection --- .../include/migraphx/kernels/spatial_tiler.hpp | 15 ++++++++------- src/targets/gpu/target.cpp | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index dc0db118752..3734d443f4c 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -90,18 +90,19 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; - constexpr auto halo_extra = - transform(input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); if constexpr(has_conv_padding()) { - constexpr auto corrected = transform( - halo_extra, full_padding(), [](auto h, auto p) -> index_int { return h + p; }); - return transform( - output_lens(), corrected, [](auto o, auto h) -> index_int { return o + h; }); + constexpr auto halo_extra = return_array_c([] { + return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + + full_padding(); + }); + return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } else { + constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; + constexpr auto halo_extra = transform( + input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } } diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index dfcd0797d1e..39649f8e4a6 100644 --- a/src/targets/gpu/target.cpp +++ b/src/targets/gpu/target.cpp @@ -135,7 +135,7 @@ std::vector target::get_passes(migraphx::context& gctx, const compile_opti dead_code_elimination{}, fuse_horizontal{}, dead_code_elimination{}, - prefuse_ops{}, + prefuse_ops{&ctx}, dead_code_elimination{}, dead_code_elimination{}, rewrite_reduce{}, From ad9b8d1434f32e820a94b340d314cf3cc00a9fe3 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 3 Apr 2026 21:57:43 -0500 Subject: [PATCH 68/84] Fix padding --- src/targets/gpu/jit/channelwise_conv.cpp | 7 ++-- .../migraphx/kernels/spatial_tiler.hpp | 35 +++++++++++++++---- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/targets/gpu/jit/channelwise_conv.cpp b/src/targets/gpu/jit/channelwise_conv.cpp index 6425b2a6cfd..7fc4b679138 100644 --- a/src/targets/gpu/jit/channelwise_conv.cpp +++ b/src/targets/gpu/jit/channelwise_conv.cpp @@ -109,10 +109,9 @@ struct channelwise_conv_compiler : compiler options.set_launch_params(v, num_blocks * block_size, block_size); - auto full_padding = v.get("padding", std::vector{}); - std::vector padding(num_spatial, 0); - for(std::size_t i = 0; i < num_spatial and i < full_padding.size(); i++) - padding[i] = full_padding[i]; + auto padding = v.get("padding", std::vector{}); + if(padding.size() < 2 * num_spatial) + padding.resize(2 * num_spatial, 0); auto src = interpolate_string(channelwise_conv_kernel, {{"tile", to_string_range(tile_sizes)}, diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 3734d443f4c..52e88a9e43b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -77,16 +77,39 @@ struct spatial_tiler return (out_spatial_lens() != tiles_per_dim() * output_lens()); } - // Full-dimensional padding: (0, 0, p_h, p_w, ...) - static constexpr auto full_padding() { return join(index_ints<0, 0>{}, Padding{}); } - static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); } + // Left (begin) padding per dim: (0, 0, left_h, left_w) + static constexpr auto left_padding() + { + return return_array_c([] { + constexpr auto p = Padding{}; + constexpr auto ns = p.size() / 2; + auto result = array(index_int{0}); + for(index_int i = 0; i < ns; i++) + result[i + 2] = p[i]; + return result; + }); + } + + // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w) + static constexpr auto total_padding() + { + return return_array_c([] { + constexpr auto p = Padding{}; + constexpr auto ns = p.size() / 2; + auto result = array(index_int{0}); + for(index_int i = 0; i < ns; i++) + result[i + 2] = p[i] + p[i + ns]; + return result; + }); + } + index idx; array tile_origin; // Compute halo lens for a given input shape: output_lens + (input_spatial - output_spatial) - // With padding, the output is larger so the raw difference is too small; add padding back. + // With padding, the output is larger so the raw difference is too small; add total padding. template static constexpr auto halo_lens_for() { @@ -94,7 +117,7 @@ struct spatial_tiler { constexpr auto halo_extra = return_array_c([] { return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + - full_padding(); + total_padding(); }); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } @@ -145,7 +168,7 @@ struct spatial_tiler auto src_pos = tile_origin + halo_multi; if constexpr(has_conv_padding()) { - constexpr auto pad = full_padding(); + constexpr auto pad = left_padding(); auto input_pos = src_pos - pad; smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; } From 77dac357a4eae67df73e545fc22c31aa7766ffac Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 3 Apr 2026 22:24:08 -0500 Subject: [PATCH 69/84] Cleanup --- .../migraphx/kernels/spatial_tiler.hpp | 58 ++++++------------- 1 file changed, 18 insertions(+), 40 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 52e88a9e43b..72ca68deac2 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -113,21 +113,22 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - if constexpr(has_conv_padding()) - { - constexpr auto halo_extra = return_array_c([] { - return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + - total_padding(); - }); - return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); - } - else - { - constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; - constexpr auto halo_extra = transform( - input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); - return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); - } + constexpr auto halo_extra = [] { + if constexpr(has_conv_padding()) + { + return return_array_c([] { + return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + + total_padding(); + }); + } + else + { + constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; + return transform( + input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); + } + }(); + return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } // Type for shared memory allocation @@ -202,31 +203,8 @@ struct spatial_tiler } }; -template -__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape) -{ - using tiler_type = spatial_tiler; - - constexpr auto block_shape = make_shape(return_array_c([] { - auto result = tiler_type::tiles_per_dim().base(); - auto olens = OutputShape{}.lens; - result[0] = olens[0]; - result[1] = olens[1]; - return result; - })); - auto block_multi = block_shape.multi(idx.group); - auto tile_origin = generate_array(tiler_type::ndim(), [&](auto d) -> index_int { - if constexpr(d < 2) - return 0; - else - return block_multi[d] * tiler_type::output_lens()[d]; - }); - - return tiler_type{idx, tile_origin}; -} - -template -__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding) +template > +__device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding = {}) { using tiler_type = spatial_tiler; From c47b394b60b2ab70caa24aa4950a0a976e7c30d1 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 18:32:31 -0500 Subject: [PATCH 70/84] Use generate_array instead --- .../include/migraphx/kernels/array.hpp | 6 ++++ .../migraphx/kernels/spatial_tiler.hpp | 28 +++++++++---------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp index 6c87fb2ad86..9a5e6432a25 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp @@ -360,6 +360,12 @@ constexpr auto make_const_array(T x, Ts... xs) return integral_const_array{}; } +template +constexpr auto generate_const_array(N n, F f) +{ + return sequence_c([=](auto... is) { return make_const_array(f(is)...); }); +} + template constexpr auto generate_array(N n, F f) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 72ca68deac2..ffeb67e2267 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -82,26 +82,26 @@ struct spatial_tiler // Left (begin) padding per dim: (0, 0, left_h, left_w) static constexpr auto left_padding() { - return return_array_c([] { - constexpr auto p = Padding{}; - constexpr auto ns = p.size() / 2; - auto result = array(index_int{0}); - for(index_int i = 0; i < ns; i++) - result[i + 2] = p[i]; - return result; + constexpr auto p = Padding{}; + constexpr auto ns = p.size() / 2; + return generate_const_array(ns + 2, [](auto i) { + if(i < 2) + return index_int{0}; + else + return p[i - 2]; }); } // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w) static constexpr auto total_padding() { - return return_array_c([] { - constexpr auto p = Padding{}; - constexpr auto ns = p.size() / 2; - auto result = array(index_int{0}); - for(index_int i = 0; i < ns; i++) - result[i + 2] = p[i] + p[i + ns]; - return result; + constexpr auto p = Padding{}; + constexpr auto ns = p.size() / 2; + return generate_const_array(ns + 2, [](auto i) { + if(i < 2) + return index_int{0}; + else + return p[i - 2] + p[i - 2 + ns]; }); } From 604d408d462e489ed56a35d86dd632df02d8087b Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:13:04 -0500 Subject: [PATCH 71/84] Use generate array --- .../migraphx/kernels/integral_constant.hpp | 3 ++ .../migraphx/kernels/spatial_tiler.hpp | 46 +++++++++++-------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp index 74a4aa51cb5..e444ebd7107 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp @@ -131,6 +131,9 @@ struct is_integral_constant> : true_type template using index_constant = integral_constant; +template +static constexpr auto index_c = index_constant{}; + template static constexpr auto _c = integral_constant{}; // NOLINT diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index ffeb67e2267..2e756c18a81 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -79,29 +79,37 @@ struct spatial_tiler static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); } + static constexpr auto get_padding() + { + if constexpr(Padding{}.empty()) + return transform(TileLens{}, [](auto) { return index_int{0}; }); + else + return Padding{}; + } + // Left (begin) padding per dim: (0, 0, left_h, left_w) static constexpr auto left_padding() { - constexpr auto p = Padding{}; + constexpr auto p = get_padding(); constexpr auto ns = p.size() / 2; - return generate_const_array(ns + 2, [](auto i) { - if(i < 2) - return index_int{0}; + return generate_const_array(_c, [&](auto i) { + if constexpr(i < 2) + return index_c<0>; else - return p[i - 2]; + return index_c; }); } // Total (left+right) padding per dim: (0, 0, left_h+right_h, left_w+right_w) static constexpr auto total_padding() { - constexpr auto p = Padding{}; + constexpr auto p = get_padding(); constexpr auto ns = p.size() / 2; - return generate_const_array(ns + 2, [](auto i) { - if(i < 2) - return index_int{0}; + return generate_const_array(_c, [&](auto i) { + if constexpr(i < 2) + return index_c<0>; else - return p[i - 2] + p[i - 2 + ns]; + return index_c; }); } @@ -114,19 +122,19 @@ struct spatial_tiler static constexpr auto halo_lens_for() { constexpr auto halo_extra = [] { - if constexpr(has_conv_padding()) - { + // if constexpr(has_conv_padding()) + // { return return_array_c([] { return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + total_padding(); }); - } - else - { - constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; - return transform( - input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); - } + // } + // else + // { + // constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; + // return transform( + // input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); + // } }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } From 5fc446ab1a784dba37e6295a337a3d2a8ce97a0e Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:13:13 -0500 Subject: [PATCH 72/84] Format --- .../migraphx/kernels/integral_constant.hpp | 2 +- .../migraphx/kernels/spatial_tiler.hpp | 26 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp index e444ebd7107..9d48717bbd8 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp @@ -131,7 +131,7 @@ struct is_integral_constant> : true_type template using index_constant = integral_constant; -template +template static constexpr auto index_c = index_constant{}; template diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 2e756c18a81..e4f457ac441 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -79,7 +79,7 @@ struct spatial_tiler static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); } - static constexpr auto get_padding() + static constexpr auto get_padding() { if constexpr(Padding{}.empty()) return transform(TileLens{}, [](auto) { return index_int{0}; }); @@ -121,21 +121,23 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - constexpr auto halo_extra = [] { - // if constexpr(has_conv_padding()) - // { + constexpr auto halo_extra = + [] { + // if constexpr(has_conv_padding()) + // { return return_array_c([] { return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + total_padding(); }); - // } - // else - // { - // constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; - // return transform( - // input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; }); - // } - }(); + // } + // else + // { + // constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; + // return transform( + // input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; + // }); + // } + }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } From 21442c45de0e2c880bc2df9ef741cbd183cedf1b Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:14:48 -0500 Subject: [PATCH 73/84] Add padding tests --- test/verify/test_channelwise_conv_padding.cpp | 46 +++++++++++++++++ .../test_channelwise_conv_padding_1d.cpp | 50 +++++++++++++++++++ .../test_channelwise_conv_padding_5x5.cpp | 47 +++++++++++++++++ ...channelwise_conv_padding_non_divisible.cpp | 47 +++++++++++++++++ .../test_channelwise_conv_padding_relu.cpp | 48 ++++++++++++++++++ 5 files changed, 238 insertions(+) create mode 100644 test/verify/test_channelwise_conv_padding.cpp create mode 100644 test/verify/test_channelwise_conv_padding_1d.cpp create mode 100644 test/verify/test_channelwise_conv_padding_5x5.cpp create mode 100644 test/verify/test_channelwise_conv_padding_non_divisible.cpp create mode 100644 test/verify/test_channelwise_conv_padding_relu.cpp diff --git a/test/verify/test_channelwise_conv_padding.cpp b/test/verify/test_channelwise_conv_padding.cpp new file mode 100644 index 00000000000..fa38209e455 --- /dev/null +++ b/test/verify/test_channelwise_conv_padding.cpp @@ -0,0 +1,46 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_padding : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 8, 8}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3, 3}}); + mm->add_instruction( + migraphx::make_op("convolution", {{"group", 4}, {"padding", {1, 1}}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_padding; +template struct test_channelwise_conv_padding; diff --git a/test/verify/test_channelwise_conv_padding_1d.cpp b/test/verify/test_channelwise_conv_padding_1d.cpp new file mode 100644 index 00000000000..6094fc98bb7 --- /dev/null +++ b/test/verify/test_channelwise_conv_padding_1d.cpp @@ -0,0 +1,50 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_padding_1d : + verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {2, 4, 16}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {4, 1, 3}}); + mm->add_instruction( + migraphx::make_op("convolution", + {{"padding", {1}}, {"stride", {1}}, {"dilation", {1}}, {"group", 4}}), + input, + weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_padding_1d; +template struct test_channelwise_conv_padding_1d; diff --git a/test/verify/test_channelwise_conv_padding_5x5.cpp b/test/verify/test_channelwise_conv_padding_5x5.cpp new file mode 100644 index 00000000000..49652dd5ab4 --- /dev/null +++ b/test/verify/test_channelwise_conv_padding_5x5.cpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_padding_5x5 : + verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 5, 5}}); + mm->add_instruction( + migraphx::make_op("convolution", {{"group", 8}, {"padding", {2, 2}}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_padding_5x5; +template struct test_channelwise_conv_padding_5x5; diff --git a/test/verify/test_channelwise_conv_padding_non_divisible.cpp b/test/verify/test_channelwise_conv_padding_non_divisible.cpp new file mode 100644 index 00000000000..e2d643575bf --- /dev/null +++ b/test/verify/test_channelwise_conv_padding_non_divisible.cpp @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_padding_non_divisible : + verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 30, 30}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); + mm->add_instruction( + migraphx::make_op("convolution", {{"group", 8}, {"padding", {1, 1}}}), input, weights); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_padding_non_divisible; +template struct test_channelwise_conv_padding_non_divisible; diff --git a/test/verify/test_channelwise_conv_padding_relu.cpp b/test/verify/test_channelwise_conv_padding_relu.cpp new file mode 100644 index 00000000000..fe1de7b8fa2 --- /dev/null +++ b/test/verify/test_channelwise_conv_padding_relu.cpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_channelwise_conv_padding_relu : + verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto input = mm->add_parameter("x", migraphx::shape{DType, {1, 8, 12, 12}}); + auto weights = mm->add_parameter("w", migraphx::shape{DType, {8, 1, 3, 3}}); + auto conv = mm->add_instruction( + migraphx::make_op("convolution", {{"group", 8}, {"padding", {1, 1}}}), input, weights); + mm->add_instruction(migraphx::make_op("relu"), conv); + return p; + } + std::string section() const { return "conv"; } +}; +template struct test_channelwise_conv_padding_relu; +template struct test_channelwise_conv_padding_relu; From 371f79b13e77df85303bc93cb1196a725a3f45ca Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:14:51 -0500 Subject: [PATCH 74/84] Format --- test/verify/test_channelwise_conv_padding_1d.cpp | 3 +-- test/verify/test_channelwise_conv_padding_5x5.cpp | 3 +-- test/verify/test_channelwise_conv_padding_non_divisible.cpp | 4 ++-- test/verify/test_channelwise_conv_padding_relu.cpp | 4 ++-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/test/verify/test_channelwise_conv_padding_1d.cpp b/test/verify/test_channelwise_conv_padding_1d.cpp index 6094fc98bb7..7e4c5f3d170 100644 --- a/test/verify/test_channelwise_conv_padding_1d.cpp +++ b/test/verify/test_channelwise_conv_padding_1d.cpp @@ -28,8 +28,7 @@ #include template -struct test_channelwise_conv_padding_1d : - verify_program> +struct test_channelwise_conv_padding_1d : verify_program> { migraphx::program create_program() const { diff --git a/test/verify/test_channelwise_conv_padding_5x5.cpp b/test/verify/test_channelwise_conv_padding_5x5.cpp index 49652dd5ab4..4fcf2ce218b 100644 --- a/test/verify/test_channelwise_conv_padding_5x5.cpp +++ b/test/verify/test_channelwise_conv_padding_5x5.cpp @@ -28,8 +28,7 @@ #include template -struct test_channelwise_conv_padding_5x5 : - verify_program> +struct test_channelwise_conv_padding_5x5 : verify_program> { migraphx::program create_program() const { diff --git a/test/verify/test_channelwise_conv_padding_non_divisible.cpp b/test/verify/test_channelwise_conv_padding_non_divisible.cpp index e2d643575bf..4a1fdde33cf 100644 --- a/test/verify/test_channelwise_conv_padding_non_divisible.cpp +++ b/test/verify/test_channelwise_conv_padding_non_divisible.cpp @@ -28,8 +28,8 @@ #include template -struct test_channelwise_conv_padding_non_divisible : - verify_program> +struct test_channelwise_conv_padding_non_divisible + : verify_program> { migraphx::program create_program() const { diff --git a/test/verify/test_channelwise_conv_padding_relu.cpp b/test/verify/test_channelwise_conv_padding_relu.cpp index fe1de7b8fa2..2d934d39ac0 100644 --- a/test/verify/test_channelwise_conv_padding_relu.cpp +++ b/test/verify/test_channelwise_conv_padding_relu.cpp @@ -28,8 +28,8 @@ #include template -struct test_channelwise_conv_padding_relu : - verify_program> +struct test_channelwise_conv_padding_relu + : verify_program> { migraphx::program create_program() const { From 8949117b741c15b16388bbf711c5b02687d27220 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:35:50 -0500 Subject: [PATCH 75/84] Update is_padded() check --- .../migraphx/kernels/spatial_tiler.hpp | 33 +++++-------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index e4f457ac441..8591152bb4b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -72,13 +72,6 @@ struct spatial_tiler static constexpr index_int tiles_total() { return tiles_per_dim().product(); } static constexpr auto ndim() { return out_spatial_lens().size(); } - static constexpr bool is_padded() - { - return (out_spatial_lens() != tiles_per_dim() * output_lens()); - } - - static constexpr bool has_conv_padding() { return has_nonzero(Padding{}); } - static constexpr auto get_padding() { if constexpr(Padding{}.empty()) @@ -113,6 +106,11 @@ struct spatial_tiler }); } + static constexpr bool is_padded() + { + return (out_spatial_lens() != (tiles_per_dim() * output_lens() + total_padding())); + } + index idx; array tile_origin; @@ -123,20 +121,10 @@ struct spatial_tiler { constexpr auto halo_extra = [] { - // if constexpr(has_conv_padding()) - // { return return_array_c([] { return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + total_padding(); }); - // } - // else - // { - // constexpr auto input_spatial = make_slice(InputShape{}, keep_spatial()).lens; - // return transform( - // input_spatial, out_spatial_lens(), [](auto is, auto os) { return is - os; - // }); - // } }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } @@ -177,19 +165,14 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; - if constexpr(has_conv_padding()) + auto input_pos = src_pos - left_padding(); + if constexpr(is_padded()) { - constexpr auto pad = left_padding(); - auto input_pos = src_pos - pad; smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; } - else if constexpr(is_padded()) - { - smem[i] = in_bounds(src_pos, input_spatial) ? type{input_ch[src_pos]} : type{0}; - } else { - smem[i] = input_ch[src_pos]; + smem[i] = input_ch[input_pos]; } }); From be32bda8ac8f969ba7f9cf8937a504d6c71b557c Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:35:54 -0500 Subject: [PATCH 76/84] Format --- .../include/migraphx/kernels/spatial_tiler.hpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 8591152bb4b..6c8a9a0f125 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -119,13 +119,12 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - constexpr auto halo_extra = - [] { - return return_array_c([] { - return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + - total_padding(); - }); - }(); + constexpr auto halo_extra = [] { + return return_array_c([] { + return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + + total_padding(); + }); + }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } @@ -165,7 +164,7 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; - auto input_pos = src_pos - left_padding(); + auto input_pos = src_pos - left_padding(); if constexpr(is_padded()) { smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; From 4f5221e8fdfd73e24f583336c2009140e56d1da6 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:57:35 -0500 Subject: [PATCH 77/84] Add unit tests --- .../migraphx/kernels/spatial_tiler.hpp | 28 +- test/gpu/kernels/spatial_tiler.cpp | 329 ++++++++++++++++++ 2 files changed, 346 insertions(+), 11 deletions(-) create mode 100644 test/gpu/kernels/spatial_tiler.cpp diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 6c8a9a0f125..9c98a269ee7 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -39,7 +39,7 @@ constexpr bool has_nonzero(index_ints) return ((Ps != 0) or ...); } -template > +template > struct spatial_tiler { static constexpr auto keep_spatial() @@ -74,10 +74,15 @@ struct spatial_tiler static constexpr auto get_padding() { - if constexpr(Padding{}.empty()) - return transform(TileLens{}, [](auto) { return index_int{0}; }); + if constexpr(Padding{}.size() < 2) + { + auto pre = transform(TileLens{}, [](auto) { return index_c<0>; }); + return join(pre, pre); + } else + { return Padding{}; + } } // Left (begin) padding per dim: (0, 0, left_h, left_w) @@ -119,12 +124,13 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - constexpr auto halo_extra = [] { - return return_array_c([] { - return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + - total_padding(); - }); - }(); + constexpr auto halo_extra = + [] { + return return_array_c([] { + return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + + total_padding(); + }); + }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } @@ -164,7 +170,7 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; - auto input_pos = src_pos - left_padding(); + auto input_pos = src_pos - left_padding(); if constexpr(is_padded()) { smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; @@ -195,7 +201,7 @@ struct spatial_tiler } }; -template > +template > __device__ auto make_spatial_tiler(index idx, TileLens, OutputShape, Padding = {}) { using tiler_type = spatial_tiler; diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp new file mode 100644 index 00000000000..dd0d5950c4e --- /dev/null +++ b/test/gpu/kernels/spatial_tiler.cpp @@ -0,0 +1,329 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#include +#include + +// Helper: create a standard 4D shape from lens +template +constexpr auto make_4d_shape() +{ + constexpr auto lens = migraphx::index_ints{}; + return migraphx::make_shape(lens); +} + +// ======== output_lens ======== + +// Tile {4, 4} with NTiles=1 → output_lens = {1, 1, 4, 4} +TEST_CASE(output_lens_ntiles_1) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + constexpr auto ol = tiler::output_lens(); + EXPECT(ol.size() == 4); + EXPECT(ol[0] == 1); + EXPECT(ol[1] == 1); + EXPECT(ol[2] == 4); + EXPECT(ol[3] == 4); +} + +// Tile {4, 4} with NTiles=2 → last dim doubled: {1, 1, 4, 8} +TEST_CASE(output_lens_ntiles_2) +{ + using tiler = migraphx::spatial_tiler<2, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + constexpr auto ol = tiler::output_lens(); + EXPECT(ol[2] == 4); + EXPECT(ol[3] == 8); +} + +// ======== out_spatial_lens ======== + +TEST_CASE(out_spatial_lens_basic) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<2, 3, 16, 16>())>; + constexpr auto sl = tiler::out_spatial_lens(); + // keep_spatial sets dims 0,1 to 1; keeps H,W + EXPECT(sl[0] == 1); + EXPECT(sl[1] == 1); + EXPECT(sl[2] == 16); + EXPECT(sl[3] == 16); +} + +// ======== tiles_per_dim ======== + +// 8x8 output, 4x4 tile, NTiles=1 → ceil(8/4)=2 per spatial dim +TEST_CASE(tiles_per_dim_exact) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + constexpr auto tpd = tiler::tiles_per_dim(); + EXPECT(tpd[2] == 2); + EXPECT(tpd[3] == 2); +} + +// 10x10 output, 4x4 tile → ceil(10/4)=3 per spatial dim +TEST_CASE(tiles_per_dim_inexact) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 10, 10>())>; + constexpr auto tpd = tiler::tiles_per_dim(); + EXPECT(tpd[2] == 3); + EXPECT(tpd[3] == 3); +} + +// NTiles=2 scales last dim: tile output is {4, 8} → ceil(16/4)=4, ceil(16/8)=2 +TEST_CASE(tiles_per_dim_ntiles) +{ + using tiler = migraphx::spatial_tiler<2, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 16, 16>())>; + constexpr auto tpd = tiler::tiles_per_dim(); + EXPECT(tpd[2] == 4); + EXPECT(tpd[3] == 2); +} + +// ======== tiles_total ======== + +TEST_CASE(tiles_total_exact) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + // tiles_per_dim = {1, 1, 2, 2}, product = 4 + EXPECT(tiler::tiles_total() == 4); +} + +// ======== get_padding / left_padding / total_padding ======== + +// No Padding arg → get_padding returns zeros matching TileLens size +TEST_CASE(get_padding_default) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + constexpr auto gp = tiler::get_padding(); + EXPECT(gp.size() == 4); + EXPECT(gp[0] == 0); + EXPECT(gp[1] == 0); + EXPECT(gp[2] == 0); + EXPECT(gp[3] == 0); +} + +// No padding template arg → all zeros +TEST_CASE(padding_default_no_padding) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + constexpr auto lp = tiler::left_padding(); + constexpr auto tp = tiler::total_padding(); + EXPECT(lp[0] == 0); + EXPECT(lp[1] == 0); + EXPECT(lp[2] == 0); + EXPECT(lp[3] == 0); + EXPECT(tp[0] == 0); + EXPECT(tp[1] == 0); + EXPECT(tp[2] == 0); + EXPECT(tp[3] == 0); +} + +// Symmetric padding {1, 1, 1, 1} → left={0,0,1,1}, total={0,0,2,2} +TEST_CASE(padding_symmetric) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<1, 1, 1, 1>>; + constexpr auto lp = tiler::left_padding(); + EXPECT(lp[0] == 0); + EXPECT(lp[1] == 0); + EXPECT(lp[2] == 1); + EXPECT(lp[3] == 1); + + constexpr auto tp = tiler::total_padding(); + EXPECT(tp[0] == 0); + EXPECT(tp[1] == 0); + EXPECT(tp[2] == 2); + EXPECT(tp[3] == 2); +} + +// Asymmetric padding {1, 2, 3, 4} → left={0,0,1,2}, total={0,0,1+3,2+4}={0,0,4,6} +TEST_CASE(padding_asymmetric) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<1, 2, 3, 4>>; + constexpr auto lp = tiler::left_padding(); + EXPECT(lp[2] == 1); + EXPECT(lp[3] == 2); + + constexpr auto tp = tiler::total_padding(); + EXPECT(tp[2] == 4); + EXPECT(tp[3] == 6); +} + +// ======== is_padded ======== + +// Tiles exactly cover output, no conv padding → not padded +TEST_CASE(is_padded_exact_no_padding) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + EXPECT(not tiler::is_padded()); +} + +// Tiles don't exactly cover output (10 not divisible by 4) → padded +TEST_CASE(is_padded_overhang) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 10, 10>())>; + EXPECT(tiler::is_padded()); +} + +// Tiles exactly cover output but conv padding present → padded +TEST_CASE(is_padded_conv_padding_exact_tiles) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<1, 1, 1, 1>>; + EXPECT(tiler::is_padded()); +} + +// Both overhang and conv padding → padded +TEST_CASE(is_padded_overhang_and_conv_padding) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 10, 10>()), + migraphx::index_ints<1, 1, 1, 1>>; + EXPECT(tiler::is_padded()); +} + +// Edge case: tile overhang equals total padding → still padded +// out_spatial=10, tile=8, tiles_per_dim=2, tiles*tile=16, total_pad=6 +// Without the fix: 10 != 16 → padded (only by coincidence). +// With total_padding in formula: 10 != 16+6=22 → padded. +TEST_CASE(is_padded_overhang_equals_padding) +{ + // tiles_per_dim = ceil(10/8) = 2, coverage = 16, total_pad_h=3+3=6 + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<8, 8>, + decltype(make_4d_shape<1, 1, 10, 10>()), + migraphx::index_ints<3, 3, 3, 3>>; + EXPECT(tiler::is_padded()); +} + +// Only one spatial dim has overhang +TEST_CASE(is_padded_partial_overhang) +{ + // H=8 exactly tiled by tile_h=4. W=10 not divisible by tile_w=4. + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 10>())>; + EXPECT(tiler::is_padded()); +} + +// Large padding values +TEST_CASE(is_padded_large_padding) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<3, 3, 3, 3>>; + EXPECT(tiler::is_padded()); +} + +// ======== has_nonzero ======== + +TEST_CASE(has_nonzero_all_zero) +{ + EXPECT(not migraphx::has_nonzero(migraphx::index_ints<0, 0, 0, 0>{})); +} + +TEST_CASE(has_nonzero_some_nonzero) +{ + EXPECT(migraphx::has_nonzero(migraphx::index_ints<0, 0, 1, 0>{})); +} + +TEST_CASE(has_nonzero_all_nonzero) +{ + EXPECT(migraphx::has_nonzero(migraphx::index_ints<1, 2, 3, 4>{})); +} + +// ======== halo_lens_for ======== + +// No padding: halo = output_lens + (input_spatial - out_spatial) +TEST_CASE(halo_lens_no_padding) +{ + // Output 8x8, input 10x10 (e.g. 3x3 conv), tile 4x4 + // out_spatial = {1,1,8,8}, input_spatial = {1,1,10,10} + // halo_extra = {1,1,10,10} - {1,1,8,8} + {0,0,0,0} = {0,0,2,2} + // halo_lens = output_lens + halo_extra = {1,1,4,4} + {0,0,2,2} = {1,1,6,6} + using output_shape = decltype(make_4d_shape<1, 1, 8, 8>()); + using input_shape = decltype(make_4d_shape<1, 1, 10, 10>()); + using tiler = migraphx::spatial_tiler<1, migraphx::index_ints<4, 4>, output_shape>; + + constexpr auto hl = tiler::template halo_lens_for(); + EXPECT(hl[2] == 6); + EXPECT(hl[3] == 6); +} + +// With padding: halo = output_lens + (input_spatial - out_spatial + total_padding) +TEST_CASE(halo_lens_with_padding) +{ + // Output 8x8, input 8x8 (same-padding conv), pad {1,1,1,1} → total_pad={0,0,2,2} + // halo_extra = {1,1,8,8} - {1,1,8,8} + {0,0,2,2} = {0,0,2,2} + // halo_lens = {1,1,4,4} + {0,0,2,2} = {1,1,6,6} + using output_shape = decltype(make_4d_shape<1, 1, 8, 8>()); + using input_shape = decltype(make_4d_shape<1, 1, 8, 8>()); + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + output_shape, + migraphx::index_ints<1, 1, 1, 1>>; + + constexpr auto hl = tiler::template halo_lens_for(); + EXPECT(hl[2] == 6); + EXPECT(hl[3] == 6); +} + +// ======== ndim ======== + +TEST_CASE(ndim_4d) +{ + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>())>; + EXPECT(tiler::ndim() == 4); +} From b0e4634232dcecc10bfba2b83f82d6af5cb3a728 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 7 Apr 2026 19:57:38 -0500 Subject: [PATCH 78/84] Format --- .../migraphx/kernels/spatial_tiler.hpp | 17 ++-- test/gpu/kernels/spatial_tiler.cpp | 92 +++++++++---------- 2 files changed, 49 insertions(+), 60 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp index 9c98a269ee7..bc89ef88268 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/spatial_tiler.hpp @@ -74,7 +74,7 @@ struct spatial_tiler static constexpr auto get_padding() { - if constexpr(Padding{}.size() < 2) + if constexpr(Padding{}.size() < 2) { auto pre = transform(TileLens{}, [](auto) { return index_c<0>; }); return join(pre, pre); @@ -124,13 +124,12 @@ struct spatial_tiler template static constexpr auto halo_lens_for() { - constexpr auto halo_extra = - [] { - return return_array_c([] { - return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + - total_padding(); - }); - }(); + constexpr auto halo_extra = [] { + return return_array_c([] { + return make_slice(InputShape{}, keep_spatial()).lens - out_spatial_lens() + + total_padding(); + }); + }(); return transform(output_lens(), halo_extra, [](auto o, auto h) { return o + h; }); } @@ -170,7 +169,7 @@ struct spatial_tiler idx.local_stride(_c, [&](auto i) { auto halo_multi = halo_shape.multi(i); auto src_pos = tile_origin + halo_multi; - auto input_pos = src_pos - left_padding(); + auto input_pos = src_pos - left_padding(); if constexpr(is_padded()) { smem[i] = in_bounds(input_pos, input_spatial) ? type{input_ch[input_pos]} : type{0}; diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp index dd0d5950c4e..2cecb75d242 100644 --- a/test/gpu/kernels/spatial_tiler.cpp +++ b/test/gpu/kernels/spatial_tiler.cpp @@ -26,7 +26,10 @@ #include // Helper: create a standard 4D shape from lens -template +template constexpr auto make_4d_shape() { constexpr auto lens = migraphx::index_ints{}; @@ -38,9 +41,8 @@ constexpr auto make_4d_shape() // Tile {4, 4} with NTiles=1 → output_lens = {1, 1, 4, 4} TEST_CASE(output_lens_ntiles_1) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; constexpr auto ol = tiler::output_lens(); EXPECT(ol.size() == 4); EXPECT(ol[0] == 1); @@ -52,9 +54,8 @@ TEST_CASE(output_lens_ntiles_1) // Tile {4, 4} with NTiles=2 → last dim doubled: {1, 1, 4, 8} TEST_CASE(output_lens_ntiles_2) { - using tiler = migraphx::spatial_tiler<2, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<2, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; constexpr auto ol = tiler::output_lens(); EXPECT(ol[2] == 4); EXPECT(ol[3] == 8); @@ -64,9 +65,8 @@ TEST_CASE(output_lens_ntiles_2) TEST_CASE(out_spatial_lens_basic) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<2, 3, 16, 16>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<2, 3, 16, 16>())>; constexpr auto sl = tiler::out_spatial_lens(); // keep_spatial sets dims 0,1 to 1; keeps H,W EXPECT(sl[0] == 1); @@ -80,9 +80,8 @@ TEST_CASE(out_spatial_lens_basic) // 8x8 output, 4x4 tile, NTiles=1 → ceil(8/4)=2 per spatial dim TEST_CASE(tiles_per_dim_exact) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; constexpr auto tpd = tiler::tiles_per_dim(); EXPECT(tpd[2] == 2); EXPECT(tpd[3] == 2); @@ -91,9 +90,8 @@ TEST_CASE(tiles_per_dim_exact) // 10x10 output, 4x4 tile → ceil(10/4)=3 per spatial dim TEST_CASE(tiles_per_dim_inexact) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 10, 10>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 10, 10>())>; constexpr auto tpd = tiler::tiles_per_dim(); EXPECT(tpd[2] == 3); EXPECT(tpd[3] == 3); @@ -102,9 +100,8 @@ TEST_CASE(tiles_per_dim_inexact) // NTiles=2 scales last dim: tile output is {4, 8} → ceil(16/4)=4, ceil(16/8)=2 TEST_CASE(tiles_per_dim_ntiles) { - using tiler = migraphx::spatial_tiler<2, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 16, 16>())>; + using tiler = migraphx:: + spatial_tiler<2, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 16, 16>())>; constexpr auto tpd = tiler::tiles_per_dim(); EXPECT(tpd[2] == 4); EXPECT(tpd[3] == 2); @@ -114,9 +111,8 @@ TEST_CASE(tiles_per_dim_ntiles) TEST_CASE(tiles_total_exact) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; // tiles_per_dim = {1, 1, 2, 2}, product = 4 EXPECT(tiler::tiles_total() == 4); } @@ -126,9 +122,8 @@ TEST_CASE(tiles_total_exact) // No Padding arg → get_padding returns zeros matching TileLens size TEST_CASE(get_padding_default) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; constexpr auto gp = tiler::get_padding(); EXPECT(gp.size() == 4); EXPECT(gp[0] == 0); @@ -140,9 +135,8 @@ TEST_CASE(get_padding_default) // No padding template arg → all zeros TEST_CASE(padding_default_no_padding) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; constexpr auto lp = tiler::left_padding(); constexpr auto tp = tiler::total_padding(); EXPECT(lp[0] == 0); @@ -158,10 +152,10 @@ TEST_CASE(padding_default_no_padding) // Symmetric padding {1, 1, 1, 1} → left={0,0,1,1}, total={0,0,2,2} TEST_CASE(padding_symmetric) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>()), - migraphx::index_ints<1, 1, 1, 1>>; + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<1, 1, 1, 1>>; constexpr auto lp = tiler::left_padding(); EXPECT(lp[0] == 0); EXPECT(lp[1] == 0); @@ -178,10 +172,10 @@ TEST_CASE(padding_symmetric) // Asymmetric padding {1, 2, 3, 4} → left={0,0,1,2}, total={0,0,1+3,2+4}={0,0,4,6} TEST_CASE(padding_asymmetric) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>()), - migraphx::index_ints<1, 2, 3, 4>>; + using tiler = migraphx::spatial_tiler<1, + migraphx::index_ints<4, 4>, + decltype(make_4d_shape<1, 1, 8, 8>()), + migraphx::index_ints<1, 2, 3, 4>>; constexpr auto lp = tiler::left_padding(); EXPECT(lp[2] == 1); EXPECT(lp[3] == 2); @@ -196,18 +190,16 @@ TEST_CASE(padding_asymmetric) // Tiles exactly cover output, no conv padding → not padded TEST_CASE(is_padded_exact_no_padding) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; EXPECT(not tiler::is_padded()); } // Tiles don't exactly cover output (10 not divisible by 4) → padded TEST_CASE(is_padded_overhang) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 10, 10>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 10, 10>())>; EXPECT(tiler::is_padded()); } @@ -249,9 +241,8 @@ TEST_CASE(is_padded_overhang_equals_padding) TEST_CASE(is_padded_partial_overhang) { // H=8 exactly tiled by tile_h=4. W=10 not divisible by tile_w=4. - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 10>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 10>())>; EXPECT(tiler::is_padded()); } @@ -309,9 +300,9 @@ TEST_CASE(halo_lens_with_padding) using output_shape = decltype(make_4d_shape<1, 1, 8, 8>()); using input_shape = decltype(make_4d_shape<1, 1, 8, 8>()); using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - output_shape, - migraphx::index_ints<1, 1, 1, 1>>; + migraphx::index_ints<4, 4>, + output_shape, + migraphx::index_ints<1, 1, 1, 1>>; constexpr auto hl = tiler::template halo_lens_for(); EXPECT(hl[2] == 6); @@ -322,8 +313,7 @@ TEST_CASE(halo_lens_with_padding) TEST_CASE(ndim_4d) { - using tiler = migraphx::spatial_tiler<1, - migraphx::index_ints<4, 4>, - decltype(make_4d_shape<1, 1, 8, 8>())>; + using tiler = migraphx:: + spatial_tiler<1, migraphx::index_ints<4, 4>, decltype(make_4d_shape<1, 1, 8, 8>())>; EXPECT(tiler::ndim() == 4); } From eecf7855f4182812f7bfc7662c9e5ba45a5836a0 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 22 Apr 2026 11:04:22 -0500 Subject: [PATCH 79/84] Fix tidy --- test/gpu/kernels/spatial_tiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp index 2cecb75d242..6ce858b83f3 100644 --- a/test/gpu/kernels/spatial_tiler.cpp +++ b/test/gpu/kernels/spatial_tiler.cpp @@ -30,7 +30,7 @@ template -constexpr auto make_4d_shape() +static constexpr auto make_4d_shape() { constexpr auto lens = migraphx::index_ints{}; return migraphx::make_shape(lens); From a3b61a2b5471c8c417adb368f2d0ec4ff02448c1 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 22 Apr 2026 12:39:47 -0500 Subject: [PATCH 80/84] Fix cppcheck warnings --- .../include/migraphx/kernels/debug.hpp | 9 +++++++++ .../include/migraphx/kernels/float8.hpp | 20 ++++++++----------- .../include/migraphx/kernels/float8_impl.hpp | 1 + .../include/migraphx/kernels/slice.hpp | 3 +-- test/gpu/kernels/spatial_tiler.cpp | 2 ++ 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp index 5e5e16b1315..3e7fffaa2f5 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp @@ -206,6 +206,14 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l #define MIGRAPHX_CHECK(cond) \ MIGRAPHX_ASSERT_FAIL(cond, #cond, __FILE__, __LINE__, __PRETTY_FUNCTION__) +#ifdef CPPCHECK +// NOLINTNEXTLINE +#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) T +#define MIGRAPHX_ASSUME assert(cond) +#define MIGRAPHX_UNREACHABLE assert(false) +#define MIGRAPHX_ASSERT(cond) assert(cond) +#define MIGRAPHX_WARN(cond, ...) assert(cond) +#else #ifdef MIGRAPHX_DEBUG // NOLINTNEXTLINE #define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) source_location_capture @@ -221,6 +229,7 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l #define MIGRAPHX_ASSERT(cond) #define MIGRAPHX_WARN(...) #endif +#endif #define MIGRAPHX_STATIC_ASSERT_FOR(...) \ static_assert(__VA_ARGS__); \ diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp index 43ee2ca5d87..527a0c7915b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp @@ -59,7 +59,7 @@ class numeric_limits; template struct float8 { - uint8_t data; + uint8_t data = 0; // default constructor __device__ constexpr float8() = default; // default copy constructor @@ -140,7 +140,7 @@ struct float8 migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard, uint32_t rng = 0) { - if(__builtin_is_constant_evaluated() or !FNUZ) + if(__builtin_is_constant_evaluated() or not FNUZ) { if constexpr(T == migraphx::fp8::f8_type::fp8) { @@ -249,7 +249,7 @@ struct float8 // upcast using device specific intrinsic constexpr __device__ operator float() const { - if(__builtin_is_constant_evaluated() or !FNUZ) + if(__builtin_is_constant_evaluated() or not FNUZ) { if constexpr(T == migraphx::fp8::f8_type::fp8) { @@ -261,7 +261,7 @@ struct float8 else { float fval = 0; - uint32_t i32val = static_cast(data); + uint32_t i32val = data; // upcast if constexpr(T == migraphx::fp8::f8_type::fp8) @@ -312,7 +312,7 @@ struct float8 } else { - if(T == migraphx::fp8::f8_type::bf8) + if constexpr(T == migraphx::fp8::f8_type::bf8) { return (data == 0x7D) or (data == 0x7E) or (data == 0x7F) or (data == 0xFD) or (data == 0xFE) or (data == 0xFF); @@ -333,7 +333,7 @@ struct float8 } else { - if(T == migraphx::fp8::f8_type::bf8) + if constexpr(T == migraphx::fp8::f8_type::bf8) { return (data == 0x7C) or (data == 0xFC); } @@ -370,16 +370,12 @@ struct float8 __device__ constexpr bool operator<(const float8& rhs) const { - const auto we = static_cast(*this); - const auto them = static_cast(rhs); - return we < them; + return static_cast(*this) < static_cast(rhs); } __device__ constexpr bool operator>(const float8& rhs) const { - const auto we = static_cast(*this); - const auto them = static_cast(rhs); - return we > them; + return static_cast(*this) > static_cast(rhs); } }; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp index 09ab146fbed..9fbe5e6f740 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp @@ -118,6 +118,7 @@ __device__ constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng if(x == 0) return 0; // handle negative zero + // cppcheck-suppress compareValueOutOfTypeRangeError else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000)) { return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp index 89f1a4a615e..f7adee4eec5 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/slice.hpp @@ -98,13 +98,12 @@ template constexpr auto slice_group() { return slice_size_transform{[](auto input, auto s) { - auto r = return_array_c([] { + return return_array_c([] { auto lens = decltype(s){}.lens.base(); lens.back() *= N; lens -= 1; return decltype(input){}.lens.carry(lens) + index_int{1}; }); - return r; }}; } diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp index 6ce858b83f3..76875ec7047 100644 --- a/test/gpu/kernels/spatial_tiler.cpp +++ b/test/gpu/kernels/spatial_tiler.cpp @@ -22,9 +22,11 @@ * THE SOFTWARE. * */ +// cppcheck-suppress-file constStatement #include #include + // Helper: create a standard 4D shape from lens template Date: Wed, 22 Apr 2026 12:39:55 -0500 Subject: [PATCH 81/84] Format --- test/gpu/kernels/spatial_tiler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/gpu/kernels/spatial_tiler.cpp b/test/gpu/kernels/spatial_tiler.cpp index 76875ec7047..058eb8e4274 100644 --- a/test/gpu/kernels/spatial_tiler.cpp +++ b/test/gpu/kernels/spatial_tiler.cpp @@ -26,7 +26,6 @@ #include #include - // Helper: create a standard 4D shape from lens template Date: Wed, 22 Apr 2026 12:40:14 -0500 Subject: [PATCH 82/84] Update year --- src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp | 2 +- .../gpu/kernels/include/migraphx/kernels/integral_constant.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp index 3e7fffaa2f5..aa46782ce58 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp index 9d48717bbd8..e8f16b9d5e0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal From 7d9e8766415f9e3235b8ee830ddb2a265030466b Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 15 May 2026 14:41:50 -0500 Subject: [PATCH 83/84] Fix tile miscompilation --- .../gpu/kernels/include/migraphx/kernels/tile.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp index 1f11b214fd1..f183a91d369 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace migraphx { @@ -61,8 +62,8 @@ struct tile using type = typename T::type; constexpr auto s = pad_shape(make_packed_shape(get_shape_c{})); constexpr auto size = s.element_space(); - __shared__ type buffer[size]; - auto b = make_tensor_view(buffer, s); + __shared__ uninitialized_buffer buffer; + auto b = make_tensor_view(buffer.data(), s); local_tensor_copy(idx, x, b); f(b); }; @@ -77,8 +78,8 @@ struct tile using type = typename T::type; constexpr auto s = pad_shape(make_packed_shape(get_shape_c{})); constexpr auto size = s.element_space(); - __shared__ type buffer[size]; - auto b = make_tensor_view(buffer, s); + __shared__ uninitialized_buffer buffer; + auto b = make_tensor_view(buffer.data(), s); f(b); local_tensor_copy(idx, b, x); }; From c19c8b537315b0c94dbe0449ed6bc02091d05586 Mon Sep 17 00:00:00 2001 From: kahmed10 <15948690+kahmed10@users.noreply.github.com> Date: Wed, 20 May 2026 23:40:33 -0500 Subject: [PATCH 84/84] update license --- src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp | 2 +- src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp index 527a0c7915b..08640e9e07b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp @@ -2,7 +2,7 @@ * * The MIT License (MIT) * - * Copyright (C) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp index f183a91d369..6ccbd0ba17f 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/tile.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal