diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl index b8f8550baf3..903baabc3cf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl @@ -9,97 +9,97 @@ #version 450 core ${define_required_extensions("texture3d", DTYPE)} -${define_explicit_type_extensions(DTYPE)} #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j + ivec4 permute_dims; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); +${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")} +const int out_packed_dim = get_packed_dim(out_layout); +const int in_packed_dim = get_packed_dim(in_layout); layout(local_size_x_id = 0, 
local_size_y_id = 1, local_size_z_id = 2) in; -// Convert output tensor index to input tensor index based on permutation +// Convert output tensor index to input tensor index based on permutation. +// permute_dims[i] = j means output dim i comes from input dim j. +// We write: in_tidx[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w} +// This uses literal component access on the push constant (safe) and dynamic +// indexing into the local in_tidx variable (also safe). ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { ivec4 in_tidx; - - // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] in_tidx[permute_dims.x] = out_tidx.x; in_tidx[permute_dims.y] = out_tidx.y; in_tidx[permute_dims.z] = out_tidx.z; in_tidx[permute_dims.w] = out_tidx.w; - return in_tidx; } -// Check if we can use the fast path where texels from the input tensor can be -// copied directly into the output tensor. This occurs when the packed dimension -// is preserved in the permutation, i.e. reading a texel from the output tensor -// produces 4 texels along the same dimension as reading a texel from the input -// tensor. -bool can_use_fast_path() { - // Fast path is possible when the packed dimension is preserved in the permutation - // This means permute_dims[out_packed_dim] == in_packed_dim - return permute_dims[out_packed_dim] == in_packed_dim; -} - void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { + TensorIndex4D out_tidx = + texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout); + + // Check if packed dimension is preserved in the permutation. Use safe_idx + // to avoid dynamic indexing of push constant with spec-const-derived index. 
+ const bool fast_path = + safe_idx(permute_dims, out_packed_dim) == in_packed_dim; + + if (fast_path) { // Fast path: packed dimension is preserved, so we can copy texels directly - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { + ivec3 in_pos = + tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout); + VEC4_T in_texel = texelFetch(t_in, in_pos, 0); + + imageStore(t_out, out_pos, in_texel); + } else { // Slow path: packed dimension is not preserved, so each element of the - // output texel may be "sourced" from a different texel in the input tensor. - // Therefore each output texel element is processed individually. + // output texel may come from a different texel in the input tensor. 
VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; + for (int comp = 0; comp < 4; comp++) { + ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data); + TensorIndex4D in_tidx; + in_tidx.data = in_tidx_data; - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + TextureElementIndex in_elem = + tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout); - out_texel[texel_i] = selected_value; + VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0); + out_texel[comp] = in_texel[in_elem.comp]; - out_tidx[out_packed_dim]++; + out_tidx.data[out_packed_dim]++; } - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + imageStore(t_out, out_pos, out_texel); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index d7b06015b72..8081424cfb7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -11,17 +11,12 @@ #include #include -#include -#include #include #include namespace vkcompute { -using utils::ivec2; -using utils::ivec3; using utils::ivec4; -using utils::uvec4; namespace { @@ -32,13 +27,38 @@ void check_args( const ValueRef out) { (void)permute_dims; VK_CHECK_COND(check_same_packed_dim(graph, in, out)); - - // This implementation doesn't not requires the input tensor to have the same - // dim size as the argument. The code will work as long as the input tensor's - // dim size is shorter than the permute dim array. In this case, the code - // assume size of 1 at the higher dimensions. 
} +struct WHCNPermuteDims { +  int32_t whcn_permute_dims[api::kTensorDimLimit]; + +  void initialize(const std::vector<int64_t>& permute_dims) { +    const int32_t permute_ndim = permute_dims.size(); +    for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { +      const int32_t nchw_i = permute_ndim - 1 - whcn_i; +      int64_t index_val = permute_dims.at(nchw_i); +      if (index_val < 0) { +        index_val += permute_ndim; +      } +      const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; +      whcn_permute_dims[whcn_i] = permute_dim_whcn; +    } +    for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; +         whcn_i++) { +      whcn_permute_dims[whcn_i] = whcn_i; +    } +  } + +  int32_t pack_into_int32() const { +    VK_CHECK_COND(api::kTensorDimLimit <= 8); +    int32_t packed = 0; +    for (int32_t i = 0; i < api::kTensorDimLimit; i++) { +      packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); +    } +    return packed; +  } +}; + } // namespace void resize_permute_node( @@ -101,15 +121,36 @@ void add_permute_node( const ValueRef out) { check_args(graph, in, permute_dims, out); - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. Reverse the order of the permute array from NCHW order to WHCN order + std::string kernel_name = "permute"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + + std::vector<PushConstantDataInfo> push_constants; + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; + + // WHCN permute dims for the texture path (ivec4, max 4D). + // Declared here so its lifetime extends to the DynamicDispatchNode creation + // where push_constants references it. 
ivec4 whcn_permute_dims{0, 1, 2, 3}; - { + + if (graph.is_buffer_storage(out)) { + // Buffer path: supports up to kTensorDimLimit dims via WHCNPermuteDims, + // packed into a spec constant int + WHCNPermuteDims whcn_pd; + whcn_pd.initialize(*graph.get_int_list(permute_dims)); + spec_vars.append(whcn_pd.pack_into_int32()); + } else { + // Texture path: compute 4D WHCN permute dims and pass as push constant IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); const int32_t permute_ndim = utils::safe_downcast<int32_t>(permute_dims_ptr->size()); + VK_CHECK_COND( + permute_ndim <= 4, + "Texture storage only supports permute with up to 4 dims"); for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; nchw_i--, whcn_i++) { @@ -119,133 +160,23 @@ void add_permute_node( permute_dim_nchw += permute_ndim; } const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - whcn_permute_dims[whcn_i] = permute_dim_whcn; } - } - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers; - std::vector<PushConstantDataInfo> push_constants; - vkapi::SpecVarList spec_vars; - - const int32_t out_channels = dim_at<kChannel4D>(graph.sizes_of(out)); - const int32_t in_channels = dim_at<kChannel4D>(graph.sizes_of(in)); - - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); + push_constants.push_back( + PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))); } - push_constants = { - graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - - spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; - graph.execute_nodes().emplace_back(new 
DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), default_pick_global_wg_size, default_pick_local_wg_size, {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants + param_ubos, push_constants, - // Specialization Constants spec_vars, - // Resize Args {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -struct WHCNPermuteDims { - int32_t whcn_permute_dims[api::kTensorDimLimit]; - - void initialize(const std::vector<int64_t>& permute_dims) { - const int32_t permute_ndim = permute_dims.size(); - for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { - const int32_t nchw_i = permute_ndim - 1 - whcn_i; - int64_t index_val = permute_dims.at(nchw_i); - if (index_val < 0) { - index_val += permute_ndim; - } - const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; - whcn_i++) { - whcn_permute_dims[whcn_i] = whcn_i; - } - } - - int32_t pack_into_int32() const { - // If kTensorDimLimit is increased, we will need to send in an additional - // int. - VK_CHECK_COND(api::kTensorDimLimit <= 8); - // Packs the 8 elements in whcn_permute_dims into a single int32_t. Each - // element is packed into 4 bits. - int32_t packed = 0; - for (int32_t i = 0; i < api::kTensorDimLimit; i++) { - packed |= (whcn_permute_dims[i] & 0x0F) << (i * 4); - } - return packed; - } -}; - -void add_permute_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - WHCNPermuteDims whcn_permute_dims; - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order value to WHCN order value - // 2. 
Extend the permute array to kTensorDimLimit - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - whcn_permute_dims.initialize(*permute_dims_ptr); - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - whcn_permute_dims.pack_into_int32()}, - // Resize Args - {permute_dims}, - // Resizing Logic resize_permute_node)); } @@ -255,10 +186,7 @@ void permute(ComputeGraph& graph, const std::vector& args) { const ValueRef permute_dims = args.at(idx++); const ValueRef out = args.at(idx++); - if (graph.is_buffer_storage(args[2])) { - return add_permute_buffer_node(graph, in, permute_dims, out); - } - return add_permute_node(graph, in, permute_dims, out); + add_permute_node(graph, in, permute_dims, out); } REGISTER_OPERATORS {