Skip to content

Miscompile: OpSubgroupShuffleINTEL with OpUConvert ushort<->uint and OpBranchConditional consumer (Arc A770) #402

@pvelesko

Description

@pvelesko

Summary

On Intel Arc A770 (DG2), IGC miscompiles OpSubgroupShuffleINTEL when all of the following hold:

  1. The shuffled value is produced by OpUConvert widening %ushort to %uint
  2. The shuffle result is narrowed back with OpUConvert from %uint to %ushort
  3. The consumer of that value lives in a basic block reached only via OpBranchConditional (divergent control flow)

Observed: a small subgroup reduction over four lanes returns wrong values (e.g. first logical warp: got 5.0, expected 14.0 for a half-precision sum). The same SPIR-V with an unconditional consumer path, or a different output layout, computes the correct result.

SPIR-V validates with spirv-val.

Environment

  • GPU: Intel Arc A770
  • API: Level Zero (zeModuleCreate with ZE_MODULE_FORMAT_IL_SPIRV)
  • Reproducer: pure C++ + hand-written SPIR-V (.spvasm, assembled with spirv-as)

Minimal reproducer layout

Two modules differ minimally:

Module Behavior
pass.spv Consumer of the second shuffle is not gated the same way → PASS
fail.spv Consumer under OpBranchConditional → FAIL

Build & run (from reproducer directory):

make    # spirv-as pass.spvasm fail.spvasm; g++ -std=c++17 -O2 -o run run.cpp -lze_loader
./run   # runs pass.spv then fail.spv; expect pass PASS, fail FAIL

Kernel name: _Z3krnPK6__halfPS_

Reproducer (inline)

Self-contained: pure Level Zero host + two hand-written SPIR-V modules. Assemble with spirv-as, link host against -lze_loader.

Makefile
# spirv-tools + g++ + Level Zero (-lze_loader). Run from this directory.
# `run` is a real output file, so it is intentionally NOT declared phony;
# declaring it phony would relink the host program on every invocation.
.PHONY: all clean test
# If spirv-val rejects a module, remove the freshly assembled .spv so a later
# `make` does not treat the invalid binary as up to date.
.DELETE_ON_ERROR:
all: pass.spv fail.spv run
# Assemble each hand-written module, then validate the resulting binary.
%.spv: %.spvasm
	spirv-as --target-env spv1.1 $< -o $@
	spirv-val $@
# Host program: pure Level Zero, linked against the loader.
run: run.cpp
	$(CXX) -std=c++17 -O2 -o $@ $< -lze_loader
test: all
	./run
clean:
	rm -f pass.spv fail.spv run
run.cpp (141 lines)
// Level Zero: loads fail.spv vs pass.spv. fail: UConvert ushort<->uint around OpSubgroupShuffleINTEL
// + consumer under OpBranchConditional => wrong sum (Arc A770). pass: unconditional path => OK.

#include <level_zero/ze_api.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <vector>

// Evaluates the Level Zero call `x` once. If the result is not ZE_RESULT_SUCCESS, prints the
// source line number to stderr and terminates the process with exit status 1.
#define CHK(x) do { if ((x) != ZE_RESULT_SUCCESS) { fprintf(stderr, "L0 err @%d\n", __LINE__); exit(1); } } while (0)

// Converts an IEEE-754 binary32 value to binary16 bits.
//
// Rounds to nearest, ties to even. Values too large for half become +/-Inf,
// values too small for a half subnormal become +/-0, NaN stays NaN (quiet bit
// forced so the payload can never collapse to Inf), and results below the
// smallest normal half are emitted as subnormals.
static uint16_t f2h(float f) {
  static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits");
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);  // well-defined bit copy (no union punning)

  const uint16_t sign = uint16_t((bits >> 16) & 0x8000u);
  const uint32_t exp32 = (bits >> 23) & 0xFFu;
  uint32_t mant = bits & 0x7FFFFFu;

  if (exp32 == 0xFFu) {
    if (mant == 0) return uint16_t(sign | 0x7C00u);        // +/-Inf
    return uint16_t(sign | 0x7E00u | (mant >> 13));         // NaN, kept quiet
  }

  // Re-bias the exponent from binary32 (127) to binary16 (15).
  const int expo = int(exp32) - 127 + 15;
  if (expo >= 31) return uint16_t(sign | 0x7C00u);          // overflow -> Inf

  if (expo <= 0) {
    // Below the smallest normal half. Anything under 2^-25 rounds to zero;
    // this also covers binary32 subnormals and zero (exp32 == 0).
    if (expo < -10) return sign;
    mant |= 0x800000u;                                      // make the implicit 1 explicit
    const int shift = 14 - expo;                            // 14..24
    uint32_t sub = mant >> shift;
    const uint32_t rem = mant & ((1u << shift) - 1u);
    const uint32_t halfway = 1u << (shift - 1);
    if (rem > halfway || (rem == halfway && (sub & 1u))) ++sub;
    // A carry out of the 10-bit field lands in the exponent and correctly
    // produces the smallest normal number.
    return uint16_t(sign | sub);
  }

  uint32_t h = (uint32_t(expo) << 10) | (mant >> 13);
  const uint32_t rem = mant & 0x1FFFu;
  // A carry here may bump the exponent, up to and including Inf; both are the
  // correctly rounded results.
  if (rem > 0x1000u || (rem == 0x1000u && (h & 1u))) ++h;
  return uint16_t(sign | h);
}
// Converts IEEE-754 binary16 bits to a binary32 value. The conversion is
// exact: zero, subnormals, normals, Inf and NaN (payload preserved) are all
// representable in binary32.
static float h2f(uint16_t h) {
  static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits");
  const uint32_t sign = uint32_t(h & 0x8000u) << 16;
  int expo = (h >> 10) & 0x1F;
  uint32_t mant = h & 0x3FFu;

  uint32_t bits;
  if (expo == 0) {
    if (mant == 0) {
      bits = sign;                                          // +/-0
    } else {
      // Subnormal half: shift the mantissa up until its leading 1 reaches
      // the implicit-bit position, lowering the exponent to match.
      expo = 1;
      while ((mant & 0x400u) == 0) {
        mant <<= 1;
        --expo;
      }
      mant &= 0x3FFu;                                       // drop the now-implicit 1
      bits = sign | (uint32_t(expo - 15 + 127) << 23) | (mant << 13);
    }
  } else if (expo == 31) {
    bits = sign | 0x7F800000u | (mant << 13);               // Inf / NaN
  } else {
    bits = sign | (uint32_t(expo - 15 + 127) << 23) | (mant << 13);
  }

  float out;
  std::memcpy(&out, &bits, sizeof out);  // well-defined bit copy (no union punning)
  return out;
}

// Launch geometry: BLOCK work-items per group (passed to zeKernelSetGroupSize), BLOCKS groups
// (the ze_group_count_t x dimension), and N = BLOCK * BLOCKS 16-bit input elements in total.
static constexpr int BLOCK = 32, BLOCKS = 4, N = BLOCK * BLOCKS;

// Builds the SPIR-V module stored in file `spv`, launches the kernel
// _Z3krnPK6__halfPS_ over BLOCKS groups of BLOCK work-items, and compares the
// device output against a host-side sum of every run of four consecutive
// inputs.
//
//   ctx, dev, q  Level Zero context, device and command queue to use.
//   spv          Path of the SPIR-V binary to load.
//   out_elems    Number of 16-bit elements allocated for the output buffer.
//   dense        true:  the sum for group-of-four w is read from out[w];
//                false: it is read from out[w * 4].
//
// Returns true only when every sum matches. Returns false, after printing to
// stderr, when the file cannot be read, when out_elems is too small for the
// requested layout, or when the module fails to build. Any other Level Zero
// failure terminates the process through CHK.
static bool runCase(ze_context_handle_t ctx, ze_device_handle_t dev, ze_command_queue_handle_t q,
                    const char *spv, int out_elems, bool dense) {
  // Reject an output size that cannot hold the last slot inspected below;
  // this also catches zero/negative sizes before they are cast to size_t.
  const int total = N / 4;
  const int lastSlot = dense ? (total - 1) : (total - 1) * 4;
  if (out_elems <= lastSlot) {
    fprintf(stderr, "%s: out_elems=%d too small\n", spv, out_elems);
    return false;
  }

  std::ifstream f(spv, std::ios::binary | std::ios::ate);
  if (!f) {
    fprintf(stderr, "open %s\n", spv);
    return false;
  }
  // tellg() reports -1 on failure; never turn that into a huge allocation.
  const std::streamoff len = f.tellg();
  if (len <= 0) {
    fprintf(stderr, "read %s: empty or unreadable\n", spv);
    return false;
  }
  std::vector<uint8_t> buf(static_cast<size_t>(len));
  f.seekg(0);
  if (!f.read(reinterpret_cast<char *>(buf.data()), static_cast<std::streamsize>(buf.size()))) {
    fprintf(stderr, "read %s: short read\n", spv);
    return false;
  }

  ze_module_desc_t md{ZE_STRUCTURE_TYPE_MODULE_DESC};
  md.format = ZE_MODULE_FORMAT_IL_SPIRV;
  md.pInputModule = buf.data();
  md.inputSize = buf.size();
  ze_module_handle_t mod{};
  ze_module_build_log_handle_t log{};
  if (zeModuleCreate(ctx, dev, &md, &mod, &log) != ZE_RESULT_SUCCESS) {
    // Fetch the build log defensively: the handle may be null and either
    // query may fail, and "%s" must always receive a terminated string.
    size_t s = 0;
    std::vector<char> l;
    if (log && zeModuleBuildLogGetString(log, &s, nullptr) == ZE_RESULT_SUCCESS && s > 0) {
      l.resize(s);
      if (zeModuleBuildLogGetString(log, &s, l.data()) != ZE_RESULT_SUCCESS) l.clear();
    }
    l.push_back('\0');
    fprintf(stderr, "build %s: %s\n", spv, l.data());
    if (log) zeModuleBuildLogDestroy(log);
    return false;
  }
  if (log) zeModuleBuildLogDestroy(log);

  ze_kernel_desc_t kd{ZE_STRUCTURE_TYPE_KERNEL_DESC};
  kd.pKernelName = "_Z3krnPK6__halfPS_";
  ze_kernel_handle_t k{};
  CHK(zeKernelCreate(mod, &kd, &k));

  const size_t outBytes = size_t(out_elems) * sizeof(uint16_t);
  ze_device_mem_alloc_desc_t ad{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
  uint16_t *d_in{}, *d_out{};
  CHK(zeMemAllocDevice(ctx, &ad, N * sizeof(uint16_t), 64, dev, (void **)&d_in));
  CHK(zeMemAllocDevice(ctx, &ad, outBytes, 64, dev, (void **)&d_out));

  // Inputs are small integers (2..51): they and every four-element sum are
  // exactly representable in half, so the comparison below can be exact.
  uint16_t h_in[N];
  for (int i = 0; i < N; i++) h_in[i] = f2h(float((i % 50) + 2));

  // One command list: upload input, zero output, barrier, launch, barrier,
  // read back.
  ze_command_list_desc_t cld{ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC};
  ze_command_list_handle_t cl{};
  CHK(zeCommandListCreate(ctx, dev, &cld, &cl));
  CHK(zeCommandListAppendMemoryCopy(cl, d_in, h_in, N * sizeof(uint16_t), nullptr, 0, nullptr));
  uint16_t z = 0;
  CHK(zeCommandListAppendMemoryFill(cl, d_out, &z, sizeof(z), outBytes, nullptr, 0, nullptr));
  CHK(zeCommandListAppendBarrier(cl, nullptr, 0, nullptr));
  CHK(zeKernelSetArgumentValue(k, 0, sizeof(void *), &d_in));
  CHK(zeKernelSetArgumentValue(k, 1, sizeof(void *), &d_out));
  CHK(zeKernelSetGroupSize(k, BLOCK, 1, 1));
  ze_group_count_t gc{BLOCKS, 1, 1};
  CHK(zeCommandListAppendLaunchKernel(cl, k, &gc, nullptr, 0, nullptr));
  CHK(zeCommandListAppendBarrier(cl, nullptr, 0, nullptr));
  std::vector<uint16_t> h_out(size_t(out_elems), 0);
  CHK(zeCommandListAppendMemoryCopy(cl, h_out.data(), d_out, outBytes, nullptr, 0, nullptr));
  CHK(zeCommandListClose(cl));
  CHK(zeCommandQueueExecuteCommandLists(q, 1, &cl, nullptr));
  CHK(zeCommandQueueSynchronize(q, UINT64_MAX));

  int fails = 0;
  for (int w = 0; w < total; w++) {
    float expect = 0;
    for (int j = 0; j < 4; j++) expect += h2f(h_in[w * 4 + j]);
    int slot = dense ? w : (w * 4);
    float got = h2f(h_out[size_t(slot)]);
    if (got != expect) {
      // Only the first three mismatches are printed; all are counted.
      if (++fails <= 3) printf("  %s w=%d got=%.1f exp=%.1f\n", spv, w, got, expect);
    }
  }
  printf("  %s: %d/%d wrong -> %s\n", spv, fails, total, fails ? "FAIL" : "PASS");

  zeCommandListDestroy(cl);
  zeKernelDestroy(k);
  zeMemFree(ctx, d_in);
  zeMemFree(ctx, d_out);
  zeModuleDestroy(mod);
  return fails == 0;
}

// Runs pass.spv and then fail.spv on the first device of the first Level Zero
// GPU driver. Both cases always run. The exit status is 0 only when both
// produce correct sums, so `make test` and scripts can detect the miscompile
// instead of having to parse stdout.
int main() {
  CHK(zeInit(ZE_INIT_FLAG_GPU_ONLY));

  // Ask for at most one driver; a count of 0 on return means none is present.
  uint32_t driverCount = 1;
  ze_driver_handle_t drv{};
  CHK(zeDriverGet(&driverCount, &drv));
  if (driverCount == 0 || !drv) {
    fprintf(stderr, "no Level Zero GPU driver found\n");
    return 1;
  }

  // Use a separate counter so the device query never inherits the driver count.
  uint32_t deviceCount = 1;
  ze_device_handle_t dev{};
  CHK(zeDeviceGet(drv, &deviceCount, &dev));
  if (deviceCount == 0 || !dev) {
    fprintf(stderr, "no Level Zero device found\n");
    return 1;
  }

  ze_device_properties_t dp{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
  CHK(zeDeviceGetProperties(dev, &dp));
  printf("Device: %s\n", dp.name);
  ze_context_desc_t cd{ZE_STRUCTURE_TYPE_CONTEXT_DESC};
  ze_context_handle_t ctx{};
  CHK(zeContextCreate(drv, &cd, &ctx));
  ze_command_queue_desc_t qd{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
  qd.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
  ze_command_queue_handle_t q{};
  CHK(zeCommandQueueCreate(ctx, dev, &qd, &q));

  // Separate statements (no &&) so the second case runs even if the first fails.
  const bool passOk = runCase(ctx, dev, q, "pass.spv", N, false);
  const bool failOk = runCase(ctx, dev, q, "fail.spv", N / 4, true);

  zeCommandQueueDestroy(q);
  zeContextDestroy(ctx);
  return (passOk && failOk) ? 0 : 1;
}
pass.spvasm (198 lines — control, unchanged behavior)
; SPIR-V
; Version: 1.1
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 108
; Schema: 0
               OpCapability Addresses
               OpCapability Linkage
               OpCapability Kernel
               OpCapability Float16Buffer
               OpCapability Int64
               OpCapability Int16
               OpCapability GenericPointer
               OpCapability Int8
               OpCapability SubgroupDispatch
               OpCapability SubgroupShuffleINTEL
               OpExtension "SPV_INTEL_subgroups"
          %1 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %19 "_Z3krnPK6__halfPS_" %__spirv_BuiltInLocalInvocationId %__spirv_BuiltInWorkgroupId %__spirv_BuiltInSubgroupLocalInvocationId
               OpEntryPoint Kernel %92 "__chip_var_info___chipspv_device_heap"
               OpEntryPoint Kernel %101 "__chip_var_bind___chipspv_device_heap"
               OpEntryPoint Kernel %106 "__chip_reset_non_symbols"
               OpExecutionMode %19 SubgroupSize 32
               OpExecutionMode %92 SubgroupSize 32
               OpExecutionMode %101 SubgroupSize 32
               OpExecutionMode %106 SubgroupSize 32
               OpSource OpenCL_C 200000
               OpName %__chip_var___chipspv_device_heap "__chip_var___chipspv_device_heap"
               OpName %__spirv_BuiltInLocalInvocationId "__spirv_BuiltInLocalInvocationId"
               OpName %__spirv_BuiltInWorkgroupId "__spirv_BuiltInWorkgroupId"
               OpName %__spirv_BuiltInSubgroupLocalInvocationId "__spirv_BuiltInSubgroupLocalInvocationId"
               OpName %in_coerce "in.coerce"
               OpName %out_coerce "out.coerce"
               OpName %entry "entry"
               OpName %if_then "if.then"
               OpName %if_end "if.end"
               OpName %struct___half "struct.__half"
               OpName %union_anon_0 "union.anon.0"
               OpName %call_i22 "call.i22"
               OpName %conv_i "conv.i"
               OpName %call_i "call.i"
               OpName %mul "mul"
               OpName %add "add"
               OpName %idxprom "idxprom"
               OpName %arrayidx "arrayidx"
               OpName %word_0_insert_ext_i19 "word.0.insert.ext.i19"
               OpName %call_i24 "call.i24"
               OpName %rem_i "rem.i"
               OpName %cmp_not_i "cmp.not.i"
               OpName %add_i "add.i"
               OpName %add1_i "add1.i"
               OpName %call2_i "call2.i"
               OpName %word_0_extract_trunc_i21 "word.0.extract.trunc.i21"
               OpName %add_i27 "add.i27"
               OpName %word_0_insert_ext_i "word.0.insert.ext.i"
               OpName %call_i29 "call.i29"
               OpName %rem_i31 "rem.i31"
               OpName %add_i32 "add.i32"
               OpName %cmp_not_i33 "cmp.not.i33"
               OpName %spec_select_i34 "spec.select.i34"
               OpName %mul_i35 "mul.i35"
               OpName %add1_i36 "add1.i36"
               OpName %call2_i37 "call2.i37"
               OpName %rem "rem"
               OpName %cmp "cmp"
               OpName %word_0_extract_trunc_i "word.0.extract.trunc.i"
               OpName %add_i42 "add.i42"
               OpName %conv_i23 "conv.i23"
               OpName %arrayidx10 "arrayidx10"
               OpName %entry_0 "entry"
               OpName %entry_1 "entry"
               OpName %entry_2 "entry"
               OpDecorate %__chip_var___chipspv_device_heap LinkageAttributes "__chip_var___chipspv_device_heap" Export
               OpDecorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
               OpDecorate %__spirv_BuiltInLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
               OpDecorate %__spirv_BuiltInWorkgroupId LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
               OpDecorate %__spirv_BuiltInWorkgroupId Constant
               OpDecorate %__spirv_BuiltInWorkgroupId BuiltIn WorkgroupId
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId LinkageAttributes "__spirv_BuiltInSubgroupLocalInvocationId" Import
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
      %ulong = OpTypeInt 64 0
      %uchar = OpTypeInt 8 0
       %uint = OpTypeInt 32 0
     %ushort = OpTypeInt 16 0
    %ulong_0 = OpConstant %ulong 0
    %uchar_1 = OpConstant %uchar 1
    %ulong_5 = OpConstant %ulong 5
%ulong_4294967295 = OpConstant %ulong 4294967295
     %uint_4 = OpConstant %uint 4
     %uint_3 = OpConstant %uint 3
     %uint_0 = OpConstant %uint 0
     %uint_1 = OpConstant %uint 1
     %uint_2 = OpConstant %uint 2
    %ulong_8 = OpConstant %ulong 8
    %ulong_1 = OpConstant %ulong 1
    %ulong_2 = OpConstant %ulong 2
%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
    %v3ulong = OpTypeVector %ulong 3
%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
%_ptr_Input_uint = OpTypePointer Input %uint
       %void = OpTypeVoid
         %18 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
       %half = OpTypeFloat 16
%union_anon_0 = OpTypeStruct %half
%struct___half = OpTypeStruct %union_anon_0
%_ptr_Generic_struct___half = OpTypePointer Generic %struct___half
       %bool = OpTypeBool
%_ptr_Generic_ushort = OpTypePointer Generic %ushort
%_ptr_Generic_half = OpTypePointer Generic %half
         %91 = OpTypeFunction %void %_ptr_CrossWorkgroup_ulong
        %100 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar
        %105 = OpTypeFunction %void
%__chip_var___chipspv_device_heap = OpVariable %_ptr_CrossWorkgroup_ulong CrossWorkgroup %ulong_0
%__spirv_BuiltInLocalInvocationId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInSubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
       %true = OpConstantTrue %bool
         ; Kernel %19 (entry point "_Z3krnPK6__halfPS_"), control variant.
         ; Each invocation loads one 16-bit element, then performs two
         ; OpSubgroupShuffleINTEL + half-precision add steps within its run of
         ; four consecutive subgroup-local ids. In this module the final
         ; narrow/add/store block is entered by an unconditional OpBranch, so
         ; every invocation executes it; %cmp is computed but never used.
         %19 = OpFunction %void None %18
  %in_coerce = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
 %out_coerce = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
      %entry = OpLabel
               ; Reinterpret the input pointer as a Generic pointer to struct.__half
               ; by round-tripping through a 64-bit integer.
         %25 = OpConvertPtrToU %ulong %in_coerce
         %30 = OpConvertUToPtr %_ptr_Generic_struct___half %25
               ; %call_i22 = LocalInvocationId.x (the select on %true always picks %32);
               ; %conv_i is its low 32 bits.
         %31 = OpLoad %v3ulong %__spirv_BuiltInLocalInvocationId Aligned 32
         %32 = OpCompositeExtract %ulong %31 0
   %call_i22 = OpSelect %ulong %true %32 %ulong_0
     %conv_i = OpUConvert %uint %call_i22
               ; %call_i = WorkgroupId.x.
         %37 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupId Aligned 32
         %38 = OpCompositeExtract %ulong %37 0
     %call_i = OpSelect %ulong %true %38 %ulong_0
               ; %add = WorkgroupId.x * 32 + LocalInvocationId.x; the load index is
               ; %add masked to its low 32 bits.
        %mul = OpShiftLeftLogical %ulong %call_i %ulong_5
        %add = OpIAdd %ulong %mul %call_i22
    %idxprom = OpBitwiseAnd %ulong %add %ulong_4294967295
   %arrayidx = OpInBoundsPtrAccessChain %_ptr_Generic_struct___half %30 %idxprom
               ; Load the element as raw 16 bits and widen to 32 bits for the shuffle.
         %48 = OpBitcast %_ptr_Generic_ushort %arrayidx
         %49 = OpLoad %ushort %48 Aligned 2
%word_0_insert_ext_i19 = OpUConvert %uint %49
               ; First shuffle index: own subgroup-local id + 1, except when
               ; (id % 4) == 3, in which case the invocation reads its own value.
   %call_i24 = OpLoad %uint %__spirv_BuiltInSubgroupLocalInvocationId Aligned 4
      %rem_i = OpSRem %uint %call_i24 %uint_4
  %cmp_not_i = OpSLessThan %bool %rem_i %uint_3
      %add_i = OpSelect %uint %cmp_not_i %uint_1 %uint_0
     %add1_i = OpIAdd %uint %call_i24 %add_i
    %call2_i = OpSubgroupShuffleINTEL %uint %word_0_insert_ext_i19 %add1_i
               ; Narrow the shuffled word back to 16 bits and add it, as half, to the
               ; invocation's own element: %add_i27 is the first partial sum.
%word_0_extract_trunc_i21 = OpUConvert %ushort %call2_i
         %62 = OpBitcast %half %word_0_extract_trunc_i21
         %63 = OpBitcast %half %49
    %add_i27 = OpFAdd %half %63 %62
               ; Widen the partial sum to 32 bits for the second shuffle.
         %65 = OpBitcast %ushort %add_i27
%word_0_insert_ext_i = OpUConvert %uint %65
               ; Second shuffle index: with r = id % 4, the index is (id - r) + r + 2
               ; when r < 2, otherwise (id - r) + r, i.e. the invocation itself.
   %call_i29 = OpLoad %uint %__spirv_BuiltInSubgroupLocalInvocationId Aligned 4
    %rem_i31 = OpSRem %uint %call_i29 %uint_4
    %add_i32 = OpIAdd %uint %rem_i31 %uint_2
%cmp_not_i33 = OpSLessThan %bool %rem_i31 %uint_2
%spec_select_i34 = OpSelect %uint %cmp_not_i33 %add_i32 %rem_i31
    %mul_i35 = OpISub %uint %call_i29 %rem_i31
   %add1_i36 = OpIAdd %uint %mul_i35 %spec_select_i34
  %call2_i37 = OpSubgroupShuffleINTEL %uint %word_0_insert_ext_i %add1_i36
               ; %cmp = ((LocalInvocationId.x & 3) == 0). Not consumed in this module:
               ; the branch below is unconditional.
        %rem = OpBitwiseAnd %uint %conv_i %uint_3
        %cmp = OpIEqual %bool %rem %uint_0
               OpBranch %if_then
    %if_then = OpLabel
               ; Consumer of the second shuffle: narrow to 16 bits, add to the first
               ; partial sum, and store at out[%add] (same index expression as the
               ; load, without the 32-bit mask). %conv_i23 is unused here.
%word_0_extract_trunc_i = OpUConvert %ushort %call2_i37
         %79 = OpBitcast %half %word_0_extract_trunc_i
    %add_i42 = OpFAdd %half %add_i27 %79
   %conv_i23 = OpUConvert %uint %call_i
         %82 = OpConvertPtrToU %ulong %out_coerce
         %83 = OpConvertUToPtr %_ptr_Generic_struct___half %82
 %arrayidx10 = OpInBoundsPtrAccessChain %_ptr_Generic_struct___half %83 %add
         %90 = OpBitcast %_ptr_Generic_half %arrayidx10
               OpStore %90 %add_i42 Aligned 2
               OpBranch %if_end
     %if_end = OpLabel
               OpReturn
               OpFunctionEnd
         ; Entry point "__chip_var_info___chipspv_device_heap": writes the 64-bit
         ; values 8, 8 and 1 to elements 0, 1 and 2 of the ulong array at %93.
         ; NOTE(review): presumably size / alignment / has-initializer metadata for
         ; __chip_var___chipspv_device_heap -- confirm against the chipStar runtime.
         %92 = OpFunction %void None %91
         %93 = OpFunctionParameter %_ptr_CrossWorkgroup_ulong
    %entry_0 = OpLabel
               OpStore %93 %ulong_8 Aligned 8
         %97 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ulong %93 %ulong_1
               OpStore %97 %ulong_8 Aligned 8
         %99 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ulong %93 %ulong_2
               OpStore %99 %ulong_1 Aligned 8
               OpReturn
               OpFunctionEnd
        ; Entry point "__chip_var_bind___chipspv_device_heap": converts the pointer
        ; argument to a 64-bit integer and stores it in the module-scope variable
        ; %__chip_var___chipspv_device_heap.
        %101 = OpFunction %void None %100
        %102 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
    %entry_1 = OpLabel
        %104 = OpConvertPtrToU %ulong %102
               OpStore %__chip_var___chipspv_device_heap %104 Aligned 8
               OpReturn
               OpFunctionEnd
        ; Entry point "__chip_reset_non_symbols": empty body, returns immediately.
        %106 = OpFunction %void None %105
    %entry_2 = OpLabel
               OpReturn
               OpFunctionEnd
fail.spvasm (206 lines — miscompiles, consumer under OpBranchConditional)
; SPIR-V
; Version: 1.1
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 108
; Schema: 0
               OpCapability Addresses
               OpCapability Linkage
               OpCapability Kernel
               OpCapability Float16Buffer
               OpCapability Int64
               OpCapability Int16
               OpCapability GenericPointer
               OpCapability Int8
               OpCapability SubgroupDispatch
               OpCapability SubgroupShuffleINTEL
               OpExtension "SPV_INTEL_subgroups"
          %1 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %19 "_Z3krnPK6__halfPS_" %__spirv_BuiltInLocalInvocationId %__spirv_BuiltInWorkgroupId %__spirv_BuiltInSubgroupLocalInvocationId
               OpEntryPoint Kernel %92 "__chip_var_info___chipspv_device_heap"
               OpEntryPoint Kernel %101 "__chip_var_bind___chipspv_device_heap"
               OpEntryPoint Kernel %106 "__chip_reset_non_symbols"
               OpExecutionMode %19 SubgroupSize 32
               OpExecutionMode %92 SubgroupSize 32
               OpExecutionMode %101 SubgroupSize 32
               OpExecutionMode %106 SubgroupSize 32
               OpSource OpenCL_C 200000
               OpName %__chip_var___chipspv_device_heap "__chip_var___chipspv_device_heap"
               OpName %__spirv_BuiltInLocalInvocationId "__spirv_BuiltInLocalInvocationId"
               OpName %__spirv_BuiltInWorkgroupId "__spirv_BuiltInWorkgroupId"
               OpName %__spirv_BuiltInSubgroupLocalInvocationId "__spirv_BuiltInSubgroupLocalInvocationId"
               OpName %in_coerce "in.coerce"
               OpName %out_coerce "out.coerce"
               OpName %entry "entry"
               OpName %if_then "if.then"
               OpName %if_end "if.end"
               OpName %struct___half "struct.__half"
               OpName %union_anon_0 "union.anon.0"
               OpName %call_i22 "call.i22"
               OpName %conv_i "conv.i"
               OpName %call_i "call.i"
               OpName %mul "mul"
               OpName %add "add"
               OpName %idxprom "idxprom"
               OpName %arrayidx "arrayidx"
               OpName %word_0_insert_ext_i19 "word.0.insert.ext.i19"
               OpName %call_i24 "call.i24"
               OpName %rem_i "rem.i"
               OpName %cmp_not_i "cmp.not.i"
               OpName %add_i "add.i"
               OpName %add1_i "add1.i"
               OpName %call2_i "call2.i"
               OpName %word_0_extract_trunc_i21 "word.0.extract.trunc.i21"
               OpName %add_i27 "add.i27"
               OpName %word_0_insert_ext_i "word.0.insert.ext.i"
               OpName %call_i29 "call.i29"
               OpName %rem_i31 "rem.i31"
               OpName %add_i32 "add.i32"
               OpName %cmp_not_i33 "cmp.not.i33"
               OpName %spec_select_i34 "spec.select.i34"
               OpName %mul_i35 "mul.i35"
               OpName %add1_i36 "add1.i36"
               OpName %call2_i37 "call2.i37"
               OpName %rem "rem"
               OpName %cmp "cmp"
               OpName %word_0_extract_trunc_i "word.0.extract.trunc.i"
               OpName %add_i42 "add.i42"
               OpName %conv_i23 "conv.i23"
               OpName %mul7 "mul7"
               OpName %div14 "div14"
               OpName %add8 "add8"
               OpName %idxprom9 "idxprom9"
               OpName %arrayidx10 "arrayidx10"
               OpName %entry_0 "entry"
               OpName %entry_1 "entry"
               OpName %entry_2 "entry"
               OpDecorate %__chip_var___chipspv_device_heap LinkageAttributes "__chip_var___chipspv_device_heap" Export
               OpDecorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
               OpDecorate %__spirv_BuiltInLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
               OpDecorate %__spirv_BuiltInWorkgroupId LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
               OpDecorate %__spirv_BuiltInWorkgroupId Constant
               OpDecorate %__spirv_BuiltInWorkgroupId BuiltIn WorkgroupId
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId LinkageAttributes "__spirv_BuiltInSubgroupLocalInvocationId" Import
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInSubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
      %ulong = OpTypeInt 64 0
      %uchar = OpTypeInt 8 0
       %uint = OpTypeInt 32 0
     %ushort = OpTypeInt 16 0
    %ulong_0 = OpConstant %ulong 0
    %uchar_1 = OpConstant %uchar 1
    %ulong_5 = OpConstant %ulong 5
%ulong_4294967295 = OpConstant %ulong 4294967295
     %uint_4 = OpConstant %uint 4
     %uint_3 = OpConstant %uint 3
     %uint_0 = OpConstant %uint 0
     %uint_1 = OpConstant %uint 1
     %uint_2 = OpConstant %uint 2
    %ulong_8 = OpConstant %ulong 8
    %ulong_1 = OpConstant %ulong 1
    %ulong_2 = OpConstant %ulong 2
%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
    %v3ulong = OpTypeVector %ulong 3
%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
%_ptr_Input_uint = OpTypePointer Input %uint
       %void = OpTypeVoid
         %18 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar
       %half = OpTypeFloat 16
%union_anon_0 = OpTypeStruct %half
%struct___half = OpTypeStruct %union_anon_0
%_ptr_Generic_struct___half = OpTypePointer Generic %struct___half
       %bool = OpTypeBool
%_ptr_Generic_ushort = OpTypePointer Generic %ushort
%_ptr_Generic_half = OpTypePointer Generic %half
         %91 = OpTypeFunction %void %_ptr_CrossWorkgroup_ulong
        %100 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar
        %105 = OpTypeFunction %void
%__chip_var___chipspv_device_heap = OpVariable %_ptr_CrossWorkgroup_ulong CrossWorkgroup %ulong_0
%__spirv_BuiltInLocalInvocationId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInSubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
       %true = OpConstantTrue %bool
         ; Kernel %19 (entry point "_Z3krnPK6__halfPS_"), failing variant.
         ; Each invocation loads one 16-bit element, then performs two
         ; OpSubgroupShuffleINTEL + half-precision add steps within its run of
         ; four consecutive subgroup-local ids. Both shuffles execute in the entry
         ; block, but the narrowing OpUConvert of the second shuffle result, the
         ; final add and the store sit in %if_then, which is reached only through
         ; OpBranchConditional %cmp.
         %19 = OpFunction %void None %18
  %in_coerce = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
 %out_coerce = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
      %entry = OpLabel
               ; Reinterpret the input pointer as a Generic pointer to struct.__half
               ; by round-tripping through a 64-bit integer.
         %25 = OpConvertPtrToU %ulong %in_coerce
         %30 = OpConvertUToPtr %_ptr_Generic_struct___half %25
               ; %call_i22 = LocalInvocationId.x (the select on %true always picks %32);
               ; %conv_i is its low 32 bits.
         %31 = OpLoad %v3ulong %__spirv_BuiltInLocalInvocationId Aligned 32
         %32 = OpCompositeExtract %ulong %31 0
   %call_i22 = OpSelect %ulong %true %32 %ulong_0
     %conv_i = OpUConvert %uint %call_i22
               ; %call_i = WorkgroupId.x.
         %37 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupId Aligned 32
         %38 = OpCompositeExtract %ulong %37 0
     %call_i = OpSelect %ulong %true %38 %ulong_0
               ; %add = WorkgroupId.x * 32 + LocalInvocationId.x; the load index is
               ; %add masked to its low 32 bits.
        %mul = OpShiftLeftLogical %ulong %call_i %ulong_5
        %add = OpIAdd %ulong %mul %call_i22
    %idxprom = OpBitwiseAnd %ulong %add %ulong_4294967295
   %arrayidx = OpInBoundsPtrAccessChain %_ptr_Generic_struct___half %30 %idxprom
               ; Load the element as raw 16 bits and widen to 32 bits for the shuffle.
         %48 = OpBitcast %_ptr_Generic_ushort %arrayidx
         %49 = OpLoad %ushort %48 Aligned 2
%word_0_insert_ext_i19 = OpUConvert %uint %49
               ; First shuffle index: own subgroup-local id + 1, except when
               ; (id % 4) == 3, in which case the invocation reads its own value.
   %call_i24 = OpLoad %uint %__spirv_BuiltInSubgroupLocalInvocationId Aligned 4
      %rem_i = OpSRem %uint %call_i24 %uint_4
  %cmp_not_i = OpSLessThan %bool %rem_i %uint_3
      %add_i = OpSelect %uint %cmp_not_i %uint_1 %uint_0
     %add1_i = OpIAdd %uint %call_i24 %add_i
    %call2_i = OpSubgroupShuffleINTEL %uint %word_0_insert_ext_i19 %add1_i
               ; Narrow the shuffled word back to 16 bits and add it, as half, to the
               ; invocation's own element: %add_i27 is the first partial sum.
%word_0_extract_trunc_i21 = OpUConvert %ushort %call2_i
         %62 = OpBitcast %half %word_0_extract_trunc_i21
         %63 = OpBitcast %half %49
    %add_i27 = OpFAdd %half %63 %62
               ; Widen the partial sum to 32 bits for the second shuffle.
         %65 = OpBitcast %ushort %add_i27
%word_0_insert_ext_i = OpUConvert %uint %65
               ; Second shuffle index: with r = id % 4, the index is (id - r) + r + 2
               ; when r < 2, otherwise (id - r) + r, i.e. the invocation itself.
   %call_i29 = OpLoad %uint %__spirv_BuiltInSubgroupLocalInvocationId Aligned 4
    %rem_i31 = OpSRem %uint %call_i29 %uint_4
    %add_i32 = OpIAdd %uint %rem_i31 %uint_2
%cmp_not_i33 = OpSLessThan %bool %rem_i31 %uint_2
%spec_select_i34 = OpSelect %uint %cmp_not_i33 %add_i32 %rem_i31
    %mul_i35 = OpISub %uint %call_i29 %rem_i31
   %add1_i36 = OpIAdd %uint %mul_i35 %spec_select_i34
  %call2_i37 = OpSubgroupShuffleINTEL %uint %word_0_insert_ext_i %add1_i36
               ; %cmp = ((LocalInvocationId.x & 3) == 0): only every fourth invocation
               ; enters %if_then; the others jump straight to %if_end.
        %rem = OpBitwiseAnd %uint %conv_i %uint_3
        %cmp = OpIEqual %bool %rem %uint_0
               OpBranchConditional %cmp %if_then %if_end
    %if_then = OpLabel
               ; Consumer of the second shuffle, now under divergent control flow:
               ; narrow to 16 bits and add to the first partial sum.
%word_0_extract_trunc_i = OpUConvert %ushort %call2_i37
         %79 = OpBitcast %half %word_0_extract_trunc_i
    %add_i42 = OpFAdd %half %add_i27 %79
   %conv_i23 = OpUConvert %uint %call_i
         %82 = OpConvertPtrToU %ulong %out_coerce
         %83 = OpConvertUToPtr %_ptr_Generic_struct___half %82
               ; Dense output index: (WorkgroupId.x << 3) + (LocalInvocationId.x >> 2),
               ; computed in 32 bits and then zero-extended.
       %mul7 = OpShiftLeftLogical %uint %conv_i23 %uint_3
      %div14 = OpShiftRightLogical %uint %conv_i %uint_2
       %add8 = OpIAdd %uint %mul7 %div14
   %idxprom9 = OpUConvert %ulong %add8
 %arrayidx10 = OpInBoundsPtrAccessChain %_ptr_Generic_struct___half %83 %idxprom9
         %90 = OpBitcast %_ptr_Generic_half %arrayidx10
               OpStore %90 %add_i42 Aligned 2
               OpBranch %if_end
     %if_end = OpLabel
               OpReturn
               OpFunctionEnd
         ; Entry point "__chip_var_info___chipspv_device_heap": writes the 64-bit
         ; values 8, 8 and 1 to elements 0, 1 and 2 of the ulong array at %93.
         ; NOTE(review): presumably size / alignment / has-initializer metadata for
         ; __chip_var___chipspv_device_heap -- confirm against the chipStar runtime.
         %92 = OpFunction %void None %91
         %93 = OpFunctionParameter %_ptr_CrossWorkgroup_ulong
    %entry_0 = OpLabel
               OpStore %93 %ulong_8 Aligned 8
         %97 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ulong %93 %ulong_1
               OpStore %97 %ulong_8 Aligned 8
         %99 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ulong %93 %ulong_2
               OpStore %99 %ulong_1 Aligned 8
               OpReturn
               OpFunctionEnd
        ; Entry point "__chip_var_bind___chipspv_device_heap": converts the pointer
        ; argument to a 64-bit integer and stores it in the module-scope variable
        ; %__chip_var___chipspv_device_heap.
        %101 = OpFunction %void None %100
        %102 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
    %entry_1 = OpLabel
        %104 = OpConvertPtrToU %ulong %102
               OpStore %__chip_var___chipspv_device_heap %104 Aligned 8
               OpReturn
               OpFunctionEnd
        ; Entry point "__chip_reset_non_symbols": empty body, returns immediately.
        %106 = OpFunction %void None %105
    %entry_2 = OpLabel
               OpReturn
               OpFunctionEnd

Impact

This pattern appears in HIP/rocPRIM-style __half warp shuffles (ushort↔uint round-trip around shuffle + branchy reductions), causing widespread wrong results in warp/block reduce/scan tests unless the IR avoids the narrow↔wide shuffle sequence.

Related

Distinct from other OpSubgroupShuffleINTEL reports (e.g. ICE with v3uint LocalInvocationId, multi-workgroup visibility, or FP64/atomic interaction): this is wrong code generation for a 16-bit payload with a divergent consumer, from valid SPIR-V.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions