// Level Zero: loads fail.spv vs pass.spv. fail: UConvert ushort<->uint around OpSubgroupShuffleINTEL
// + consumer under OpBranchConditional => wrong sum (Arc A770). pass: unconditional path => OK.
#include <level_zero/ze_api.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <vector>
// Evaluates the Level Zero call `x` exactly once. If the result differs from
// ZE_RESULT_SUCCESS, prints the call-site line, the failing expression and the
// raw result code to stderr, then terminates the process with exit status 1.
// Must stay a macro so that __LINE__ refers to the call site.
#define CHK(x)                                                          \
    do {                                                                \
        const auto chk_result_ = (x);                                   \
        if (chk_result_ != ZE_RESULT_SUCCESS) {                         \
            fprintf(stderr, "L0 err @%d: %s -> 0x%x\n", __LINE__, #x,   \
                    static_cast<unsigned>(chk_result_));                \
            exit(1);                                                    \
        }                                                               \
    } while (0)
// Converts an IEEE-754 binary32 value to its binary16 (half) bit pattern.
//  - rounds to nearest, ties to even (the carry may roll into the exponent and
//    correctly produce infinity);
//  - magnitudes below the smallest normal half become half subnormals, or a
//    signed zero when they round to nothing;
//  - magnitudes too large for half become signed infinity;
//  - NaN becomes a quiet half NaN instead of being mistaken for infinity.
static uint16_t f2h(float f) {
    uint32_t bits = 0;
    std::memcpy(&bits, &f, sizeof bits);  // well-defined type pun (reading an inactive union member is UB in C++)

    const uint32_t sign = (bits >> 16) & 0x8000u;
    const uint32_t magnitude = bits & 0x7FFFFFFFu;
    if (magnitude > 0x7F800000u) return static_cast<uint16_t>(sign | 0x7E00u);  // NaN

    // Exponent re-biased from binary32 (127) to binary16 (15).
    const int expo = static_cast<int>(magnitude >> 23) - 127 + 15;
    uint32_t mant = magnitude & 0x007FFFFFu;
    if (expo >= 31) return static_cast<uint16_t>(sign | 0x7C00u);  // infinity or overflow
    if (expo < -10) return static_cast<uint16_t>(sign);            // below half of the smallest subnormal

    uint32_t half = 0;     // exponent and mantissa fields before rounding
    uint32_t dropped = 0;  // low-order bits discarded by the narrowing shift
    uint32_t tie = 0;      // value of `dropped` that is exactly half a unit in the last place
    if (expo <= 0) {
        // Subnormal half: make the implicit leading one explicit and shift it into place.
        mant |= 0x00800000u;
        const int shift = 14 - expo;  // 14..24
        half = mant >> shift;
        dropped = mant & ((1u << shift) - 1u);
        tie = 1u << (shift - 1);
    } else {
        half = (static_cast<uint32_t>(expo) << 10) | (mant >> 13);
        dropped = mant & 0x1FFFu;
        tie = 0x1000u;
    }
    if (dropped > tie || (dropped == tie && (half & 1u) != 0)) ++half;
    return static_cast<uint16_t>(sign | half);
}
// Converts an IEEE-754 binary16 (half) bit pattern to a binary32 value.
// Every half value is exactly representable as a float, so the conversion is
// lossless: zeros keep their sign, subnormal halves are renormalised,
// infinities stay infinities and NaN payload bits are carried over.
static float h2f(uint16_t h) {
    const uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
    const uint32_t expo = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x3FFu;

    uint32_t bits = 0;
    if (expo == 0) {
        if (mant == 0) {
            bits = sign;  // signed zero
        } else {
            // Subnormal half (mant * 2^-24): shift until the leading one reaches
            // bit 10, lowering the float exponent by one for every step.
            uint32_t biased = 127 - 15 + 1;
            while ((mant & 0x400u) == 0) {
                mant <<= 1;
                --biased;
            }
            bits = sign | (biased << 23) | ((mant & 0x3FFu) << 13);
        }
    } else if (expo == 31) {
        bits = sign | 0x7F800000u | (mant << 13);  // infinity or NaN
    } else {
        bits = sign | ((expo + (127 - 15)) << 23) | (mant << 13);
    }

    float value = 0.0f;
    std::memcpy(&value, &bits, sizeof value);  // well-defined type pun (reading an inactive union member is UB in C++)
    return value;
}
// Launch geometry shared by the kernel dispatch and the host-side verification.
static constexpr int BLOCK = 32;          // work-items per group (passed to zeKernelSetGroupSize)
static constexpr int BLOCKS = 4;          // number of groups launched
static constexpr int N = BLOCK * BLOCKS;  // total number of input elements
// Builds the SPIR-V module stored in file `spv`, runs its kernel
// "_Z3krnPK6__halfPS_" once over N half-precision inputs (BLOCKS groups of BLOCK
// work-items) and checks the result on the host.
//
// For every run of four consecutive inputs the host computes the expected sum
// and compares it with one output element: index w when `dense` is true,
// index w * 4 otherwise (w = 0 .. N/4 - 1).
//
//   ctx, dev, q  Level Zero context, device and command queue to use.
//   spv          path of the SPIR-V binary to load.
//   out_elems    number of uint16_t elements in the output buffer; must cover
//                the highest index read for the chosen layout.
//   dense        selects the output layout described above.
//
// Returns true when every checked sum matches. Returns false when the file
// cannot be read, the module fails to build, `out_elems` is too small, or any
// sum differs. Any other Level Zero failure terminates the process through CHK.
static bool runCase(ze_context_handle_t ctx, ze_device_handle_t dev, ze_command_queue_handle_t q,
                    const char *spv, int out_elems, bool dense) {
    // Reject an output buffer that the verification loop below would overrun.
    const int total = N / 4;
    const int needed = dense ? total : (total - 1) * 4 + 1;
    if (out_elems < needed) {
        fprintf(stderr, "%s: out_elems=%d too small, need >= %d\n", spv, out_elems, needed);
        return false;
    }
    const size_t in_bytes = size_t(N) * sizeof(uint16_t);
    const size_t out_bytes = size_t(out_elems) * sizeof(uint16_t);

    // Load the whole SPIR-V binary into memory.
    std::ifstream f(spv, std::ios::binary | std::ios::ate);
    if (!f) {
        fprintf(stderr, "open %s\n", spv);
        return false;
    }
    const std::streamoff len = f.tellg();  // -1 on failure
    if (len <= 0) {
        fprintf(stderr, "read %s: empty or unreadable\n", spv);
        return false;
    }
    std::vector<uint8_t> buf(static_cast<size_t>(len));
    f.seekg(0);
    if (!f.read(reinterpret_cast<char *>(buf.data()), static_cast<std::streamsize>(buf.size()))) {
        fprintf(stderr, "read %s: short read\n", spv);
        return false;
    }

    // Build the module; on failure print the driver's build log, if there is one.
    ze_module_desc_t md{ZE_STRUCTURE_TYPE_MODULE_DESC};
    md.format = ZE_MODULE_FORMAT_IL_SPIRV;
    md.pInputModule = buf.data();
    md.inputSize = buf.size();
    ze_module_handle_t mod{};
    ze_module_build_log_handle_t log{};
    if (zeModuleCreate(ctx, dev, &md, &mod, &log) != ZE_RESULT_SUCCESS) {
        size_t s = 0;
        if (log && zeModuleBuildLogGetString(log, &s, nullptr) == ZE_RESULT_SUCCESS && s > 0) {
            std::vector<char> text(s + 1, '\0');  // one spare byte keeps the buffer terminated
            zeModuleBuildLogGetString(log, &s, text.data());
            fprintf(stderr, "build %s: %s\n", spv, text.data());
        } else {
            fprintf(stderr, "build %s: (no build log)\n", spv);
        }
        if (log) zeModuleBuildLogDestroy(log);
        return false;
    }
    if (log) zeModuleBuildLogDestroy(log);

    ze_kernel_desc_t kd{ZE_STRUCTURE_TYPE_KERNEL_DESC};
    kd.pKernelName = "_Z3krnPK6__halfPS_";
    ze_kernel_handle_t k{};
    CHK(zeKernelCreate(mod, &kd, &k));

    // Device buffers for the input and output half arrays.
    ze_device_mem_alloc_desc_t ad{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
    uint16_t *d_in{}, *d_out{};
    CHK(zeMemAllocDevice(ctx, &ad, in_bytes, 64, dev, reinterpret_cast<void **>(&d_in)));
    CHK(zeMemAllocDevice(ctx, &ad, out_bytes, 64, dev, reinterpret_cast<void **>(&d_out)));

    // Inputs are small integers (2..51), exactly representable in half precision.
    uint16_t h_in[N];
    for (int i = 0; i < N; i++) h_in[i] = f2h(float((i % 50) + 2));

    // One command list: upload + zero-fill, barrier, kernel launch, barrier, download.
    ze_command_list_desc_t cld{ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC};
    ze_command_list_handle_t cl{};
    CHK(zeCommandListCreate(ctx, dev, &cld, &cl));
    CHK(zeCommandListAppendMemoryCopy(cl, d_in, h_in, in_bytes, nullptr, 0, nullptr));
    uint16_t z = 0;
    CHK(zeCommandListAppendMemoryFill(cl, d_out, &z, sizeof(z), out_bytes, nullptr, 0, nullptr));
    CHK(zeCommandListAppendBarrier(cl, nullptr, 0, nullptr));
    CHK(zeKernelSetArgumentValue(k, 0, sizeof(void *), &d_in));
    CHK(zeKernelSetArgumentValue(k, 1, sizeof(void *), &d_out));
    CHK(zeKernelSetGroupSize(k, BLOCK, 1, 1));
    ze_group_count_t gc{BLOCKS, 1, 1};
    CHK(zeCommandListAppendLaunchKernel(cl, k, &gc, nullptr, 0, nullptr));
    CHK(zeCommandListAppendBarrier(cl, nullptr, 0, nullptr));
    std::vector<uint16_t> h_out(size_t(out_elems), 0);
    CHK(zeCommandListAppendMemoryCopy(cl, h_out.data(), d_out, out_bytes, nullptr, 0, nullptr));
    CHK(zeCommandListClose(cl));
    CHK(zeCommandQueueExecuteCommandLists(q, 1, &cl, nullptr));
    CHK(zeCommandQueueSynchronize(q, UINT64_MAX));

    // Compare every 4-element sum; only the first three mismatches are printed.
    int fails = 0;
    for (int w = 0; w < total; w++) {
        float expect = 0;
        for (int j = 0; j < 4; j++) expect += h2f(h_in[w * 4 + j]);
        const int slot = dense ? w : (w * 4);
        const float got = h2f(h_out[size_t(slot)]);
        if (got != expect) {  // exact compare is intended: the expected sums are computed exactly on the host
            if (++fails <= 3) printf(" %s w=%d got=%.1f exp=%.1f\n", spv, w, got, expect);
        }
    }
    printf(" %s: %d/%d wrong -> %s\n", spv, fails, total, fails ? "FAIL" : "PASS");

    // Best-effort cleanup; release results are deliberately not checked.
    zeCommandListDestroy(cl);
    zeKernelDestroy(k);
    zeMemFree(ctx, d_in);
    zeMemFree(ctx, d_out);
    zeModuleDestroy(mod);
    return fails == 0;
}
// Entry point: initialises Level Zero, takes the first driver and its first
// device, then runs the control module ("pass.spv") and the module under test
// ("fail.spv"). Both cases always run. The exit status is EXIT_SUCCESS only
// when both cases pass, so scripts can detect a reproduced miscompile.
int main() {
    CHK(zeInit(ZE_INIT_FLAG_GPU_ONLY));

    // Ask for at most one driver; the count is updated to the number actually returned.
    uint32_t driver_count = 1;
    ze_driver_handle_t drv{};
    CHK(zeDriverGet(&driver_count, &drv));
    if (driver_count == 0 || !drv) {
        fprintf(stderr, "no Level Zero driver found\n");
        return EXIT_FAILURE;
    }

    // Separate counter: the driver query may have modified the previous one.
    uint32_t device_count = 1;
    ze_device_handle_t dev{};
    CHK(zeDeviceGet(drv, &device_count, &dev));
    if (device_count == 0 || !dev) {
        fprintf(stderr, "no Level Zero device found\n");
        return EXIT_FAILURE;
    }

    ze_device_properties_t dp{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
    CHK(zeDeviceGetProperties(dev, &dp));
    printf("Device: %s\n", dp.name);

    ze_context_desc_t cd{ZE_STRUCTURE_TYPE_CONTEXT_DESC};
    ze_context_handle_t ctx{};
    CHK(zeContextCreate(drv, &cd, &ctx));

    ze_command_queue_desc_t qd{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    qd.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
    ze_command_queue_handle_t q{};
    CHK(zeCommandQueueCreate(ctx, dev, &qd, &q));

    // pass.spv reads its results at a stride of 4 (N outputs); fail.spv packs them densely (N / 4 outputs).
    const bool pass_ok = runCase(ctx, dev, q, "pass.spv", N, false);
    const bool fail_ok = runCase(ctx, dev, q, "fail.spv", N / 4, true);

    zeCommandQueueDestroy(q);
    zeContextDestroy(ctx);
    return (pass_ok && fail_ok) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Summary
On Intel Arc A770 (DG2), IGC miscompiles `OpSubgroupShuffleINTEL` when all of the following hold:
- `OpUConvert %uint ← %ushort`
- `OpUConvert %ushort ← %uint`
- `OpBranchConditional` (divergent control flow)

Observed: a small subgroup reduction over four lanes returns wrong values (e.g. first logical warp: got 5.0, expected 14.0 for a half-precision sum). The same SPIR-V with an unconditional consumer path, or a different output layout, computes the correct result.
SPIR-V validates with `spirv-val`.

Environment
- `zeModuleCreate` with `ZE_MODULE_FORMAT_IL_SPIRV`
- `.spvasm` → `spirv-as`

Minimal reproducer layout
Two modules differ minimally:
- `pass.spv`
- `fail.spv`: `OpBranchConditional` → FAIL

Build & run (from reproducer directory):

Kernel name: `_Z3krnPK6__halfPS_`

Reproducer (inline)
Self-contained: pure Level Zero host + two hand-written SPIR-V modules. Assemble with `spirv-as`, link host against `-lze_loader`.
- `Makefile`
- `run.cpp` (141 lines)
- `pass.spvasm` (198 lines — control, unchanged behavior)
- `fail.spvasm` (206 lines — miscompiles, consumer under `OpBranchConditional`)

Impact
This pattern appears in HIP/rocPRIM-style `__half` warp shuffles (ushort↔uint round-trip around shuffle + branchy reductions), causing widespread wrong results in warp/block reduce/scan tests unless the IR avoids the narrow↔wide shuffle sequence.

Related
Distinct from other `OpSubgroupShuffleINTEL` reports (e.g. ICE with `v3uint` `LocalInvocationId`, multi-workgroup visibility, or FP64/atomic interaction): this is wrong codegen for a 16-bit payload + divergent consumer with valid SPIR-V.