miscompilation: incorrect vectorization of loop copying i32 values

We have recently experienced a miscompilation in the [Zig](https://codeberg.org/ziglang/zig) compiler which appears to be related to loop vectorization. It only reproduces for some values of `target-cpu` (this repro uses `znver4`). The bug was first encountered in LLVM 21, but @alexrp has been able to confirm that it still reproduces in LLVM 22.

I was able to reduce the repro to this IR (and some simple C code calling it):

```llvm
; ModuleID = 'reduced.bc'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux6.12.561-gnu2.41.0"

; Aggregate constant in which only the i8 field is defined (0); the i32 field and the
; trailing [3 x i8] are undef. @repro.sameIfNonzero uses it as the memcpy source when its
; i32 argument is 0.
@0 = constant { i32, i8, [3 x i8] } { i32 undef, i8 0, [3 x i8] undef }

; External allocator; @repro_entry calls it to obtain the output buffer.
declare ptr @malloc(i64)

; Entry point of the repro. For each index i in [0, %1) it loads the i32 at %arg_ptr[i],
; passes it to @repro.sameIfNonzero2 together with the stack slot %3, reloads the i32 at the
; start of %3 and stores it to %buf[i]. Returns the i32 at %buf[0].
;   %arg_ptr - pointer to the input i32 elements
;   %1       - number of i32 elements to process
; Attribute group #0 pins "target-cpu" to znver4.
define i32 @repro_entry(ptr %arg_ptr, i64 %1) #0 {
entry:
  ; %3 is the scratch slot that the callee fills on every iteration.
  %3 = alloca { i32, i8, [3 x i8] }, align 4
  ; %4 = element count * 4, i.e. the buffer size in bytes.
  %4 = shl i64 %1, 2
  %5 = icmp eq i64 %4, 0
  br i1 %5, label %pre_loop, label %len_nonzero

pre_loop:                                                ; preds = %len_nonzero, %entry
  ; %buf is the freshly malloc'ed buffer, or %arg_ptr itself when the byte size is 0.
  %buf = phi ptr [ %9, %len_nonzero ], [ %arg_ptr, %entry ]
  br label %loop_check

len_nonzero:                                                ; preds = %entry
  ; The malloc result is used without a null check.
  %9 = call ptr @malloc(i64 %4)
  br label %pre_loop

loop_check:                                               ; preds = %loop_body, %pre_loop
  %index = phi i64 [ 0, %pre_loop ], [ %18, %loop_body ]
  %11 = icmp ult i64 %index, %1
  br i1 %11, label %loop_body, label %end

loop_body:                                               ; preds = %loop_check
  %buf_elem_ptr = getelementptr i32, ptr %buf, i64 %index
  %arg_elem_ptr = getelementptr i32, ptr %arg_ptr, i64 %index
  %arg_elem = load i32, ptr %arg_elem_ptr, align 4
  ; The call's return value (%16) is unused; its effect is the write through %3.
  %16 = call fastcc i32 @repro.sameIfNonzero2(ptr %3, i32 %arg_elem)
  ; For a non-zero %arg_elem the callee copies that value into the i32 field of %3, so this
  ; load/store pair is expected to copy input element %index to output element %index.
  %17 = load i32, ptr %3, align 4
  store i32 %17, ptr %buf_elem_ptr, align 4
  %18 = add i64 %index, 1
  br label %loop_check

end:                                               ; preds = %loop_check
  ; Returns the first output element; this is the value the C driver prints as "got".
  %20 = load i32, ptr %buf, align 4
  ret i32 %20
}

; Thin wrapper: forwards both arguments to @repro.sameIfNonzero and always returns 0.
;   %0 - destination pointer written by the callee
;   %1 - i32 value forwarded to the callee
define fastcc i32 @repro.sameIfNonzero2(ptr %0, i32 %1) {
  call fastcc void @repro.sameIfNonzero(ptr %0, i32 %1)
  ret i32 0
}

; Writes 8 bytes to %0. When %1 is non-zero, the bytes come from a local slot whose i32 field
; was just set to %1; when %1 is zero, they come from the constant @0.
;   %0 - destination pointer (receives 8 bytes)
;   %1 - i32 value selecting the copy source and, if non-zero, the value copied
define fastcc void @repro.sameIfNonzero(ptr %0, i32 %1) {
entry:
  %3 = alloca { i32, i8, [3 x i8] }, align 4
  %zero_cond = icmp eq i32 %1, 0
  br i1 %zero_cond, label %yes_zero, label %non_zero

; Despite its name, this is the common exit block: it is reached directly from %entry when
; %1 is zero and via %non_zero otherwise.
yes_zero:                                                ; preds = %non_zero, %entry
  ; Copy source: the local slot %3 on the non-zero path, the constant @0 on the zero path.
  %5 = phi ptr [ %3, %non_zero ], [ @0, %entry ]
  call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %5, i64 8, i1 false)
  ret void

non_zero:                                                ; preds = %entry
  ; Only the i32 field of %3 is written here; the other 4 bytes of the slot are never stored
  ; to before the 8-byte copy above.
  store i32 %1, ptr %3, align 4
  br label %yes_zero
}

attributes #0 = { "target-cpu"="znver4" }
```
```c
/* main.c. I have this in a separate compilation unit to prevent inlining interfering with the repro. */
#include <stdint.h>
#include <stdio.h>
/* Defined in reduced.ll (@repro_entry): takes the input elements and their count,
 * and returns the first element of the buffer it fills. */
uint32_t repro_entry(uint32_t *ptr, size_t len);
/* 32 consecutive non-zero inputs (100..131), so every element takes the non-zero path
 * in @repro.sameIfNonzero. */
uint32_t args[32] = {
    100, 101, 102, 103, 104, 105, 106, 107,
    108, 109, 110, 111, 112, 113, 114, 115,
    116, 117, 118, 119, 120, 121, 122, 123,
    124, 125, 126, 127, 128, 129, 130, 131,
};
/* Runs the repro over all 32 inputs and prints the first input next to the returned value;
 * the two numbers are expected to match. */
int main(void) {
    uint32_t res = repro_entry(args, 32);
    printf("expect %u, got %u\n", args[0], res);
    return 0;
}
```
With `clang`:
```shell-session
$ clang-21 -O0 main.c reduced.ll && ./a.out
expect 100, got 100
$ clang-21 -O1 main.c reduced.ll && ./a.out
expect 100, got 100
$ clang-21 -O2 main.c reduced.ll && ./a.out
expect 100, got 107
```
Invoking `opt` instead of using `clang -O2`:
```shell-session
$ opt reduced.ll -S -o reduced_optimized.ll -passes='inline,loop-rotate,sroa,instcombine,loop-vectorize'
$ clang -O0 main.c reduced_optimized.ll && ./a.out
expect 100, got 107
```
It looks to me like the `loop-vectorize` pass is probably at fault: the 8th input element (value 107) appears to be copied into each of the first 8 elements of the output buffer, which certainly sounds like a vectorization issue.

This is quite a serious problem for the Zig project: the bug triggers in our normal compiler bootstrapping process for many users, and there is no clear way to work around it in the code that triggers the miscompilation. The only option I can see is disabling the `loop-vectorize` pass, which will presumably seriously pessimize certain codegen.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

miscompilation: incorrect vectorization of loop copying i32 values #186922

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

miscompilation: incorrect vectorization of loop copying i32 values #186922

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions