Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ jobs:
- name: Run miri tests
env:
TARGET: "aarch64-unknown-linux-gnu"
RUSTFLAGS: "-Ctarget-cpu=neoverse-v3"
run: |
# read filters and join them with a space.
FILTERS=$(cat aarch64-miri-tests.txt | tr '\n' ' ')
Expand Down
4 changes: 2 additions & 2 deletions crates/core_arch/src/aarch64/neon/generated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12189,7 +12189,7 @@ pub unsafe fn vld3q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x3_t) ->
#[rustc_legacy_const_generics(2)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub unsafe fn vld3q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x3_t) -> int8x16x3_t {
static_assert_uimm_bits!(LANE, 3);
static_assert_uimm_bits!(LANE, 4);
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
Expand Down Expand Up @@ -12571,7 +12571,7 @@ pub unsafe fn vld4q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x4_t) -
#[rustc_legacy_const_generics(2)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub unsafe fn vld4q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x4_t) -> int8x16x4_t {
static_assert_uimm_bits!(LANE, 3);
static_assert_uimm_bits!(LANE, 4);
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
Expand Down
205 changes: 127 additions & 78 deletions crates/core_arch/src/aarch64/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,7 @@ mod tests {
macro_rules! wide_store_load_roundtrip_fp16 {
($( $name:ident $args:tt);* $(;)?) => {
$(
#[cfg_attr(miri, ignore)]
Copy link
Copy Markdown
Member

@RalfJung RalfJung May 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this disabled in Miri now? Please always add comments explaining that.

View changes since the review

#[simd_test(enable = "neon,fp16")]
#[cfg(not(target_arch = "arm64ec"))]
unsafe fn $name() {
Expand All @@ -1055,13 +1056,13 @@ mod tests {
test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);

test_vld2_f16_x2(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
test_vld2_f16_x3(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
test_vld2_f16_x4(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);
test_vld2_f16(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
test_vld3_f16(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
test_vld4_f16(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);

test_vld2q_f16_x2(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
test_vld3q_f16_x3(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
test_vld4q_f16_x4(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
test_vld2q_f16(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
test_vld3q_f16(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
test_vld4q_f16(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
}

macro_rules! wide_store_load_roundtrip_aes {
Expand Down Expand Up @@ -1195,101 +1196,149 @@ mod tests {
}

// Round-trip tests for the de-interleaving wide store/load intrinsics:
// store a tuple of vectors with vstN{q}_* and read it back with the
// matching vldN{q}_*, asserting the original element sequence is recovered.
// Macro arguments per line: (element type, total element count across the
// tuple, tuple vector type, store intrinsic, load intrinsic).
// NOTE(review): the diff view contained both the pre-rename (_x2/_x3/_x4)
// and post-rename test lines; only the corrected post-merge lines are kept,
// where test_vldN_* pairs vstN with vldN.
wide_store_load_roundtrip_neon! {
    test_vld2_f32(f32, 4, float32x2x2_t, vst2_f32, vld2_f32);
    test_vld3_f32(f32, 6, float32x2x3_t, vst3_f32, vld3_f32);
    test_vld4_f32(f32, 8, float32x2x4_t, vst4_f32, vld4_f32);

    test_vld2q_f32(f32, 8, float32x4x2_t, vst2q_f32, vld2q_f32);
    test_vld3q_f32(f32, 12, float32x4x3_t, vst3q_f32, vld3q_f32);
    test_vld4q_f32(f32, 16, float32x4x4_t, vst4q_f32, vld4q_f32);

    test_vld2_f64(f64, 2, float64x1x2_t, vst2_f64, vld2_f64);
    test_vld3_f64(f64, 3, float64x1x3_t, vst3_f64, vld3_f64);
    test_vld4_f64(f64, 4, float64x1x4_t, vst4_f64, vld4_f64);

    test_vld2q_f64(f64, 4, float64x2x2_t, vst2q_f64, vld2q_f64);
    test_vld3q_f64(f64, 6, float64x2x3_t, vst3q_f64, vld3q_f64);
    test_vld4q_f64(f64, 8, float64x2x4_t, vst4q_f64, vld4q_f64);

    test_vld2_s8(i8, 16, int8x8x2_t, vst2_s8, vld2_s8);
    test_vld3_s8(i8, 24, int8x8x3_t, vst3_s8, vld3_s8);
    test_vld4_s8(i8, 32, int8x8x4_t, vst4_s8, vld4_s8);

    test_vld2q_s8(i8, 32, int8x16x2_t, vst2q_s8, vld2q_s8);
    test_vld3q_s8(i8, 48, int8x16x3_t, vst3q_s8, vld3q_s8);
    test_vld4q_s8(i8, 64, int8x16x4_t, vst4q_s8, vld4q_s8);

    test_vld2_s16(i16, 8, int16x4x2_t, vst2_s16, vld2_s16);
    test_vld3_s16(i16, 12, int16x4x3_t, vst3_s16, vld3_s16);
    test_vld4_s16(i16, 16, int16x4x4_t, vst4_s16, vld4_s16);

    test_vld2q_s16(i16, 16, int16x8x2_t, vst2q_s16, vld2q_s16);
    test_vld3q_s16(i16, 24, int16x8x3_t, vst3q_s16, vld3q_s16);
    test_vld4q_s16(i16, 32, int16x8x4_t, vst4q_s16, vld4q_s16);

    test_vld2_s32(i32, 4, int32x2x2_t, vst2_s32, vld2_s32);
    test_vld3_s32(i32, 6, int32x2x3_t, vst3_s32, vld3_s32);
    test_vld4_s32(i32, 8, int32x2x4_t, vst4_s32, vld4_s32);

    test_vld2q_s32(i32, 8, int32x4x2_t, vst2q_s32, vld2q_s32);
    test_vld3q_s32(i32, 12, int32x4x3_t, vst3q_s32, vld3q_s32);
    test_vld4q_s32(i32, 16, int32x4x4_t, vst4q_s32, vld4q_s32);

    test_vld2_s64(i64, 2, int64x1x2_t, vst2_s64, vld2_s64);
    test_vld3_s64(i64, 3, int64x1x3_t, vst3_s64, vld3_s64);
    test_vld4_s64(i64, 4, int64x1x4_t, vst4_s64, vld4_s64);

    test_vld2q_s64(i64, 4, int64x2x2_t, vst2q_s64, vld2q_s64);
    test_vld3q_s64(i64, 6, int64x2x3_t, vst3q_s64, vld3q_s64);
    test_vld4q_s64(i64, 8, int64x2x4_t, vst4q_s64, vld4q_s64);

    test_vld2_u8(u8, 16, uint8x8x2_t, vst2_u8, vld2_u8);
    test_vld3_u8(u8, 24, uint8x8x3_t, vst3_u8, vld3_u8);
    test_vld4_u8(u8, 32, uint8x8x4_t, vst4_u8, vld4_u8);

    test_vld2q_u8(u8, 32, uint8x16x2_t, vst2q_u8, vld2q_u8);
    test_vld3q_u8(u8, 48, uint8x16x3_t, vst3q_u8, vld3q_u8);
    test_vld4q_u8(u8, 64, uint8x16x4_t, vst4q_u8, vld4q_u8);

    test_vld2_u16(u16, 8, uint16x4x2_t, vst2_u16, vld2_u16);
    test_vld3_u16(u16, 12, uint16x4x3_t, vst3_u16, vld3_u16);
    test_vld4_u16(u16, 16, uint16x4x4_t, vst4_u16, vld4_u16);

    test_vld2q_u16(u16, 16, uint16x8x2_t, vst2q_u16, vld2q_u16);
    test_vld3q_u16(u16, 24, uint16x8x3_t, vst3q_u16, vld3q_u16);
    test_vld4q_u16(u16, 32, uint16x8x4_t, vst4q_u16, vld4q_u16);

    test_vld2_u32(u32, 4, uint32x2x2_t, vst2_u32, vld2_u32);
    test_vld3_u32(u32, 6, uint32x2x3_t, vst3_u32, vld3_u32);
    test_vld4_u32(u32, 8, uint32x2x4_t, vst4_u32, vld4_u32);

    test_vld2q_u32(u32, 8, uint32x4x2_t, vst2q_u32, vld2q_u32);
    test_vld3q_u32(u32, 12, uint32x4x3_t, vst3q_u32, vld3q_u32);
    test_vld4q_u32(u32, 16, uint32x4x4_t, vst4q_u32, vld4q_u32);

    test_vld2_u64(u64, 2, uint64x1x2_t, vst2_u64, vld2_u64);
    test_vld3_u64(u64, 3, uint64x1x3_t, vst3_u64, vld3_u64);
    test_vld4_u64(u64, 4, uint64x1x4_t, vst4_u64, vld4_u64);

    test_vld2q_u64(u64, 4, uint64x2x2_t, vst2q_u64, vld2q_u64);
    test_vld3q_u64(u64, 6, uint64x2x3_t, vst3q_u64, vld3q_u64);
    test_vld4q_u64(u64, 8, uint64x2x4_t, vst4q_u64, vld4q_u64);

    test_vld2_p8(p8, 16, poly8x8x2_t, vst2_p8, vld2_p8);
    test_vld3_p8(p8, 24, poly8x8x3_t, vst3_p8, vld3_p8);
    test_vld4_p8(p8, 32, poly8x8x4_t, vst4_p8, vld4_p8);

    test_vld2q_p8(p8, 32, poly8x16x2_t, vst2q_p8, vld2q_p8);
    test_vld3q_p8(p8, 48, poly8x16x3_t, vst3q_p8, vld3q_p8);
    test_vld4q_p8(p8, 64, poly8x16x4_t, vst4q_p8, vld4q_p8);

    test_vld2_p16(p16, 8, poly16x4x2_t, vst2_p16, vld2_p16);
    test_vld3_p16(p16, 12, poly16x4x3_t, vst3_p16, vld3_p16);
    test_vld4_p16(p16, 16, poly16x4x4_t, vst4_p16, vld4_p16);

    test_vld2q_p16(p16, 16, poly16x8x2_t, vst2q_p16, vld2q_p16);
    test_vld3q_p16(p16, 24, poly16x8x3_t, vst3q_p16, vld3q_p16);
    test_vld4q_p16(p16, 32, poly16x8x4_t, vst4q_p16, vld4q_p16);
}

// Round-trips a single lane through a vstN{q}_lane_* / vldN{q}_lane_* pair:
// fill a tuple of vectors with distinct values, store lane `$idx` of each
// vector to a scratch buffer, reload that lane from the buffer, and check
// that nothing was disturbed.
// Arguments: (element type, total element count across the tuple,
// lane index, tuple vector type, store intrinsic, load intrinsic).
macro_rules! lane_wide_store_load_roundtrip {
    ($elem_ty:ty, $len:expr, $idx:expr, $vec_ty:ty, $store:ident, $load:ident) => {
        // Distinct values 0, 1, 2, ... so lanes are distinguishable.
        let expected: [$elem_ty; $len] = crate::array::from_fn(|i| i as $elem_ty);
        let input: $vec_ty = transmute(expected);
        // Scratch buffer sized for the widest case: four vectors, one lane each.
        let mut scratch = [0 as $elem_ty; 4];
        $store::<$idx>(scratch.as_mut_ptr().cast(), input);
        let reloaded: $vec_ty = $load::<$idx>(scratch.as_ptr().cast(), input);
        let actual: [$elem_ty; $len] = transmute(reloaded);
        assert_eq!(actual, expected);
    };
}

// Wraps each `name(args)` entry in a `#[simd_test]` function that runs the
// lane round-trip body from `lane_wide_store_load_roundtrip!`.
macro_rules! lane_wide_store_load_roundtrip_neon {
    ($( $name:ident $args:tt);* $(;)?) => {
        $(
            // NOTE(review): skipped under Miri — the concrete reason should be
            // stated here (reviewer asked for an explanation). Presumably the
            // lane intrinsics are unsupported or too slow under Miri — TODO
            // confirm and document.
            #[cfg_attr(miri, ignore)]
            #[simd_test(enable = "neon")]
            unsafe fn $name() {
                lane_wide_store_load_roundtrip! $args;
            }
        )*
    };
}

// Lane round-trip test cases. Macro arguments per line: (element type,
// total element count across the tuple, lane index, tuple vector type,
// store intrinsic, load intrinsic).
lane_wide_store_load_roundtrip_neon! {
// q-form 8-bit tuples have 16 lanes per vector; lane 15 exercises the top
// lane, which the vld3q/vld4q_lane_s8 assertions must accept as a 4-bit
// lane index.
test_vld2q_lane_s8(i8, 32, 15, int8x16x2_t, vst2q_lane_s8, vld2q_lane_s8);
test_vld3q_lane_s8(i8, 48, 15, int8x16x3_t, vst3q_lane_s8, vld3q_lane_s8);
test_vld4q_lane_s8(i8, 64, 15, int8x16x4_t, vst4q_lane_s8, vld4q_lane_s8);

test_vld2q_lane_u8(u8, 32, 15, uint8x16x2_t, vst2q_lane_u8, vld2q_lane_u8);
test_vld3q_lane_u8(u8, 48, 15, uint8x16x3_t, vst3q_lane_u8, vld3q_lane_u8);
test_vld4q_lane_u8(u8, 64, 15, uint8x16x4_t, vst4q_lane_u8, vld4q_lane_u8);

// 64-bit d-form vectors have a single lane, so only index 0 is valid;
// the q-form has two lanes, tested at index 1.
test_vld2_lane_s64(i64, 2, 0, int64x1x2_t, vst2_lane_s64, vld2_lane_s64);
test_vld3_lane_s64(i64, 3, 0, int64x1x3_t, vst3_lane_s64, vld3_lane_s64);
test_vld4_lane_s64(i64, 4, 0, int64x1x4_t, vst4_lane_s64, vld4_lane_s64);
test_vld2q_lane_s64(i64, 4, 1, int64x2x2_t, vst2q_lane_s64, vld2q_lane_s64);
test_vld3q_lane_s64(i64, 6, 1, int64x2x3_t, vst3q_lane_s64, vld3q_lane_s64);
test_vld4q_lane_s64(i64, 8, 1, int64x2x4_t, vst4q_lane_s64, vld4q_lane_s64);

test_vld2_lane_u64(u64, 2, 0, uint64x1x2_t, vst2_lane_u64, vld2_lane_u64);
test_vld3_lane_u64(u64, 3, 0, uint64x1x3_t, vst3_lane_u64, vld3_lane_u64);
test_vld4_lane_u64(u64, 4, 0, uint64x1x4_t, vst4_lane_u64, vld4_lane_u64);
test_vld2q_lane_u64(u64, 4, 1, uint64x2x2_t, vst2q_lane_u64, vld2q_lane_u64);
test_vld3q_lane_u64(u64, 6, 1, uint64x2x3_t, vst3q_lane_u64, vld3q_lane_u64);
test_vld4q_lane_u64(u64, 8, 1, uint64x2x4_t, vst4q_lane_u64, vld4q_lane_u64);
}
}

Expand Down
Loading