From 20be9139d78c637810126bf78060466e5aa2979a Mon Sep 17 00:00:00 2001 From: sayantn Date: Fri, 8 May 2026 14:50:34 +0530 Subject: [PATCH 01/19] replace uses of `simd_extract` with `vget_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 558 ++++++++---------- .../src/arm_shared/neon/generated.rs | 14 +- .../spec/neon/aarch64.spec.yml | 327 +++++----- .../spec/neon/arm_shared.spec.yml | 10 +- 4 files changed, 423 insertions(+), 486 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 5a0bbfa095..2624e2f22d 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -204,7 +204,7 @@ pub fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fabd))] pub fn vabdd_f64(a: f64, b: f64) -> f64 { - unsafe { simd_extract!(vabd_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_f64::<0>(vabd_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point absolute difference"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabds_f32)"] @@ -213,7 +213,7 @@ pub fn vabdd_f64(a: f64, b: f64) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fabd))] pub fn vabds_f32(a: f32, b: f32) -> f32 { - unsafe { simd_extract!(vabd_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_f32::<0>(vabd_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Floating-point absolute difference"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdh_f16)"] @@ -223,7 +223,7 @@ pub fn vabds_f32(a: f32, b: f32) -> f32 { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fabd))] pub fn vabdh_f16(a: f16, b: f16) -> f16 { - unsafe { simd_extract!(vabd_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_f16::<0>(vabd_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Signed Absolute difference Long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s16)"] @@ -1539,7 +1539,7 @@ pub fn vceqq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vceqd_f64(a: f64, b: f64) -> u64 { - unsafe { simd_extract!(vceq_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_u64::<0>(vceq_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point compare equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqs_f32)"] @@ -1548,7 +1548,7 @@ pub fn vceqd_f64(a: f64, b: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vceqs_f32(a: f32, b: f32) -> u32 { - unsafe { simd_extract!(vceq_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_u32::<0>(vceq_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Compare bitwise equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqd_s64)"] @@ -1576,7 +1576,7 @@ pub fn vceqd_u64(a: u64, b: u64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vceqh_f16(a: f16, b: f16) -> u16 { - unsafe { simd_extract!(vceq_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_u16::<0>(vceq_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Floating-point compare bitwise equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqz_f16)"] @@ -1866,7 +1866,7 @@ pub fn vceqzd_u64(a: u64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vceqzh_f16(a: f16) -> u16 { - unsafe { simd_extract!(vceqz_f16(vdup_n_f16(a)), 0) } + vget_lane_u16::<0>(vceqz_f16(vdup_n_f16(a))) } #[doc = "Floating-point compare bitwise equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzs_f32)"] @@ -1875,7 +1875,7 @@ pub fn vceqzh_f16(a: f16) -> u16 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vceqzs_f32(a: f32) -> u32 { - unsafe { simd_extract!(vceqz_f32(vdup_n_f32(a)), 0) } + vget_lane_u32::<0>(vceqz_f32(vdup_n_f32(a))) } #[doc = "Floating-point compare bitwise equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzd_f64)"] @@ -1884,7 +1884,7 @@ pub fn vceqzs_f32(a: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vceqzd_f64(a: f64) -> u64 { - unsafe { simd_extract!(vceqz_f64(vdup_n_f64(a)), 0) } + vget_lane_u64::<0>(vceqz_f64(vdup_n_f64(a))) } #[doc = "Floating-point compare greater than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcge_f64)"] @@ -1947,7 +1947,7 @@ pub fn vcgeq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcged_f64(a: f64, b: f64) -> u64 { - unsafe { simd_extract!(vcge_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_u64::<0>(vcge_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point compare greater than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcges_f32)"] @@ -1956,7 +1956,7 @@ pub fn vcged_f64(a: f64, b: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcges_f32(a: f32, b: f32) -> u32 { - unsafe { simd_extract!(vcge_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_u32::<0>(vcge_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Compare greater than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcged_s64)"] @@ -1984,7 +1984,7 @@ pub fn vcged_u64(a: u64, b: u64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcgeh_f16(a: f16, b: f16) -> u16 { - unsafe { simd_extract!(vcge_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_u16::<0>(vcge_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Floating-point compare greater than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgez_f32)"] @@ -2113,7 +2113,7 @@ pub fn vcgezq_s64(a: int64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgezd_f64(a: f64) -> u64 { - unsafe { simd_extract!(vcgez_f64(vdup_n_f64(a)), 0) } + vget_lane_u64::<0>(vcgez_f64(vdup_n_f64(a))) } #[doc = "Floating-point compare greater than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezs_f32)"] @@ -2122,7 +2122,7 @@ pub fn vcgezd_f64(a: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgezs_f32(a: f32) -> u32 { - unsafe { simd_extract!(vcgez_f32(vdup_n_f32(a)), 0) } + vget_lane_u32::<0>(vcgez_f32(vdup_n_f32(a))) } #[doc = "Compare signed greater than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgezd_s64)"] @@ -2141,7 +2141,7 @@ pub fn vcgezd_s64(a: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcgezh_f16(a: f16) -> u16 { - unsafe { simd_extract!(vcgez_f16(vdup_n_f16(a)), 0) } + vget_lane_u16::<0>(vcgez_f16(vdup_n_f16(a))) } #[doc = "Floating-point compare greater than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_f64)"] @@ -2204,7 +2204,7 @@ pub fn vcgtq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgtd_f64(a: f64, b: f64) -> u64 { - unsafe { simd_extract!(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_u64::<0>(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point compare greater than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgts_f32)"] @@ -2213,7 +2213,7 @@ pub fn vcgtd_f64(a: f64, b: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgts_f32(a: f32, b: f32) -> u32 { - unsafe { simd_extract!(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_u32::<0>(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Compare greater than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtd_s64)"] @@ -2241,7 +2241,7 @@ pub fn vcgtd_u64(a: u64, b: u64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcgth_f16(a: f16, b: f16) -> u16 { - unsafe { simd_extract!(vcgt_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_u16::<0>(vcgt_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Floating-point compare greater than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtz_f32)"] @@ -2370,7 +2370,7 @@ pub fn vcgtzq_s64(a: int64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgtzd_f64(a: f64) -> u64 { - unsafe { simd_extract!(vcgtz_f64(vdup_n_f64(a)), 0) } + vget_lane_u64::<0>(vcgtz_f64(vdup_n_f64(a))) } #[doc = "Floating-point compare greater than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzs_f32)"] @@ -2379,7 +2379,7 @@ pub fn vcgtzd_f64(a: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcgtzs_f32(a: f32) -> u32 { - unsafe { simd_extract!(vcgtz_f32(vdup_n_f32(a)), 0) } + vget_lane_u32::<0>(vcgtz_f32(vdup_n_f32(a))) } #[doc = "Compare signed greater than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtzd_s64)"] @@ -2398,7 +2398,7 @@ pub fn vcgtzd_s64(a: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcgtzh_f16(a: f16) -> u16 { - unsafe { simd_extract!(vcgtz_f16(vdup_n_f16(a)), 0) } + vget_lane_u16::<0>(vcgtz_f16(vdup_n_f16(a))) } #[doc = "Floating-point compare less than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcle_f64)"] @@ -2461,7 +2461,7 @@ pub fn vcleq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcled_f64(a: f64, b: f64) -> u64 { - unsafe { simd_extract!(vcle_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_u64::<0>(vcle_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point compare less than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcles_f32)"] @@ -2470,7 +2470,7 @@ pub fn vcled_f64(a: f64, b: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcles_f32(a: f32, b: f32) -> u32 { - unsafe { simd_extract!(vcle_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_u32::<0>(vcle_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Compare less than or equal"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcled_u64)"] @@ -2498,7 +2498,7 @@ pub fn vcled_s64(a: i64, b: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcleh_f16(a: f16, b: f16) -> u16 { - unsafe { simd_extract!(vcle_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_u16::<0>(vcle_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Floating-point compare less than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclez_f32)"] @@ -2627,7 +2627,7 @@ pub fn vclezq_s64(a: int64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vclezd_f64(a: f64) -> u64 { - unsafe { simd_extract!(vclez_f64(vdup_n_f64(a)), 0) } + vget_lane_u64::<0>(vclez_f64(vdup_n_f64(a))) } #[doc = "Floating-point compare less than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezs_f32)"] @@ -2636,7 +2636,7 @@ pub fn vclezd_f64(a: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vclezs_f32(a: f32) -> u32 { - unsafe { simd_extract!(vclez_f32(vdup_n_f32(a)), 0) } + vget_lane_u32::<0>(vclez_f32(vdup_n_f32(a))) } #[doc = "Compare less than or equal to zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclezd_s64)"] @@ -2655,7 +2655,7 @@ pub fn vclezd_s64(a: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vclezh_f16(a: f16) -> u16 { - unsafe { simd_extract!(vclez_f16(vdup_n_f16(a)), 0) } + vget_lane_u16::<0>(vclez_f16(vdup_n_f16(a))) } #[doc = "Floating-point compare less than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclt_f64)"] @@ -2737,7 +2737,7 @@ pub fn vcltd_s64(a: i64, b: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vclth_f16(a: f16, b: f16) -> u16 { - unsafe { simd_extract!(vclt_f16(vdup_n_f16(a), vdup_n_f16(b)), 0) } + vget_lane_u16::<0>(vclt_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Floating-point compare less than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vclts_f32)"] @@ -2746,7 +2746,7 @@ pub fn vclth_f16(a: f16, b: f16) -> u16 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vclts_f32(a: f32, b: f32) -> u32 { - unsafe { simd_extract!(vclt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } + vget_lane_u32::<0>(vclt_f32(vdup_n_f32(a), vdup_n_f32(b))) } #[doc = "Floating-point compare less than"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltd_f64)"] @@ -2755,7 +2755,7 @@ pub fn vclts_f32(a: f32, b: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcltd_f64(a: f64, b: f64) -> u64 { - unsafe { simd_extract!(vclt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } + vget_lane_u64::<0>(vclt_f64(vdup_n_f64(a), vdup_n_f64(b))) } #[doc = "Floating-point compare less than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltz_f32)"] @@ -2884,7 +2884,7 @@ pub fn vcltzq_s64(a: int64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcltzd_f64(a: f64) -> u64 { - unsafe { simd_extract!(vcltz_f64(vdup_n_f64(a)), 0) } + vget_lane_u64::<0>(vcltz_f64(vdup_n_f64(a))) } #[doc = "Floating-point compare less than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzs_f32)"] @@ -2893,7 +2893,7 @@ pub fn vcltzd_f64(a: f64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcltzs_f32(a: f32) -> u32 { - unsafe { simd_extract!(vcltz_f32(vdup_n_f32(a)), 0) } + vget_lane_u32::<0>(vcltz_f32(vdup_n_f32(a))) } #[doc = "Compare less than zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcltzd_s64)"] @@ -2912,7 +2912,7 @@ pub fn vcltzd_s64(a: i64) -> u64 { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vcltzh_f16(a: f16) -> u16 { - unsafe { simd_extract!(vcltz_f16(vdup_n_f16(a)), 0) } + vget_lane_u16::<0>(vcltz_f16(vdup_n_f16(a))) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_f16)"] @@ -4662,7 +4662,7 @@ pub fn vcopy_laneq_f64( ) -> float64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } + unsafe { transmute(vgetq_lane_f64::(b)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s64)"] @@ -4677,7 +4677,7 @@ pub fn vcopy_laneq_s64( ) -> int64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } + unsafe { transmute(vgetq_lane_s64::(b)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u64)"] @@ -4692,7 +4692,7 @@ pub fn vcopy_laneq_u64( ) -> uint64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } + unsafe { transmute(vgetq_lane_u64::(b)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_p64)"] @@ -4707,7 +4707,7 @@ pub fn vcopy_laneq_p64( ) -> poly64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - unsafe { transmute::(simd_extract!(b, LANE2 as u32)) } + unsafe { transmute(vgetq_lane_p64::(b)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_f32)"] @@ -9763,7 +9763,7 @@ pub fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t { #[cfg_attr(test, assert_instr(fcvtxn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcvtxd_f32_f64(a: f64) -> f32 { - unsafe { simd_extract!(vcvtx_f32_f64(vdupq_n_f64(a)), 0) } + vget_lane_f32::<0>(vcvtx_f32_f64(vdupq_n_f64(a))) } #[doc = "Divide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdiv_f16)"] @@ -9862,7 +9862,7 @@ pub fn vdup_lane_p64(a: poly64x1_t) -> poly64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdup_laneq_f64(a: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(N, 1); - unsafe { transmute::(simd_extract!(a, N as u32)) } + unsafe { transmute(vgetq_lane_f64::(a)) } } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_p64)"] @@ -9873,7 +9873,7 @@ pub fn vdup_laneq_f64(a: float64x2_t) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdup_laneq_p64(a: poly64x2_t) -> poly64x1_t { static_assert_uimm_bits!(N, 1); - unsafe { transmute::(simd_extract!(a, N as u32)) } + unsafe { transmute(vgetq_lane_p64::(a)) } } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_s8)"] @@ -9884,7 +9884,7 @@ pub fn vdup_laneq_p64(a: poly64x2_t) -> poly64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_lane_s8(a: int8x8_t) -> i8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vget_lane_s8::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_s16)"] @@ -9895,7 +9895,7 @@ pub fn vdupb_lane_s8(a: int8x8_t) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_laneq_s16(a: int16x8_t) -> i16 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_s16::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_u8)"] @@ -9906,7 +9906,7 @@ pub fn vduph_laneq_s16(a: int16x8_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_lane_u8(a: uint8x8_t) -> u8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vget_lane_u8::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_u16)"] @@ -9917,7 +9917,7 @@ pub fn vdupb_lane_u8(a: uint8x8_t) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_laneq_u16(a: uint16x8_t) -> u16 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_u16::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_lane_p8)"] @@ -9928,7 +9928,7 @@ pub fn vduph_laneq_u16(a: uint16x8_t) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_lane_p8(a: poly8x8_t) -> p8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vget_lane_p8::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_p16)"] @@ -9939,7 +9939,7 @@ pub fn vdupb_lane_p8(a: poly8x8_t) -> p8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_laneq_p16(a: poly16x8_t) -> p16 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_p16::(a) } #[doc = "Extract an element from a vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_s8)"] @@ -9950,7 +9950,7 @@ pub fn vduph_laneq_p16(a: poly16x8_t) -> p16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_laneq_s8(a: int8x16_t) -> i8 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_s8::(a) } #[doc = "Extract an element from a vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_u8)"] @@ -9961,7 +9961,7 @@ pub fn vdupb_laneq_s8(a: int8x16_t) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_laneq_u8(a: uint8x16_t) -> u8 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_u8::(a) } #[doc = "Extract an element from a vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupb_laneq_p8)"] @@ -9972,7 +9972,7 @@ pub fn vdupb_laneq_u8(a: uint8x16_t) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupb_laneq_p8(a: poly8x16_t) -> p8 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_p8::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_f64)"] @@ -9983,7 +9983,7 @@ pub fn vdupb_laneq_p8(a: poly8x16_t) -> p8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_lane_f64(a: float64x1_t) -> f64 { static_assert!(N == 0); - unsafe { simd_extract!(a, N as u32) } + vget_lane_f64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_s64)"] @@ -9994,7 +9994,7 @@ pub fn vdupd_lane_f64(a: float64x1_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_lane_s64(a: int64x1_t) -> i64 { static_assert!(N == 0); - unsafe { simd_extract!(a, N as u32) } + vget_lane_s64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_lane_u64)"] @@ -10005,7 +10005,7 @@ pub fn vdupd_lane_s64(a: int64x1_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_lane_u64(a: uint64x1_t) -> u64 { static_assert!(N == 0); - unsafe { simd_extract!(a, N as u32) } + vget_lane_u64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_f16)"] @@ -10017,7 +10017,7 @@ pub fn vdupd_lane_u64(a: uint64x1_t) -> u64 { #[cfg(not(target_arch = "arm64ec"))] pub fn vduph_lane_f16(a: float16x4_t) -> f16 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vget_lane_f16::(a) } #[doc = "Extract an element from a vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_laneq_f16)"] @@ -10029,7 +10029,7 @@ pub fn vduph_lane_f16(a: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vduph_laneq_f16(a: float16x8_t) -> f16 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_f16::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_f64)"] @@ -10084,7 +10084,7 @@ pub fn vdupq_laneq_p64(a: poly64x2_t) -> poly64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_lane_f32(a: float32x2_t) -> f32 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vget_lane_f32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_f64)"] @@ -10095,7 +10095,7 @@ pub fn vdups_lane_f32(a: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_laneq_f64(a: float64x2_t) -> f64 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_f64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_lane_s32)"] @@ -10106,7 +10106,7 @@ pub fn vdupd_laneq_f64(a: float64x2_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_lane_s32(a: int32x2_t) -> i32 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vget_lane_s32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_s64)"] @@ -10117,7 +10117,7 @@ pub fn vdups_lane_s32(a: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_laneq_s64(a: int64x2_t) -> i64 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_s64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_lane_u32)"] @@ -10128,7 +10128,7 @@ pub fn vdupd_laneq_s64(a: int64x2_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_lane_u32(a: uint32x2_t) -> u32 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vget_lane_u32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupd_laneq_u64)"] @@ -10139,7 +10139,7 @@ pub fn vdups_lane_u32(a: uint32x2_t) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdupd_laneq_u64(a: uint64x2_t) -> u64 { static_assert_uimm_bits!(N, 1); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_u64::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_f32)"] @@ -10150,7 +10150,7 @@ pub fn vdupd_laneq_u64(a: uint64x2_t) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_laneq_f32(a: float32x4_t) -> f32 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_f32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_s16)"] @@ -10161,7 +10161,7 @@ pub fn vdups_laneq_f32(a: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_lane_s16(a: int16x4_t) -> i16 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vget_lane_s16::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_s32)"] @@ -10172,7 +10172,7 @@ pub fn vduph_lane_s16(a: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_laneq_s32(a: int32x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_s32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_u16)"] @@ -10183,7 +10183,7 @@ pub fn vdups_laneq_s32(a: int32x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_lane_u16(a: uint16x4_t) -> u16 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vget_lane_u16::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdups_laneq_u32)"] @@ -10194,7 +10194,7 @@ pub fn vduph_lane_u16(a: uint16x4_t) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vdups_laneq_u32(a: uint32x4_t) -> u32 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vgetq_lane_u32::(a) } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vduph_lane_p16)"] @@ -10205,7 +10205,7 @@ pub fn vdups_laneq_u32(a: uint32x4_t) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vduph_lane_p16(a: poly16x4_t) -> p16 { static_assert_uimm_bits!(N, 2); - unsafe { simd_extract!(a, N as u32) } + vget_lane_p16::(a) } #[doc = "Three-way exclusive OR"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/veor3q_s8)"] @@ -10392,7 +10392,7 @@ pub fn vfma_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfma_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } + vfma_f16(a, b, vdup_n_f16(vget_lane_f16::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f16)"] @@ -10408,7 +10408,7 @@ pub fn vfma_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vfma_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } + vfma_f16(a, b, vdup_n_f16(vgetq_lane_f16::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_lane_f16)"] @@ -10424,7 +10424,7 @@ pub fn vfmaq_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfmaq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } + vfmaq_f16(a, b, vdupq_n_f16(vget_lane_f16::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f16)"] @@ -10440,7 +10440,7 @@ pub fn vfmaq_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vfmaq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } + vfmaq_f16(a, b, vdupq_n_f16(vgetq_lane_f16::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f32)"] @@ -10455,7 +10455,7 @@ pub fn vfma_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } + vfma_f32(a, b, vdup_n_f32(vget_lane_f32::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f32)"] @@ -10470,7 +10470,7 @@ pub fn vfma_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } + vfma_f32(a, b, vdup_n_f32(vgetq_lane_f32::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_lane_f32)"] @@ -10485,7 +10485,7 @@ pub fn vfmaq_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } + vfmaq_f32(a, b, vdupq_n_f32(vget_lane_f32::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f32)"] @@ -10500,7 +10500,7 @@ pub fn vfmaq_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } + vfmaq_f32(a, b, vdupq_n_f32(vgetq_lane_f32::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_laneq_f64)"] @@ -10515,7 +10515,7 @@ pub fn vfmaq_laneq_f64( c: float64x2_t, ) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } + vfmaq_f64(a, b, vdupq_n_f64(vgetq_lane_f64::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f64)"] @@ -10530,7 +10530,7 @@ pub fn vfma_lane_f64( c: float64x1_t, ) -> float64x1_t { static_assert!(LANE == 0); - unsafe { vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } + vfma_f64(a, b, vdup_n_f64(vget_lane_f64::(c))) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f64)"] @@ -10545,7 +10545,7 @@ pub fn vfma_laneq_f64( c: float64x2_t, ) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } + vfma_f64(a, b, vdup_n_f64(vgetq_lane_f64::(c))) } #[doc = "Floating-point fused Multiply-Subtract from accumulator."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_n_f16)"] @@ -10585,10 +10585,8 @@ pub fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vfmad_lane_f64(a: f64, b: f64, c: float64x1_t) -> f64 { static_assert!(LANE == 0); - unsafe { - let c: f64 = simd_extract!(c, LANE as u32); - fmaf64(b, c, a) - } + let c: f64 = vget_lane_f64::(c); + fmaf64(b, c, a) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmah_f16)"] @@ -10610,10 +10608,8 @@ pub fn vfmah_f16(a: f16, b: f16, c: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vfmah_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: f16 = simd_extract!(v, LANE as u32); - vfmah_f16(a, b, c) - } + let c: f16 = vget_lane_f16::(v); + vfmah_f16(a, b, c) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmah_laneq_f16)"] @@ -10625,10 +10621,8 @@ pub fn vfmah_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vfmah_laneq_f16(a: f16, b: f16, v: float16x8_t) -> f16 { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: f16 = simd_extract!(v, LANE as u32); - vfmah_f16(a, b, c) - } + let c: f16 = vgetq_lane_f16::(v); + vfmah_f16(a, b, c) } #[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f64)"] @@ -10652,7 +10646,7 @@ pub fn vfmaq_lane_f64( c: float64x1_t, ) -> float64x2_t { static_assert!(LANE == 0); - unsafe { vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } + vfmaq_f64(a, b, vdupq_n_f64(vget_lane_f64::(c))) } #[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_n_f64)"] @@ -10672,10 +10666,8 @@ pub fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vfmas_lane_f32(a: f32, b: f32, c: float32x2_t) -> f32 { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: f32 = simd_extract!(c, LANE as u32); - fmaf32(b, c, a) - } + let c: f32 = vget_lane_f32::(c); + fmaf32(b, c, a) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmas_laneq_f32)"] @@ -10686,10 +10678,8 @@ pub fn vfmas_lane_f32(a: f32, b: f32, c: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vfmas_laneq_f32(a: f32, b: f32, c: float32x4_t) -> f32 { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: f32 = simd_extract!(c, LANE as u32); - fmaf32(b, c, a) - } + let c: f32 = vgetq_lane_f32::(c); + fmaf32(b, c, a) } #[doc = "Floating-point fused multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmad_laneq_f64)"] @@ -10700,10 +10690,8 @@ pub fn vfmas_laneq_f32(a: f32, b: f32, c: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vfmad_laneq_f64(a: f64, b: f64, c: float64x2_t) -> f64 { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: f64 = simd_extract!(c, LANE as u32); - fmaf64(b, c, a) - } + let c: f64 = vgetq_lane_f64::(c); + fmaf64(b, c, a) } #[doc = "Floating-point fused Multiply-Add Long to accumulator (vector)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlal_high_f16)"] @@ -11147,7 +11135,7 @@ pub fn vfms_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfms_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } + vfms_f16(a, b, vdup_n_f16(vget_lane_f16::(c))) } #[doc = "Floating-point fused multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f16)"] @@ -11163,7 +11151,7 @@ pub fn vfms_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vfms_f16(a, b, vdup_n_f16(simd_extract!(c, LANE as u32))) } + vfms_f16(a, b, vdup_n_f16(vgetq_lane_f16::(c))) } #[doc = "Floating-point fused multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_lane_f16)"] @@ -11179,7 +11167,7 @@ pub fn vfmsq_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfmsq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } + vfmsq_f16(a, b, vdupq_n_f16(vget_lane_f16::(c))) } #[doc = "Floating-point fused multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f16)"] @@ -11195,7 +11183,7 @@ pub fn vfmsq_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vfmsq_f16(a, b, vdupq_n_f16(simd_extract!(c, LANE as u32))) } + vfmsq_f16(a, b, vdupq_n_f16(vgetq_lane_f16::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_lane_f32)"] @@ -11210,7 +11198,7 @@ pub fn vfms_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } + vfms_f32(a, b, vdup_n_f32(vget_lane_f32::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f32)"] @@ -11225,7 +11213,7 @@ pub fn vfms_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } + vfms_f32(a, b, vdup_n_f32(vgetq_lane_f32::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_lane_f32)"] @@ -11240,7 +11228,7 @@ pub fn vfmsq_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } + vfmsq_f32(a, b, vdupq_n_f32(vget_lane_f32::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f32)"] @@ -11255,7 +11243,7 @@ pub fn vfmsq_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } + vfmsq_f32(a, b, vdupq_n_f32(vgetq_lane_f32::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_laneq_f64)"] @@ -11270,7 +11258,7 @@ pub fn vfmsq_laneq_f64( c: float64x2_t, ) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } + vfmsq_f64(a, b, vdupq_n_f64(vgetq_lane_f64::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_lane_f64)"] @@ -11285,7 +11273,7 @@ pub fn vfms_lane_f64( c: float64x1_t, ) -> float64x1_t { static_assert!(LANE == 0); - unsafe { vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } + vfms_f64(a, b, vdup_n_f64(vget_lane_f64::(c))) } #[doc = "Floating-point fused multiply-subtract to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_laneq_f64)"] @@ -11300,7 +11288,7 @@ pub fn vfms_laneq_f64( c: float64x2_t, ) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } + vfms_f64(a, b, vdup_n_f64(vgetq_lane_f64::(c))) } #[doc = "Floating-point fused Multiply-Subtract from accumulator."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfms_n_f16)"] @@ -11351,10 +11339,8 @@ pub fn vfmsh_f16(a: f16, b: f16, c: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vfmsh_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: f16 = simd_extract!(v, LANE as u32); - vfmsh_f16(a, b, c) - } + let c: f16 = vget_lane_f16::(v); + vfmsh_f16(a, b, c) } #[doc = "Floating-point fused multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsh_laneq_f16)"] @@ -11366,10 +11352,8 @@ pub fn vfmsh_lane_f16(a: f16, b: f16, v: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vfmsh_laneq_f16(a: f16, b: f16, v: float16x8_t) -> f16 { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: f16 = simd_extract!(v, LANE as u32); - vfmsh_f16(a, b, c) - } + let c: f16 = vgetq_lane_f16::(v); + vfmsh_f16(a, b, c) } #[doc = "Floating-point fused multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_f64)"] @@ -11396,7 +11380,7 @@ pub fn vfmsq_lane_f64( c: float64x1_t, ) -> float64x2_t { static_assert!(LANE == 0); - unsafe { vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } + vfmsq_f64(a, b, vdupq_n_f64(vget_lane_f64::(c))) } #[doc = "Floating-point fused Multiply-subtract to accumulator(vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmsq_n_f64)"] @@ -15152,7 +15136,7 @@ pub fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmul_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - unsafe { simd_mul(a, transmute::(simd_extract!(b, LANE as u32))) } + unsafe { simd_mul(a, transmute::(vget_lane_f64::(b))) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f16)"] @@ -15187,7 +15171,7 @@ pub fn vmulq_laneq_f16(a: float16x8_t, b: float16x8_t) -> float #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmul_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - unsafe { simd_mul(a, transmute::(simd_extract!(b, LANE as u32))) } + unsafe { simd_mul(a, transmute::(vgetq_lane_f64::(b))) } } #[doc = "Vector multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_f64)"] @@ -15216,10 +15200,8 @@ pub fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmuld_lane_f64(a: f64, b: float64x1_t) -> f64 { static_assert!(LANE == 0); - unsafe { - let b: f64 = simd_extract!(b, LANE as u32); - a * b - } + let b: f64 = vget_lane_f64::(b); + a * b } #[doc = "Add"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulh_f16)"] @@ -15241,10 +15223,8 @@ pub fn vmulh_f16(a: f16, b: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vmulh_lane_f16(a: f16, b: float16x4_t) -> f16 { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: f16 = simd_extract!(b, LANE as u32); - a * b - } + let b: f16 = vget_lane_f16::(b); + a * b } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulh_laneq_f16)"] @@ -15256,10 +15236,8 @@ pub fn vmulh_lane_f16(a: f16, b: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vmulh_laneq_f16(a: f16, b: float16x8_t) -> f16 { static_assert_uimm_bits!(LANE, 3); - unsafe { - let b: f16 = simd_extract!(b, LANE as u32); - a * b - } + let b: f16 = vgetq_lane_f16::(b); + a * b } #[doc = "Multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_s16)"] @@ -15392,7 +15370,7 @@ pub fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(pmull2))] pub fn vmull_high_p64(a: poly64x2_t, b: poly64x2_t) -> p128 { - unsafe { vmull_p64(simd_extract!(a, 1), simd_extract!(b, 1)) } + vmull_p64(vgetq_lane_p64::<1>(a), vgetq_lane_p64::<1>(b)) } #[doc = "Polynomial multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_p8)"] @@ -15532,10 +15510,8 @@ pub fn vmulq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32 { static_assert_uimm_bits!(LANE, 1); - unsafe { - let b: f32 = simd_extract!(b, LANE as u32); - a * b - } + let b: f32 = vget_lane_f32::(b); + a * b } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuls_laneq_f32)"] @@ -15546,10 +15522,8 @@ pub fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32 { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: f32 = simd_extract!(b, LANE as u32); - a * b - } + let b: f32 = vgetq_lane_f32::(b); + a * b } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuld_laneq_f64)"] @@ -15560,10 +15534,8 @@ pub fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmuld_laneq_f64(a: f64, b: float64x2_t) -> f64 { static_assert_uimm_bits!(LANE, 1); - unsafe { - let b: f64 = simd_extract!(b, LANE as u32); - a * b - } + let b: f64 = vgetq_lane_f64::(b); + a * b } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_f16)"] @@ -15775,7 +15747,7 @@ pub fn vmulxq_laneq_f64(a: float64x2_t, b: float64x2_t) -> floa #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulx_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - unsafe { vmulx_f64(a, transmute::(simd_extract!(b, LANE as u32))) } + unsafe { vmulx_f64(a, transmute(vget_lane_f64::(b))) } } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f64)"] @@ -15786,7 +15758,7 @@ pub fn vmulx_lane_f64(a: float64x1_t, b: float64x1_t) -> float6 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulx_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulx_f64(a, transmute::(simd_extract!(b, LANE as u32))) } + unsafe { vmulx_f64(a, transmute(vgetq_lane_f64::(b))) } } #[doc = "Vector multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_n_f16)"] @@ -15849,7 +15821,7 @@ pub fn vmulxs_f32(a: f32, b: f32) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64 { static_assert!(LANE == 0); - unsafe { vmulxd_f64(a, simd_extract!(b, LANE as u32)) } + vmulxd_f64(a, vget_lane_f64::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxd_laneq_f64)"] @@ -15860,7 +15832,7 @@ pub fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxd_laneq_f64(a: f64, b: float64x2_t) -> f64 { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulxd_f64(a, simd_extract!(b, LANE as u32)) } + vmulxd_f64(a, vgetq_lane_f64::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxs_lane_f32)"] @@ -15871,7 +15843,7 @@ pub fn vmulxd_laneq_f64(a: f64, b: float64x2_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32 { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulxs_f32(a, simd_extract!(b, LANE as u32)) } + vmulxs_f32(a, vget_lane_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxs_laneq_f32)"] @@ -15882,7 +15854,7 @@ pub fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxs_laneq_f32(a: f32, b: float32x4_t) -> f32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulxs_f32(a, simd_extract!(b, LANE as u32)) } + vmulxs_f32(a, vgetq_lane_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxh_f16)"] @@ -15911,7 +15883,7 @@ pub fn vmulxh_f16(a: f16, b: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vmulxh_lane_f16(a: f16, b: float16x4_t) -> f16 { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulxh_f16(a, simd_extract!(b, LANE as u32)) } + vmulxh_f16(a, vget_lane_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxh_laneq_f16)"] @@ -15923,7 +15895,7 @@ pub fn vmulxh_lane_f16(a: f16, b: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] pub fn vmulxh_laneq_f16(a: f16, b: float16x8_t) -> f16 { static_assert_uimm_bits!(LANE, 3); - unsafe { vmulxh_f16(a, simd_extract!(b, LANE as u32)) } + vmulxh_f16(a, vgetq_lane_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f64)"] @@ -15998,11 +15970,9 @@ pub fn vnegh_f16(a: f16) -> f16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub fn vpaddd_f64(a: float64x2_t) -> f64 { - unsafe { - let a1: f64 = simd_extract!(a, 0); - let a2: f64 = simd_extract!(a, 1); - a1 + a2 - } + let a1: f64 = vgetq_lane_f64::<0>(a); + let a2: f64 = vgetq_lane_f64::<1>(a); + a1 + a2 } #[doc = "Floating-point add pairwise"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpadds_f32)"] @@ -16011,11 +15981,9 @@ pub fn vpaddd_f64(a: float64x2_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub fn vpadds_f32(a: float32x2_t) -> f32 { - unsafe { - let a1: f32 = simd_extract!(a, 0); - let a2: f32 = simd_extract!(a, 1); - a1 + a2 - } + let a1: f32 = vget_lane_f32::<0>(a); + let a2: f32 = vget_lane_f32::<1>(a); + a1 + a2 } #[doc = "Add pairwise"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddd_s64)"] @@ -16834,7 +16802,7 @@ pub fn vqabsq_s64(a: int64x2_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] pub fn vqabsb_s8(a: i8) -> i8 { - unsafe { simd_extract!(vqabs_s8(vdup_n_s8(a)), 0) } + vget_lane_s8::<0>(vqabs_s8(vdup_n_s8(a))) } #[doc = "Signed saturating absolute value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabsh_s16)"] @@ -16843,7 +16811,7 @@ pub fn vqabsb_s8(a: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(sqabs))] pub fn vqabsh_s16(a: i16) -> i16 { - unsafe { simd_extract!(vqabs_s16(vdup_n_s16(a)), 0) } + vget_lane_s16::<0>(vqabs_s16(vdup_n_s16(a))) } #[doc = "Signed saturating absolute value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqabss_s32)"] @@ -16886,7 +16854,7 @@ pub fn vqabsd_s64(a: i64) -> i64 { pub fn vqaddb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = vdup_n_s8(b); - unsafe { simd_extract!(vqadd_s8(a, b), 0) } + vget_lane_s8::<0>(vqadd_s8(a, b)) } #[doc = "Saturating add"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddh_s16)"] @@ -16897,7 +16865,7 @@ pub fn vqaddb_s8(a: i8, b: i8) -> i8 { pub fn vqaddh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqadd_s16(a, b), 0) } + vget_lane_s16::<0>(vqadd_s16(a, b)) } #[doc = "Saturating add"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddb_u8)"] @@ -16908,7 +16876,7 @@ pub fn vqaddh_s16(a: i16, b: i16) -> i16 { pub fn vqaddb_u8(a: u8, b: u8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: uint8x8_t = vdup_n_u8(b); - unsafe { simd_extract!(vqadd_u8(a, b), 0) } + vget_lane_u8::<0>(vqadd_u8(a, b)) } #[doc = "Saturating add"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqaddh_u16)"] @@ -16919,7 +16887,7 @@ pub fn vqaddb_u8(a: u8, b: u8) -> u8 { pub fn vqaddh_u16(a: u16, b: u16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: uint16x4_t = vdup_n_u16(b); - unsafe { simd_extract!(vqadd_u16(a, b), 0) } + vget_lane_u16::<0>(vqadd_u16(a, b)) } #[doc = "Saturating add"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqadds_s32)"] @@ -17096,7 +17064,7 @@ pub fn vqdmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqdmlalh_s16(a, b, vget_lane_s16::(c)) } #[doc = "Signed saturating doubling multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlalh_laneq_s16)"] @@ -17107,7 +17075,7 @@ pub fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlalh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 { static_assert_uimm_bits!(LANE, 3); - unsafe { vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqdmlalh_s16(a, b, vgetq_lane_s16::(c)) } #[doc = "Signed saturating doubling multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_lane_s32)"] @@ -17118,7 +17086,7 @@ pub fn vqdmlalh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlals_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { static_assert_uimm_bits!(LANE, 1); - unsafe { vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } + vqdmlals_s32(a, b, vget_lane_s32::(c)) } #[doc = "Signed saturating doubling multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_laneq_s32)"] @@ -17129,7 +17097,7 @@ pub fn vqdmlals_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlals_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } + vqdmlals_s32(a, b, vgetq_lane_s32::(c)) } #[doc = "Signed saturating doubling multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlalh_s16)"] @@ -17139,7 +17107,7 @@ pub fn vqdmlals_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlalh_s16(a: i32, b: i16, c: i16) -> i32 { let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); - unsafe { vqadds_s32(a, simd_extract!(x, 0)) } + vqadds_s32(a, vgetq_lane_s32::<0>(x)) } #[doc = "Signed saturating doubling multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlals_s32)"] @@ -17262,7 +17230,7 @@ pub fn vqdmlsl_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqdmlslh_s16(a, b, vget_lane_s16::(c)) } #[doc = "Signed saturating doubling multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlslh_laneq_s16)"] @@ -17273,7 +17241,7 @@ pub fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 { static_assert_uimm_bits!(LANE, 3); - unsafe { vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqdmlslh_s16(a, b, vgetq_lane_s16::(c)) } #[doc = "Signed saturating doubling multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_lane_s32)"] @@ -17284,7 +17252,7 @@ pub fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { static_assert_uimm_bits!(LANE, 1); - unsafe { vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } + vqdmlsls_s32(a, b, vget_lane_s32::(c)) } #[doc = "Signed saturating doubling multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_laneq_s32)"] @@ -17295,7 +17263,7 @@ pub fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } + vqdmlsls_s32(a, b, vgetq_lane_s32::(c)) } #[doc = "Signed saturating doubling multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlslh_s16)"] @@ -17305,7 +17273,7 @@ pub fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmlslh_s16(a: i32, b: i16, c: i16) -> i32 { let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); - unsafe { vqsubs_s32(a, simd_extract!(x, 0)) } + vqsubs_s32(a, vgetq_lane_s32::<0>(x)) } #[doc = "Signed saturating doubling multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmlsls_s32)"] @@ -17326,7 +17294,7 @@ pub fn vqdmlsls_s32(a: i64, b: i32, c: i32) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } + vqdmulh_s16(a, vdup_n_s16(vget_lane_s16::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_lane_s16)"] @@ -17337,7 +17305,7 @@ pub fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } + vqdmulhq_s16(a, vdupq_n_s16(vget_lane_s16::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_lane_s32)"] @@ -17348,7 +17316,7 @@ pub fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } + vqdmulh_s32(a, vdup_n_s32(vget_lane_s32::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_lane_s32)"] @@ -17359,7 +17327,7 @@ pub fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } + vqdmulhq_s32(a, vdupq_n_s32(vget_lane_s32::(b))) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_lane_s16)"] @@ -17370,10 +17338,8 @@ pub fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { static_assert_uimm_bits!(N, 2); - unsafe { - let b: i16 = simd_extract!(b, N as u32); - vqdmulhh_s16(a, b) - } + let b: i16 = vget_lane_s16::(b); + vqdmulhh_s16(a, b) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_laneq_s16)"] @@ -17384,10 +17350,8 @@ pub fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { static_assert_uimm_bits!(N, 3); - unsafe { - let b: i16 = simd_extract!(b, N as u32); - vqdmulhh_s16(a, b) - } + let b: i16 = vgetq_lane_s16::(b); + vqdmulhh_s16(a, b) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhh_s16)"] @@ -17398,7 +17362,7 @@ pub fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { pub fn vqdmulhh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqdmulh_s16(a, b), 0) } + vget_lane_s16::<0>(vqdmulh_s16(a, b)) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_s32)"] @@ -17409,7 +17373,7 @@ pub fn vqdmulhh_s16(a: i16, b: i16) -> i16 { pub fn vqdmulhs_s32(a: i32, b: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); - unsafe { simd_extract!(vqdmulh_s32(a, b), 0) } + vget_lane_s32::<0>(vqdmulh_s32(a, b)) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_lane_s32)"] @@ -17420,10 +17384,8 @@ pub fn vqdmulhs_s32(a: i32, b: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { static_assert_uimm_bits!(N, 1); - unsafe { - let b: i32 = simd_extract!(b, N as u32); - vqdmulhs_s32(a, b) - } + let b: i32 = vget_lane_s32::(b); + vqdmulhs_s32(a, b) } #[doc = "Signed saturating doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhs_laneq_s32)"] @@ -17434,10 +17396,8 @@ pub fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - unsafe { - let b: i32 = simd_extract!(b, N as u32); - vqdmulhs_s32(a, b) - } + let b: i32 = vgetq_lane_s32::(b); + vqdmulhs_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_lane_s16)"] @@ -17588,10 +17548,8 @@ pub fn vqdmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - unsafe { - let b: i16 = simd_extract!(b, N as u32); - vqdmullh_s16(a, b) - } + let b: i16 = vget_lane_s16::(b); + vqdmullh_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_laneq_s32)"] @@ -17602,10 +17560,8 @@ pub fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulls_laneq_s32(a: i32, b: int32x4_t) -> i64 { static_assert_uimm_bits!(N, 2); - unsafe { - let b: i32 = simd_extract!(b, N as u32); - vqdmulls_s32(a, b) - } + let b: i32 = vgetq_lane_s32::(b); + vqdmulls_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_laneq_s16)"] @@ -17616,10 +17572,8 @@ pub fn vqdmulls_laneq_s32(a: i32, b: int32x4_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32 { static_assert_uimm_bits!(N, 3); - unsafe { - let b: i16 = simd_extract!(b, N as u32); - vqdmullh_s16(a, b) - } + let b: i16 = vgetq_lane_s16::(b); + vqdmullh_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_s16)"] @@ -17630,7 +17584,7 @@ pub fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32 { pub fn vqdmullh_s16(a: i16, b: i16) -> i32 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqdmull_s16(a, b), 0) } + vgetq_lane_s32::<0>(vqdmull_s16(a, b)) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_lane_s32)"] @@ -17641,10 +17595,8 @@ pub fn vqdmullh_s16(a: i16, b: i16) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmulls_lane_s32(a: i32, b: int32x2_t) -> i64 { static_assert_uimm_bits!(N, 1); - unsafe { - let b: i32 = simd_extract!(b, N as u32); - vqdmulls_s32(a, b) - } + let b: i32 = vget_lane_s32::(b); + vqdmulls_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulls_s32)"] @@ -17767,7 +17719,7 @@ pub fn vqmovnd_u64(a: u64) -> u32 { #[cfg_attr(test, assert_instr(sqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovnh_s16(a: i16) -> i8 { - unsafe { simd_extract!(vqmovn_s16(vdupq_n_s16(a)), 0) } + vget_lane_s8::<0>(vqmovn_s16(vdupq_n_s16(a))) } #[doc = "Saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovns_s32)"] @@ -17776,7 +17728,7 @@ pub fn vqmovnh_s16(a: i16) -> i8 { #[cfg_attr(test, assert_instr(sqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovns_s32(a: i32) -> i16 { - unsafe { simd_extract!(vqmovn_s32(vdupq_n_s32(a)), 0) } + vget_lane_s16::<0>(vqmovn_s32(vdupq_n_s32(a))) } #[doc = "Saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnh_u16)"] @@ -17785,7 +17737,7 @@ pub fn vqmovns_s32(a: i32) -> i16 { #[cfg_attr(test, assert_instr(uqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovnh_u16(a: u16) -> u8 { - unsafe { simd_extract!(vqmovn_u16(vdupq_n_u16(a)), 0) } + vget_lane_u8::<0>(vqmovn_u16(vdupq_n_u16(a))) } #[doc = "Saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovns_u32)"] @@ -17794,7 +17746,7 @@ pub fn vqmovnh_u16(a: u16) -> u8 { #[cfg_attr(test, assert_instr(uqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovns_u32(a: u32) -> u16 { - unsafe { simd_extract!(vqmovn_u32(vdupq_n_u32(a)), 0) } + vget_lane_u16::<0>(vqmovn_u32(vdupq_n_u32(a))) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s16)"] @@ -17836,7 +17788,7 @@ pub fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovunh_s16(a: i16) -> u8 { - unsafe { simd_extract!(vqmovun_s16(vdupq_n_s16(a)), 0) } + vget_lane_u8::<0>(vqmovun_s16(vdupq_n_s16(a))) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovuns_s32)"] @@ -17845,7 +17797,7 @@ pub fn vqmovunh_s16(a: i16) -> u8 { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovuns_s32(a: i32) -> u16 { - unsafe { simd_extract!(vqmovun_s32(vdupq_n_s32(a)), 0) } + vget_lane_u16::<0>(vqmovun_s32(vdupq_n_s32(a))) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovund_s64)"] @@ -17854,7 +17806,7 @@ pub fn vqmovuns_s32(a: i32) -> u16 { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovund_s64(a: i64) -> u32 { - unsafe { simd_extract!(vqmovun_s64(vdupq_n_s64(a)), 0) } + vget_lane_u32::<0>(vqmovun_s64(vdupq_n_s64(a))) } #[doc = "Signed saturating negate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqneg_s64)"] @@ -17895,7 +17847,7 @@ pub fn vqnegq_s64(a: int64x2_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(sqneg))] pub fn vqnegb_s8(a: i8) -> i8 { - unsafe { simd_extract!(vqneg_s8(vdup_n_s8(a)), 0) } + vget_lane_s8::<0>(vqneg_s8(vdup_n_s8(a))) } #[doc = "Signed saturating negate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegh_s16)"] @@ -17904,7 +17856,7 @@ pub fn vqnegb_s8(a: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(sqneg))] pub fn vqnegh_s16(a: i16) -> i16 { - unsafe { simd_extract!(vqneg_s16(vdup_n_s16(a)), 0) } + vget_lane_s16::<0>(vqneg_s16(vdup_n_s16(a))) } #[doc = "Signed saturating negate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegs_s32)"] @@ -17913,7 +17865,7 @@ pub fn vqnegh_s16(a: i16) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(sqneg))] pub fn vqnegs_s32(a: i32) -> i32 { - unsafe { simd_extract!(vqneg_s32(vdup_n_s32(a)), 0) } + vget_lane_s32::<0>(vqneg_s32(vdup_n_s32(a))) } #[doc = "Signed saturating negate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqnegd_s64)"] @@ -17922,7 +17874,7 @@ pub fn vqnegs_s32(a: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(sqneg))] pub fn vqnegd_s64(a: i64) -> i64 { - unsafe { simd_extract!(vqneg_s64(vdup_n_s64(a)), 0) } + vget_lane_s64::<0>(vqneg_s64(vdup_n_s64(a))) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_lane_s16)"] @@ -18109,7 +18061,7 @@ pub fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlahh_s16(a, b, vget_lane_s16::(c)) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahh_laneq_s16)"] @@ -18120,7 +18072,7 @@ pub fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - unsafe { vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlahh_s16(a, b, vgetq_lane_s16::(c)) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_lane_s32)"] @@ -18131,7 +18083,7 @@ pub fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - unsafe { vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlahs_s32(a, b, vget_lane_s32::(c)) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_laneq_s32)"] @@ -18142,7 +18094,7 @@ pub fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlahs_s32(a, b, vgetq_lane_s32::(c)) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahh_s16)"] @@ -18154,7 +18106,7 @@ pub fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); let c: int16x4_t = vdup_n_s16(c); - unsafe { simd_extract!(vqrdmlah_s16(a, b, c), 0) } + vget_lane_s16::<0>(vqrdmlah_s16(a, b, c)) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahs_s32)"] @@ -18166,7 +18118,7 @@ pub fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); let c: int32x2_t = vdup_n_s32(c); - unsafe { simd_extract!(vqrdmlah_s32(a, b, c), 0) } + vget_lane_s32::<0>(vqrdmlah_s32(a, b, c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_lane_s16)"] @@ -18353,7 +18305,7 @@ pub fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlshh_s16(a, b, vget_lane_s16::(c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshh_laneq_s16)"] @@ -18364,7 +18316,7 @@ pub fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - unsafe { vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlshh_s16(a, b, vgetq_lane_s16::(c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_lane_s32)"] @@ -18375,7 +18327,7 @@ pub fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - unsafe { vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlshs_s32(a, b, vget_lane_s32::(c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_laneq_s32)"] @@ -18386,7 +18338,7 @@ pub fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } + vqrdmlshs_s32(a, b, vgetq_lane_s32::(c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshh_s16)"] @@ -18398,7 +18350,7 @@ pub fn vqrdmlshh_s16(a: i16, b: i16, c: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); let c: int16x4_t = vdup_n_s16(c); - unsafe { simd_extract!(vqrdmlsh_s16(a, b, c), 0) } + vget_lane_s16::<0>(vqrdmlsh_s16(a, b, c)) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshs_s32)"] @@ -18410,7 +18362,7 @@ pub fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); let c: int32x2_t = vdup_n_s32(c); - unsafe { simd_extract!(vqrdmlsh_s32(a, b, c), 0) } + vget_lane_s32::<0>(vqrdmlsh_s32(a, b, c)) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_lane_s16)"] @@ -18421,7 +18373,7 @@ pub fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } + vqrdmulhh_s16(a, vget_lane_s16::(b)) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_laneq_s16)"] @@ -18432,7 +18384,7 @@ pub fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - unsafe { vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } + vqrdmulhh_s16(a, vgetq_lane_s16::(b)) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_lane_s32)"] @@ -18443,7 +18395,7 @@ pub fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - unsafe { vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } + vqrdmulhs_s32(a, vget_lane_s32::(b)) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_laneq_s32)"] @@ -18454,7 +18406,7 @@ pub fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - unsafe { vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } + vqrdmulhs_s32(a, vgetq_lane_s32::(b)) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhh_s16)"] @@ -18463,7 +18415,7 @@ pub fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { #[cfg_attr(test, assert_instr(sqrdmulh))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhh_s16(a: i16, b: i16) -> i16 { - unsafe { simd_extract!(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b)), 0) } + vget_lane_s16::<0>(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b))) } #[doc = "Signed saturating rounding doubling multiply returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhs_s32)"] @@ -18472,7 +18424,7 @@ pub fn vqrdmulhh_s16(a: i16, b: i16) -> i16 { #[cfg_attr(test, assert_instr(sqrdmulh))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrdmulhs_s32(a: i32, b: i32) -> i32 { - unsafe { simd_extract!(vqrdmulh_s32(vdup_n_s32(a), vdup_n_s32(b)), 0) } + vget_lane_s32::<0>(vqrdmulh_s32(vdup_n_s32(a), vdup_n_s32(b))) } #[doc = "Signed saturating rounding shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlb_s8)"] @@ -18483,7 +18435,7 @@ pub fn vqrdmulhs_s32(a: i32, b: i32) -> i32 { pub fn vqrshlb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = vdup_n_s8(b); - unsafe { simd_extract!(vqrshl_s8(a, b), 0) } + vget_lane_s8::<0>(vqrshl_s8(a, b)) } #[doc = "Signed saturating rounding shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlh_s16)"] @@ -18494,7 +18446,7 @@ pub fn vqrshlb_s8(a: i8, b: i8) -> i8 { pub fn vqrshlh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqrshl_s16(a, b), 0) } + vget_lane_s16::<0>(vqrshl_s16(a, b)) } #[doc = "Unsigned signed saturating rounding shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlb_u8)"] @@ -18505,7 +18457,7 @@ pub fn vqrshlh_s16(a: i16, b: i16) -> i16 { pub fn vqrshlb_u8(a: u8, b: i8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: int8x8_t = vdup_n_s8(b); - unsafe { simd_extract!(vqrshl_u8(a, b), 0) } + vget_lane_u8::<0>(vqrshl_u8(a, b)) } #[doc = "Unsigned signed saturating rounding shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshlh_u16)"] @@ -18516,7 +18468,7 @@ pub fn vqrshlb_u8(a: u8, b: i8) -> u8 { pub fn vqrshlh_u16(a: u16, b: i16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqrshl_u16(a, b), 0) } + vget_lane_u16::<0>(vqrshl_u16(a, b)) } #[doc = "Signed saturating rounding shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshld_s64)"] @@ -18670,7 +18622,7 @@ pub fn vqrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x pub fn vqrshrnd_n_u64(a: u64) -> u32 { static_assert!(N >= 1 && N <= 32); let a: uint64x2_t = vdupq_n_u64(a); - unsafe { simd_extract!(vqrshrn_n_u64::(a), 0) } + vget_lane_u32::<0>(vqrshrn_n_u64::(a)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnh_n_u16)"] @@ -18682,7 +18634,7 @@ pub fn vqrshrnd_n_u64(a: u64) -> u32 { pub fn vqrshrnh_n_u16(a: u16) -> u8 { static_assert!(N >= 1 && N <= 8); let a: uint16x8_t = vdupq_n_u16(a); - unsafe { simd_extract!(vqrshrn_n_u16::(a), 0) } + vget_lane_u8::<0>(vqrshrn_n_u16::(a)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrns_n_u32)"] @@ -18694,7 +18646,7 @@ pub fn vqrshrnh_n_u16(a: u16) -> u8 { pub fn vqrshrns_n_u32(a: u32) -> u16 { static_assert!(N >= 1 && N <= 16); let a: uint32x4_t = vdupq_n_u32(a); - unsafe { simd_extract!(vqrshrn_n_u32::(a), 0) } + vget_lane_u16::<0>(vqrshrn_n_u32::(a)) } #[doc = "Signed saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnh_n_s16)"] @@ -18706,7 +18658,7 @@ pub fn vqrshrns_n_u32(a: u32) -> u16 { pub fn vqrshrnh_n_s16(a: i16) -> i8 { static_assert!(N >= 1 && N <= 8); let a: int16x8_t = vdupq_n_s16(a); - unsafe { simd_extract!(vqrshrn_n_s16::(a), 0) } + vget_lane_s8::<0>(vqrshrn_n_s16::(a)) } #[doc = "Signed saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrns_n_s32)"] @@ -18718,7 +18670,7 @@ pub fn vqrshrnh_n_s16(a: i16) -> i8 { pub fn vqrshrns_n_s32(a: i32) -> i16 { static_assert!(N >= 1 && N <= 16); let a: int32x4_t = vdupq_n_s32(a); - unsafe { simd_extract!(vqrshrn_n_s32::(a), 0) } + vget_lane_s16::<0>(vqrshrn_n_s32::(a)) } #[doc = "Signed saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnd_n_s64)"] @@ -18730,7 +18682,7 @@ pub fn vqrshrns_n_s32(a: i32) -> i16 { pub fn vqrshrnd_n_s64(a: i64) -> i32 { static_assert!(N >= 1 && N <= 32); let a: int64x2_t = vdupq_n_s64(a); - unsafe { simd_extract!(vqrshrn_n_s64::(a), 0) } + vget_lane_s32::<0>(vqrshrn_n_s64::(a)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s16)"] @@ -18781,7 +18733,7 @@ pub fn vqrshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x pub fn vqrshrund_n_s64(a: i64) -> u32 { static_assert!(N >= 1 && N <= 32); let a: int64x2_t = vdupq_n_s64(a); - unsafe { simd_extract!(vqrshrun_n_s64::(a), 0) } + vget_lane_u32::<0>(vqrshrun_n_s64::(a)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrunh_n_s16)"] @@ -18793,7 +18745,7 @@ pub fn vqrshrund_n_s64(a: i64) -> u32 { pub fn vqrshrunh_n_s16(a: i16) -> u8 { static_assert!(N >= 1 && N <= 8); let a: int16x8_t = vdupq_n_s16(a); - unsafe { simd_extract!(vqrshrun_n_s16::(a), 0) } + vget_lane_u8::<0>(vqrshrun_n_s16::(a)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshruns_n_s32)"] @@ -18805,7 +18757,7 @@ pub fn vqrshrunh_n_s16(a: i16) -> u8 { pub fn vqrshruns_n_s32(a: i32) -> u16 { static_assert!(N >= 1 && N <= 16); let a: int32x4_t = vdupq_n_s32(a); - unsafe { simd_extract!(vqrshrun_n_s32::(a), 0) } + vget_lane_u16::<0>(vqrshrun_n_s32::(a)) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_n_s8)"] @@ -18816,7 +18768,7 @@ pub fn vqrshruns_n_s32(a: i32) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlb_n_s8(a: i8) -> i8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(vqshl_n_s8::(vdup_n_s8(a)), 0) } + vget_lane_s8::<0>(vqshl_n_s8::(vdup_n_s8(a))) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_n_s64)"] @@ -18827,7 +18779,7 @@ pub fn vqshlb_n_s8(a: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshld_n_s64(a: i64) -> i64 { static_assert_uimm_bits!(N, 6); - unsafe { simd_extract!(vqshl_n_s64::(vdup_n_s64(a)), 0) } + vget_lane_s64::<0>(vqshl_n_s64::(vdup_n_s64(a))) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_n_s16)"] @@ -18838,7 +18790,7 @@ pub fn vqshld_n_s64(a: i64) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlh_n_s16(a: i16) -> i16 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(vqshl_n_s16::(vdup_n_s16(a)), 0) } + vget_lane_s16::<0>(vqshl_n_s16::(vdup_n_s16(a))) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_n_s32)"] @@ -18849,7 +18801,7 @@ pub fn vqshlh_n_s16(a: i16) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshls_n_s32(a: i32) -> i32 { static_assert_uimm_bits!(N, 5); - unsafe { simd_extract!(vqshl_n_s32::(vdup_n_s32(a)), 0) } + vget_lane_s32::<0>(vqshl_n_s32::(vdup_n_s32(a))) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_n_u8)"] @@ -18860,7 +18812,7 @@ pub fn vqshls_n_s32(a: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlb_n_u8(a: u8) -> u8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(vqshl_n_u8::(vdup_n_u8(a)), 0) } + vget_lane_u8::<0>(vqshl_n_u8::(vdup_n_u8(a))) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_n_u64)"] @@ -18871,7 +18823,7 @@ pub fn vqshlb_n_u8(a: u8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshld_n_u64(a: u64) -> u64 { static_assert_uimm_bits!(N, 6); - unsafe { simd_extract!(vqshl_n_u64::(vdup_n_u64(a)), 0) } + vget_lane_u64::<0>(vqshl_n_u64::(vdup_n_u64(a))) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_n_u16)"] @@ -18882,7 +18834,7 @@ pub fn vqshld_n_u64(a: u64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlh_n_u16(a: u16) -> u16 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(vqshl_n_u16::(vdup_n_u16(a)), 0) } + vget_lane_u16::<0>(vqshl_n_u16::(vdup_n_u16(a))) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_n_u32)"] @@ -18893,7 +18845,7 @@ pub fn vqshlh_n_u16(a: u16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshls_n_u32(a: u32) -> u32 { static_assert_uimm_bits!(N, 5); - unsafe { simd_extract!(vqshl_n_u32::(vdup_n_u32(a)), 0) } + vget_lane_u32::<0>(vqshl_n_u32::(vdup_n_u32(a))) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_s8)"] @@ -18903,7 +18855,7 @@ pub fn vqshls_n_u32(a: u32) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlb_s8(a: i8, b: i8) -> i8 { let c: int8x8_t = vqshl_s8(vdup_n_s8(a), vdup_n_s8(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_s8::<0>(c) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_s16)"] @@ -18913,7 +18865,7 @@ pub fn vqshlb_s8(a: i8, b: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlh_s16(a: i16, b: i16) -> i16 { let c: int16x4_t = vqshl_s16(vdup_n_s16(a), vdup_n_s16(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_s16::<0>(c) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_s32)"] @@ -18923,7 +18875,7 @@ pub fn vqshlh_s16(a: i16, b: i16) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshls_s32(a: i32, b: i32) -> i32 { let c: int32x2_t = vqshl_s32(vdup_n_s32(a), vdup_n_s32(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_s32::<0>(c) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlb_u8)"] @@ -18933,7 +18885,7 @@ pub fn vqshls_s32(a: i32, b: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlb_u8(a: u8, b: i8) -> u8 { let c: uint8x8_t = vqshl_u8(vdup_n_u8(a), vdup_n_s8(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_u8::<0>(c) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlh_u16)"] @@ -18943,7 +18895,7 @@ pub fn vqshlb_u8(a: u8, b: i8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlh_u16(a: u16, b: i16) -> u16 { let c: uint16x4_t = vqshl_u16(vdup_n_u16(a), vdup_n_s16(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_u16::<0>(c) } #[doc = "Unsigned saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshls_u32)"] @@ -18953,7 +18905,7 @@ pub fn vqshlh_u16(a: u16, b: i16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshls_u32(a: u32, b: i32) -> u32 { let c: uint32x2_t = vqshl_u32(vdup_n_u32(a), vdup_n_s32(b)); - unsafe { simd_extract!(c, 0) } + vget_lane_u32::<0>(c) } #[doc = "Signed saturating shift left"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshld_s64)"] @@ -18996,7 +18948,7 @@ pub fn vqshld_u64(a: u64, b: i64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlub_n_s8(a: i8) -> u8 { static_assert_uimm_bits!(N, 3); - unsafe { simd_extract!(vqshlu_n_s8::(vdup_n_s8(a)), 0) } + vget_lane_u8::<0>(vqshlu_n_s8::(vdup_n_s8(a))) } #[doc = "Signed saturating shift left unsigned"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlud_n_s64)"] @@ -19007,7 +18959,7 @@ pub fn vqshlub_n_s8(a: i8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlud_n_s64(a: i64) -> u64 { static_assert_uimm_bits!(N, 6); - unsafe { simd_extract!(vqshlu_n_s64::(vdup_n_s64(a)), 0) } + vget_lane_u64::<0>(vqshlu_n_s64::(vdup_n_s64(a))) } #[doc = "Signed saturating shift left unsigned"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluh_n_s16)"] @@ -19018,7 +18970,7 @@ pub fn vqshlud_n_s64(a: i64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshluh_n_s16(a: i16) -> u16 { static_assert_uimm_bits!(N, 4); - unsafe { simd_extract!(vqshlu_n_s16::(vdup_n_s16(a)), 0) } + vget_lane_u16::<0>(vqshlu_n_s16::(vdup_n_s16(a))) } #[doc = "Signed saturating shift left unsigned"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlus_n_s32)"] @@ -19029,7 +18981,7 @@ pub fn vqshluh_n_s16(a: i16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshlus_n_s32(a: i32) -> u32 { static_assert_uimm_bits!(N, 5); - unsafe { simd_extract!(vqshlu_n_s32::(vdup_n_s32(a)), 0) } + vget_lane_u32::<0>(vqshlu_n_s32::(vdup_n_s32(a))) } #[doc = "Signed saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s16)"] @@ -19154,7 +19106,7 @@ pub fn vqshrnd_n_u64(a: u64) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrnh_n_s16(a: i16) -> i8 { static_assert!(N >= 1 && N <= 8); - unsafe { simd_extract!(vqshrn_n_s16::(vdupq_n_s16(a)), 0) } + vget_lane_s8::<0>(vqshrn_n_s16::(vdupq_n_s16(a))) } #[doc = "Signed saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrns_n_s32)"] @@ -19165,7 +19117,7 @@ pub fn vqshrnh_n_s16(a: i16) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrns_n_s32(a: i32) -> i16 { static_assert!(N >= 1 && N <= 16); - unsafe { simd_extract!(vqshrn_n_s32::(vdupq_n_s32(a)), 0) } + vget_lane_s16::<0>(vqshrn_n_s32::(vdupq_n_s32(a))) } #[doc = "Unsigned saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnh_n_u16)"] @@ -19176,7 +19128,7 @@ pub fn vqshrns_n_s32(a: i32) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrnh_n_u16(a: u16) -> u8 { static_assert!(N >= 1 && N <= 8); - unsafe { simd_extract!(vqshrn_n_u16::(vdupq_n_u16(a)), 0) } + vget_lane_u8::<0>(vqshrn_n_u16::(vdupq_n_u16(a))) } #[doc = "Unsigned saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrns_n_u32)"] @@ -19187,7 +19139,7 @@ pub fn vqshrnh_n_u16(a: u16) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrns_n_u32(a: u32) -> u16 { static_assert!(N >= 1 && N <= 16); - unsafe { simd_extract!(vqshrn_n_u32::(vdupq_n_u32(a)), 0) } + vget_lane_u16::<0>(vqshrn_n_u32::(vdupq_n_u32(a))) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s16)"] @@ -19237,7 +19189,7 @@ pub fn vqshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrund_n_s64(a: i64) -> u32 { static_assert!(N >= 1 && N <= 32); - unsafe { simd_extract!(vqshrun_n_s64::(vdupq_n_s64(a)), 0) } + vget_lane_u32::<0>(vqshrun_n_s64::(vdupq_n_s64(a))) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrunh_n_s16)"] @@ -19248,7 +19200,7 @@ pub fn vqshrund_n_s64(a: i64) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrunh_n_s16(a: i16) -> u8 { static_assert!(N >= 1 && N <= 8); - unsafe { simd_extract!(vqshrun_n_s16::(vdupq_n_s16(a)), 0) } + vget_lane_u8::<0>(vqshrun_n_s16::(vdupq_n_s16(a))) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshruns_n_s32)"] @@ -19259,7 +19211,7 @@ pub fn vqshrunh_n_s16(a: i16) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshruns_n_s32(a: i32) -> u16 { static_assert!(N >= 1 && N <= 16); - unsafe { simd_extract!(vqshrun_n_s32::(vdupq_n_s32(a)), 0) } + vget_lane_u16::<0>(vqshrun_n_s32::(vdupq_n_s32(a))) } #[doc = "Saturating subtract"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubb_s8)"] @@ -19270,7 +19222,7 @@ pub fn vqshruns_n_s32(a: i32) -> u16 { pub fn vqsubb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = vdup_n_s8(b); - unsafe { simd_extract!(vqsub_s8(a, b), 0) } + vget_lane_s8::<0>(vqsub_s8(a, b)) } #[doc = "Saturating subtract"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubh_s16)"] @@ -19281,7 +19233,7 @@ pub fn vqsubb_s8(a: i8, b: i8) -> i8 { pub fn vqsubh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - unsafe { simd_extract!(vqsub_s16(a, b), 0) } + vget_lane_s16::<0>(vqsub_s16(a, b)) } #[doc = "Saturating subtract"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubb_u8)"] @@ -19292,7 +19244,7 @@ pub fn vqsubh_s16(a: i16, b: i16) -> i16 { pub fn vqsubb_u8(a: u8, b: u8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: uint8x8_t = vdup_n_u8(b); - unsafe { simd_extract!(vqsub_u8(a, b), 0) } + vget_lane_u8::<0>(vqsub_u8(a, b)) } #[doc = "Saturating subtract"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubh_u16)"] @@ -19303,7 +19255,7 @@ pub fn vqsubb_u8(a: u8, b: u8) -> u8 { pub fn vqsubh_u16(a: u16, b: u16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: uint16x4_t = vdup_n_u16(b); - unsafe { simd_extract!(vqsub_u16(a, b), 0) } + vget_lane_u16::<0>(vqsub_u16(a, b)) } #[doc = "Saturating subtract"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqsubs_s32)"] @@ -22050,7 +22002,7 @@ pub fn vrnd32x_f64(a: float64x1_t) -> float64x1_t { )] fn _vrnd32x_f64(a: f64) -> f64; } - unsafe { transmute(_vrnd32x_f64(simd_extract!(a, 0))) } + unsafe { transmute(_vrnd32x_f64(vget_lane_f64::<0>(a))) } } #[doc = "Floating-point round to 32-bit integer toward zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32z_f32)"] @@ -22114,7 +22066,7 @@ pub fn vrnd32z_f64(a: float64x1_t) -> float64x1_t { )] fn _vrnd32z_f64(a: f64) -> f64; } - unsafe { transmute(_vrnd32z_f64(simd_extract!(a, 0))) } + unsafe { transmute(_vrnd32z_f64(vget_lane_f64::<0>(a))) } } #[doc = "Floating-point round to 64-bit integer, using current rounding mode"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64x_f32)"] @@ -22178,7 +22130,7 @@ pub fn vrnd64x_f64(a: float64x1_t) -> float64x1_t { )] fn _vrnd64x_f64(a: f64) -> f64; } - unsafe { transmute(_vrnd64x_f64(simd_extract!(a, 0))) } + unsafe { transmute(_vrnd64x_f64(vget_lane_f64::<0>(a))) } } #[doc = "Floating-point round to 64-bit integer toward zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64z_f32)"] @@ -22242,7 +22194,7 @@ pub fn vrnd64z_f64(a: float64x1_t) -> float64x1_t { )] fn _vrnd64z_f64(a: f64) -> f64; } - unsafe { transmute(_vrnd64z_f64(simd_extract!(a, 0))) } + unsafe { transmute(_vrnd64z_f64(vget_lane_f64::<0>(a))) } } #[doc = "Floating-point round to integral, toward zero"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd_f16)"] @@ -24157,7 +24109,7 @@ pub fn vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { #[cfg_attr(test, assert_instr(usqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vsqaddb_u8(a: u8, b: i8) -> u8 { - unsafe { simd_extract!(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b)), 0) } + vget_lane_u8::<0>(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b))) } #[doc = "Unsigned saturating accumulate of signed value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddh_u16)"] @@ -24166,7 +24118,7 @@ pub fn vsqaddb_u8(a: u8, b: i8) -> u8 { #[cfg_attr(test, assert_instr(usqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vsqaddh_u16(a: u16, b: i16) -> u16 { - unsafe { simd_extract!(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b)), 0) } + vget_lane_u16::<0>(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b))) } #[doc = "Unsigned saturating accumulate of signed value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsqaddd_u64)"] @@ -25845,7 +25797,7 @@ pub unsafe fn vstl1q_lane_p64(ptr: *mut p64, val: poly64x2_t) { pub unsafe fn vstl1_lane_s64(ptr: *mut i64, val: int64x1_t) { static_assert!(LANE == 0); let atomic_dst = ptr as *mut crate::sync::atomic::AtomicI64; - let lane: i64 = simd_extract!(val, LANE as u32); + let lane: i64 = vget_lane_s64::(val); (*atomic_dst).store(transmute(lane), crate::sync::atomic::Ordering::Release) } #[doc = "Store-Release a single-element structure from one lane of one register."] @@ -25861,7 +25813,7 @@ pub unsafe fn vstl1_lane_s64(ptr: *mut i64, val: int64x1_t) { pub unsafe fn vstl1q_lane_s64(ptr: *mut i64, val: int64x2_t) { static_assert_uimm_bits!(LANE, 1); let atomic_dst = ptr as *mut crate::sync::atomic::AtomicI64; - let lane: i64 = simd_extract!(val, LANE as u32); + let lane: i64 = vgetq_lane_s64::(val); (*atomic_dst).store(transmute(lane), crate::sync::atomic::Ordering::Release) } #[doc = "Subtract"] @@ -27245,7 +27197,7 @@ pub fn vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t { #[cfg_attr(test, assert_instr(suqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vuqaddb_s8(a: i8, b: u8) -> i8 { - unsafe { simd_extract!(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b)), 0) } + vget_lane_s8::<0>(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b))) } #[doc = "Signed saturating accumulate of unsigned value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddh_s16)"] @@ -27254,7 +27206,7 @@ pub fn vuqaddb_s8(a: i8, b: u8) -> i8 { #[cfg_attr(test, assert_instr(suqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vuqaddh_s16(a: i16, b: u16) -> i16 { - unsafe { simd_extract!(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b)), 0) } + vget_lane_s16::<0>(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b))) } #[doc = "Signed saturating accumulate of unsigned value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vuqaddd_s64)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index f24a12ad40..1ae7f3f3b0 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -1659,7 +1659,7 @@ pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub fn vabsh_f16(a: f16) -> f16 { - unsafe { simd_extract!(vabs_f16(vdup_n_f16(a)), 0) } + vget_lane_f16::<0>(vabs_f16(vdup_n_f16(a))) } #[doc = "Floating-point Add (vector)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vadd_f16)"] @@ -10945,7 +10945,7 @@ pub fn vdupq_laneq_u8(a: uint8x16_t) -> uint8x16_t { )] pub fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t { static_assert_uimm_bits!(N, 1); - unsafe { transmute::(simd_extract!(a, N as u32)) } + unsafe { transmute(vgetq_lane_s64::(a)) } } #[doc = "Set all vector lanes to the same value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u64)"] @@ -10968,7 +10968,7 @@ pub fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t { )] pub fn vdup_laneq_u64(a: uint64x2_t) -> uint64x1_t { static_assert_uimm_bits!(N, 1); - unsafe { transmute::(simd_extract!(a, N as u32)) } + unsafe { transmute(vgetq_lane_u64::(a)) } } #[doc = "Create a new vector with all lanes set to a value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_n_f16)"] @@ -35281,7 +35281,7 @@ pub fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { )] pub fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } + vqdmulh_s16(a, vdup_n_s16(vgetq_lane_s16::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_laneq_s16)"] @@ -35304,7 +35304,7 @@ pub fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4 )] pub fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } + vqdmulhq_s16(a, vdupq_n_s16(vgetq_lane_s16::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_laneq_s32)"] @@ -35327,7 +35327,7 @@ pub fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x )] pub fn vqdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } + vqdmulh_s32(a, vdup_n_s32(vgetq_lane_s32::(b))) } #[doc = "Vector saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulhq_laneq_s32)"] @@ -35350,7 +35350,7 @@ pub fn vqdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2 )] pub fn vqdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } + vqdmulhq_s32(a, vdupq_n_s32(vgetq_lane_s32::(b))) } #[doc = "Vector saturating doubling multiply high with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmulh_n_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 6950f69731..75e7e88362 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -182,12 +182,12 @@ intrinsics: - ['d_f64', 'f64'] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vabd_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - 0 + - - 0 - name: "vabd{type[0]}" doc: "Floating-point absolute difference" @@ -203,12 +203,12 @@ intrinsics: - ['h_f16', 'f16'] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vabd_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - 0 + - - 0 - name: "vabdl_high{neon_type[0].noq}" doc: Signed Absolute difference Long @@ -375,12 +375,12 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vceq_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - 0 - name: "vceq{type[0]}" @@ -397,12 +397,12 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vceq_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - 0 - name: "vceqd_{type[0]}" doc: "Compare bitwise equal" @@ -491,12 +491,12 @@ intrinsics: - ["h_s16", "i16", "u16", "s16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[3]}' - - FnCall: - "vuqadd_{type[3]}" - - FnCall: ["vdup_n_{type[3]}", [a]] - FnCall: ["vdup_n_{type[2]}", [b]] - - '0' + - - '0' - name: "vabs{neon_type.no}" doc: "Floating-point absolute value" @@ -567,12 +567,12 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - 'simd_extract!' + - 'vget_lane_{type[2]}' - - FnCall: - "vcgt_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vcgt{type[0]}" @@ -589,12 +589,12 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - 'simd_extract!' + - 'vget_lane_{type[2]}' - - FnCall: - "vcgt_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vclt{neon_type[0].no}" doc: "Compare signed less than" @@ -651,12 +651,12 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcle_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vcle{type[0]}" @@ -673,12 +673,12 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcle_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vcge{neon_type[0].no}" doc: "Compare signed greater than or equal" @@ -789,11 +789,11 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vclez_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vclez{type[0]}" doc: "Floating-point compare less than or equal to zero" @@ -809,11 +809,11 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vclez_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcltz{neon_type[0].no}" doc: "Compare signed less than zero" @@ -872,11 +872,11 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcltz_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcltz{type[0]}" doc: "Floating-point compare less than zero" @@ -892,11 +892,11 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcltz_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcltzd_s64" doc: "Compare less than zero" @@ -1612,11 +1612,11 @@ intrinsics: - ["f64", "f32"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - vcvtx_f32_f64 - - FnCall: [vdupq_n_f64, [a]] - - '0' + - - '0' - name: "vcvtx_high_f32_f64" doc: "Floating-point convert to lower precision narrow, rounding to odd" @@ -2521,7 +2521,7 @@ intrinsics: - [float64x1_t, "f64"] compose: - FnCall: [static_assert!, ['N == 0']] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup_laneq_{neon_type[0]}" doc: "Set all vector lanes to the same value" @@ -2539,8 +2539,8 @@ intrinsics: compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - FnCall: - - "transmute::<{type[2]}, _>" - - - FnCall: [simd_extract!, [a, 'N as u32']] + - transmute + - - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" doc: "Set all vector lanes to the same value" @@ -2561,7 +2561,7 @@ intrinsics: - [float64x2_t, "f64", d_laneq_f64] compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" doc: "Set all vector lanes to the same value" @@ -2582,7 +2582,7 @@ intrinsics: - [poly16x8_t, "p16", h_laneq_p16] compose: - FnCall: [static_assert_uimm_bits!, [N, 3]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" @@ -2601,7 +2601,7 @@ intrinsics: - [float16x4_t, "f16", h_lane_f16] compose: - FnCall: [static_assert_uimm_bits!, [N, 2]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" @@ -2620,7 +2620,7 @@ intrinsics: - [float16x8_t, "f16", h_laneq_f16] compose: - FnCall: [static_assert_uimm_bits!, [N, 4]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" @@ -2639,7 +2639,7 @@ intrinsics: - [poly8x16_t, "p8", b_laneq_p8] compose: - FnCall: [static_assert_uimm_bits!, [N, 4]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vdup{type[2]}" doc: "Set all vector lanes to the same value" @@ -2660,7 +2660,7 @@ intrinsics: - [float32x4_t, "f32", s_laneq_f32] compose: - FnCall: [static_assert_uimm_bits!, [N, 2]] - - FnCall: [simd_extract!, [a, 'N as u32']] + - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - name: "vext{neon_type[0].no}" doc: "Extract vector from pair of vectors" @@ -2877,11 +2877,11 @@ intrinsics: - [i64, 'd_s64', 's64'] compose: - FnCall: - - 'simd_extract!' + - 'vget_lane_{type[2]}' - - FnCall: - 'vqneg_{type[2]}' - - FnCall: ['vdup_n_{type[2]}', [a]] - - 0 + - - 0 - name: "vqneg{neon_type[0].no}" doc: Signed saturating negate @@ -2958,12 +2958,12 @@ intrinsics: - "vdup_n_{type[2]}" - - b - FnCall: - - 'simd_extract!' + - 'vget_lane_{type[2]}' - - FnCall: - "vqsub_{type[2]}" - - a - b - - "0" + - - "0" - name: "vqsub{type[3]}" doc: Saturating subtract @@ -2989,12 +2989,12 @@ intrinsics: - "vdup_n_{type[2]}" - - b - FnCall: - - 'simd_extract!' + - 'vget_lane_{type[2]}' - - FnCall: - "vqsub_{type[2]}" - - a - b - - "0" + - - "0" - name: "vrbit{neon_type.no}" doc: Reverse bit order @@ -3443,12 +3443,12 @@ intrinsics: - "vdup_n_{type[0]}" - - b - FnCall: - - simd_extract! + - 'vget_lane_{type[0]}' - - FnCall: - "vqadd_{type[0]}" - - a - b - - "0" + - - "0" - name: "vqadd{type[2]}" doc: Saturating add @@ -3474,12 +3474,12 @@ intrinsics: - "vdup_n_{type[0]}" - - b - FnCall: - - simd_extract! + - 'vget_lane_{type[0]}' - - FnCall: - "vqadd_{type[0]}" - - a - b - - "0" + - - "0" - name: "vld1{neon_type[1].no}" doc: "Load multiple single-element structures to one, two, three, or four registers" @@ -4484,7 +4484,7 @@ intrinsics: - Let: - "lane" - i64 - - FnCall: [simd_extract!, [val, 'LANE as u32']] + - FnCall: ['vget{neon_type[1].lane_nox}', [val], [LANE]] - MethodCall: - "(*atomic_dst)" - store @@ -5254,8 +5254,8 @@ intrinsics: compose: - FnCall: - "vmull_{neon_type[0]}" - - - FnCall: [simd_extract!, [a, '1']] - - FnCall: [simd_extract!, [b, '1']] + - - FnCall: ['vget{neon_type[0].lane_nox}', [a], [1]] + - FnCall: ['vget{neon_type[0].lane_nox}', [b], [1]] - name: "vmulx{neon_type.no}" doc: Floating-point multiply extended @@ -5354,11 +5354,8 @@ intrinsics: - vmulx_f64 - - a - FnCall: - - 'transmute::' - - - FnCall: - - "simd_extract!" - - - b - - 'LANE as u32' + - 'transmute' + - - FnCall: ['vget{neon_type.lane_nox}', [b], [LANE]] - name: "vmulx{type[0]}" doc: Floating-point multiply extended @@ -5394,16 +5391,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - ["d_lane_f64", "f64", float64x1_t, "d_f64", 'LANE as u32'] + - ["d_lane_f64", "f64", float64x1_t, "d_f64"] compose: - FnCall: [static_assert!, ['LANE == 0']] - FnCall: - "vmulx{type[3]}" - - a - - FnCall: - - "simd_extract!" - - - b - - "{type[4]}" + - FnCall: ['vget{neon_type[2].lane_nox}', [b], [LANE]] - name: "vmulx_laneq_f64" doc: Floating-point multiply extended @@ -5423,11 +5417,8 @@ intrinsics: - vmulx_f64 - - a - FnCall: - - 'transmute::' - - - FnCall: - - "simd_extract!" - - - b - - 'LANE as u32' + - 'transmute' + - - FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]] - name: "vmulx{type[0]}" doc: Floating-point multiply extended @@ -5497,18 +5488,15 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - ['s_lane_f32', f32, float32x2_t, '1', 's_f32', 'LANE as u32'] - - ['s_laneq_f32', f32, float32x4_t, '2', 's_f32', 'LANE as u32'] - - ['d_laneq_f64', f64, float64x2_t, '1', 'd_f64', 'LANE as u32'] + - ['s_lane_f32', f32, float32x2_t, '1', 's_f32'] + - ['s_laneq_f32', f32, float32x4_t, '2', 's_f32'] + - ['d_laneq_f64', f64, float64x2_t, '1', 'd_f64'] compose: - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] - FnCall: - "vmulx{type[4]}" - - a - - FnCall: - - "simd_extract!" - - - b - - "{type[5]}" + - FnCall: ['vget{neon_type[2].lane_nox}', [b], [LANE]] - name: "vmulx{type[0]}" @@ -5531,10 +5519,7 @@ intrinsics: - FnCall: - "vmulx{type[4]}" - - a - - FnCall: - - "simd_extract!" - - - b - - "{type[5]}" + - FnCall: ['vget{neon_type[2].lane_nox}', [b], [LANE]] - name: "vmulx{neon_type[0].N}" @@ -7084,11 +7069,11 @@ intrinsics: - Let: - a1 - "{type[2]}" - - FnCall: [simd_extract!, [a, '0']] + - FnCall: ['vget{neon_type[1].lane_nox}', [a], [0]] - Let: - a2 - "{type[2]}" - - FnCall: [simd_extract!, [a, '1']] + - FnCall: ['vget{neon_type[1].lane_nox}', [a], [1]] - Identifier: ['a1 + a2', Symbol] - name: "vpmin{type[0]}" @@ -7121,7 +7106,7 @@ intrinsics: compose: - Let: [a, int16x4_t, {FnCall: [vdup_n_s16, [a]]}] - Let: [b, int16x4_t, {FnCall: [vdup_n_s16, [b]]}] - - FnCall: [simd_extract!, [{FnCall: [vqdmull_s16, [a, b]]}, '0']] + - FnCall: ['vgetq_lane_{type[1]}', [{FnCall: [vqdmull_s16, [a, b]]}], ['0']] - name: "vqdmulls_s32" doc: "Signed saturating doubling multiply long" @@ -7187,7 +7172,7 @@ intrinsics: - ["i32", int32x4_t, "i64", 's_laneq_s32', 's_s32'] compose: - FnCall: [static_assert_uimm_bits!, [N, 2]] - - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - Let: [b, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ["vqdmull{type[4]}", [a, b]] - name: "vqdmullh_laneq_s16" @@ -7204,7 +7189,7 @@ intrinsics: - ["i16", int16x8_t, "i32"] compose: - FnCall: [static_assert_uimm_bits!, [N, 3]] - - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - Let: [b, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ["vqdmullh_s16", [a, b]] - name: "vqdmulls_lane_s32" @@ -7221,7 +7206,7 @@ intrinsics: - ["i32", int32x2_t, "i64"] compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - - Let: [b, "{type[0]}", {FnCall: [simd_extract!, [b, 'N as u32']]}] + - Let: [b, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ["vqdmulls_s32", [a, b]] - name: "vqdmull{type[6]}" @@ -7360,7 +7345,7 @@ intrinsics: - ["i32", "i16", "s16"] compose: - Let: [x, int32x4_t, {FnCall: [vqdmull_s16, [{FnCall: [vdup_n_s16, [b]]}, {FnCall: [vdup_n_s16, [c]]}]]}] - - FnCall: [vqadds_s32, [a, {FnCall: [simd_extract!, [x, 0]]}]] + - FnCall: [vqadds_s32, [a, {FnCall: ['vgetq_lane_s32', [x], [0]]}]] - name: "vqdmlals_s32" doc: "Signed saturating doubling multiply-add long" @@ -7393,7 +7378,7 @@ intrinsics: - ["i64", "i32", int32x4_t, "i64", s_laneq_s32, '2', s_s32] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] - - FnCall: ["vqdmlal{type[6]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vqdmlal{type[6]}", [a, b, {FnCall: ['vget{neon_type[2].lane_nox}', [c], [LANE]]}]] - name: "vqdmlal_laneq_s16" doc: "Vector widening saturating doubling multiply accumulate with scalar" @@ -7474,7 +7459,7 @@ intrinsics: - ["i32", "i16"] compose: - Let: [x, int32x4_t, {FnCall: [vqdmull_s16, [{FnCall: [vdup_n_s16, [b]]}, {FnCall: [vdup_n_s16, [c]]}]]}] - - FnCall: [vqsubs_s32, [a, {FnCall: [simd_extract!, [x, '0']]}]] + - FnCall: [vqsubs_s32, [a, {FnCall: ['vgetq_lane_s32', [x], [0]]}]] - name: "vqdmlsls_s32" doc: "Signed saturating doubling multiply-subtract long" @@ -7507,7 +7492,7 @@ intrinsics: - ["i64", "i32", int32x4_t, "i64", 's_laneq_s32', '2', 's_s32'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[5]}"]] - - FnCall: ["vqdmlsl{type[6]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vqdmlsl{type[6]}", [a, b, {FnCall: ['vget{neon_type[2].lane_nox}', [c], [LANE]]}]] - name: "vqdmlsl_laneq_s16" doc: "Vector widening saturating doubling multiply subtract with scalar" @@ -7555,7 +7540,7 @@ intrinsics: compose: - Let: [a, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[3].no}", [a]]}] - Let: [b, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[3].no}", [b]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqdmulh{neon_type[3].no}", [a, b]]}, '0']] + - FnCall: ['vget{neon_type[3].lane_nox}', [{FnCall: ["vqdmulh{neon_type[3].no}", [a, b]]}], ['0']] - name: "vqdmulhh{type[3]}" doc: "Signed saturating doubling multiply returning high half" @@ -7572,7 +7557,7 @@ intrinsics: - ["i16", int16x8_t, "i16", '_laneq_s16', '3'] compose: - FnCall: [static_assert_uimm_bits!, [N, "{type[4]}"]] - - Let: [b, 'i16', {FnCall: [simd_extract!, [b, 'N as u32']]}] + - Let: [b, 'i16', {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ['vqdmulhh_s16', [a, b]] - name: "vqdmulhs{type[3]}" @@ -7590,7 +7575,7 @@ intrinsics: - ["i32", int32x4_t, "i32", "_laneq_s32", '2'] compose: - FnCall: [static_assert_uimm_bits!, [N, "{type[4]}"]] - - Let: [b, 'i32', {FnCall: [simd_extract!, [b, 'N as u32']]}] + - Let: [b, 'i32', {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ['vqdmulhs_s32', [a, b]] - name: "vqmovn_high{neon_type[1].noq}" @@ -7635,7 +7620,7 @@ intrinsics: - ["i16", "i8", 'h_s16', s16] - ["i32", "i16", 's_s32', s32] compose: - - FnCall: [simd_extract!, [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + - FnCall: ['vget_lane_{type[1]}', [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}], ['0']] - name: "vqmovn{type[2]}" doc: "Saturating extract narrow" @@ -7649,7 +7634,7 @@ intrinsics: - ["u16", "u8", 'h_u16', 'u16'] - ["u32", "u16", 's_u32', 'u32'] compose: - - FnCall: [simd_extract!, [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + - FnCall: ['vget_lane_{type[1]}', [{FnCall: ["vqmovn_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}], ['0']] - name: "vqmovnd_s64" doc: "Saturating extract narrow" @@ -7698,7 +7683,7 @@ intrinsics: - ["i32", "u16", 's_s32', s32] - ["i64", "u32", 'd_s64', s64] compose: - - FnCall: [simd_extract!, [{FnCall: ["vqmovun_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}, '0']] + - FnCall: ['vget_lane_{type[1]}', [{FnCall: ["vqmovun_{type[3]}", [{FnCall: ["vdupq_n_{type[3]}", [a]]}]]}], ['0']] - name: "vqmovun_high_{neon_type[1]}" doc: "Signed saturating extract unsigned narrow" @@ -7727,7 +7712,7 @@ intrinsics: - ["i16", 'h_s16', 's16'] - ["i32", 's_s32', 's32'] compose: - - FnCall: [simd_extract!, [{FnCall: ["vqrdmulh_{type[2]}", [{FnCall: ["vdup_n_{type[2]}", [a]]}, {FnCall: ["vdup_n_{type[2]}", [b]]}]]}, '0']] + - FnCall: ['vget_lane_{type[2]}', [{FnCall: ["vqrdmulh_{type[2]}", [{FnCall: ["vdup_n_{type[2]}", [a]]}, {FnCall: ["vdup_n_{type[2]}", [b]]}]]}], ['0']] - name: "vqrdmulh{type[2]}" doc: "Signed saturating rounding doubling multiply returning high half" @@ -7746,7 +7731,7 @@ intrinsics: - ["i32", int32x4_t, 's_laneq_s32', 's_s32', '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] - - FnCall: ["vqrdmulh{type[3]}", [a, {FnCall: [simd_extract!, [b, 'LANE as u32']]}]] + - FnCall: ["vqrdmulh{type[3]}", [a, {FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]]}]] - name: "vqrdmlah{neon_type.no}" doc: "Signed saturating rounding doubling multiply accumulate returning high half" @@ -7785,7 +7770,7 @@ intrinsics: - Let: [a, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [a]]}] - Let: [b, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [b]]}] - Let: [c, "{neon_type[1]}", {FnCall: ["vdup_n_{type[2]}", [c]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrdmlah_{type[2]}", [a, b, c]]}, '0']] + - FnCall: ['vget_lane_{type[2]}', [{FnCall: ["vqrdmlah_{type[2]}", [a, b, c]]}], ['0']] - name: "vqrdmlah{type[0]}" doc: "Signed saturating rounding doubling multiply accumulate returning high half" @@ -7830,7 +7815,7 @@ intrinsics: - ["i32", int32x4_t, '2', "s_s32", s_laneq_s32, s_s32] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] - - FnCall: ["vqrdmlah{type[5]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vqrdmlah{type[5]}", [a, b, {FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vqrdmlsh{neon_type.no}" doc: "Signed saturating rounding doubling multiply subtract returning high half" @@ -7869,7 +7854,7 @@ intrinsics: - Let: [a, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [a]]}] - Let: [b, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [b]]}] - Let: [c, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [c]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrdmlsh_{type[3]}", [a, b, c]]}, '0']] + - FnCall: ['vget{neon_type[2].lane_nox}', [{FnCall: ["vqrdmlsh_{type[3]}", [a, b, c]]}], ['0']] - name: "vqrdmlsh{type[0]}" doc: "Signed saturating rounding doubling multiply subtract returning high half" @@ -7914,7 +7899,7 @@ intrinsics: - ["i32", int32x4_t, '2', s_laneq_s32, s_s32] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[2]}"]] - - FnCall: ["vqrdmlsh{type[4]}", [a, b, {FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vqrdmlsh{type[4]}", [a, b, {FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vqrshl{type[0]}" doc: "Signed saturating rounding shift left" @@ -7948,7 +7933,7 @@ intrinsics: compose: - Let: [a, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [a]]}] - Let: [b, "{neon_type[2]}", {FnCall: ["vdup_n_{type[3]}", [b]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrshl_{type[3]}", [a, b]]}, '0']] + - FnCall: ['vget{neon_type[2].lane_nox}', [{FnCall: ["vqrshl_{type[3]}", [a, b]]}], ['0']] - name: "vqrshl{type[2]}" doc: "Unsigned signed saturating rounding shift left" @@ -7982,7 +7967,7 @@ intrinsics: compose: - Let: [a, "{neon_type[3]}", {FnCall: ["vdup_n_{type[0]}", [a]]}] - Let: [b, "{neon_type[4]}", {FnCall: ["vdup_n_{type[5]}", [b]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrshl_{type[0]}", [a, b]]}, '0']] + - FnCall: ['vget{neon_type[3].lane_nox}', [{FnCall: ["vqrshl_{type[0]}", [a, b]]}], ['0']] - name: "vqrshrn{type[2]}" doc: "Signed saturating rounded shift right narrow" @@ -8001,7 +7986,7 @@ intrinsics: compose: - FnCall: [static_assert!, ["{type[3]}"]] - Let: [a, "{neon_type[4]}", {FnCall: ["vdup{type[5]}", [a]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrshrn_n{neon_type[4].noq}::", [a]]}, '0']] + - FnCall: ['vget_lane_{type[1]}', [{FnCall: ["vqrshrn_n{neon_type[4].noq}::", [a]]}], ['0']] - name: "vqrshrn{type[3]}" doc: "Signed saturating rounded shift right narrow" @@ -8038,7 +8023,7 @@ intrinsics: compose: - FnCall: [static_assert!, ['{type[3]}']] - Let: [a, "{neon_type[4]}", {FnCall: ["vdup{type[5]}", [a]]}] - - FnCall: [simd_extract!, [{FnCall: ["vqrshrn{type[6]}::", [a]]}, '0']] + - FnCall: ['vget_lane_{type[2]}', [{FnCall: ["vqrshrn{type[6]}::", [a]]}], ['0']] - name: "vqrshrn_high_n{neon_type[1].noq}" doc: "Unsigned saturating rounded shift right narrow" @@ -8085,11 +8070,11 @@ intrinsics: - "{neon_type[4]}" - FnCall: ["vdupq_n_{type[5]}", [a]] - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vqrshrun_n_{type[5]}::" - - a - - '0' + - - '0' - name: "vqrshrun_high_n{neon_type[1].noq}" doc: "Signed saturating rounded shift right unsigned narrow" @@ -8152,7 +8137,7 @@ intrinsics: - "vqshl{neon_type[2].noq}" - - FnCall: ["vdup_n{neon_type[2].no}", [a]] - FnCall: ["vdup_n{neon_type[2].no}", [b]] - - FnCall: [simd_extract!, [c, '0']] + - FnCall: ['vget{neon_type[2].lane_nox}', [c], ['0']] - name: "vqshl{type[0]}" doc: "Signed saturating shift left" @@ -8172,11 +8157,11 @@ intrinsics: compose: - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vqshl_n_{type[3]}::" - - FnCall: ["vdup_n_{type[3]}", [a]] - - '0' + - - '0' - name: "vqshld_{type[0]}" doc: "Unsigned saturating shift left" @@ -8215,7 +8200,7 @@ intrinsics: - "vqshl{neon_type[3].noq}" - - FnCall: ["vdup{neon_type[3].N}", [a]] - FnCall: ["vdup{neon_type[4].N}", [b]] - - FnCall: [simd_extract!, [c, '0']] + - FnCall: ['vget{neon_type[3].lane_nox}', [c], ['0']] - name: "vqshl{type[0]}" doc: "Unsigned saturating shift left" @@ -8235,9 +8220,9 @@ intrinsics: compose: - FnCall: [static_assert_uimm_bits!, [N, "{type[2]}"]] - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: ["vqshl_n_{type[1]}::", [{FnCall: ["vdup_n_{type[1]}", [a]]}]] - - '0' + - - '0' - name: "vqshrnd_n_s64" doc: "Signed saturating shift right narrow" @@ -8279,11 +8264,11 @@ intrinsics: compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vqshrn_n_{type[4]}::" - - FnCall: ["vdupq_n_{type[4]}", [a]] - - '0' + - - '0' - name: "vqshrn{type[0]}" doc: "Signed saturating shift right narrow" @@ -8347,11 +8332,11 @@ intrinsics: compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - "simd_extract!" + - 'vget_lane_{type[2]}' - - FnCall: - "vqshrn_n_{type[1]}::" - - FnCall: ["vdupq_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vqshrn{type[0]}" doc: "Unsigned saturating shift right narrow" @@ -8392,11 +8377,11 @@ intrinsics: compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vqshrun_n_{type[4]}::" - - FnCall: ["vdupq_n_{type[4]}", [a]] - - '0' + - - '0' - name: "vqshrun_high_n_{neon_type[1]}" doc: "Signed saturating shift right unsigned narrow" @@ -8433,12 +8418,12 @@ intrinsics: - [h_u16, "u16", "i16", s16] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vsqadd_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[2]}", [b]] - - '0' + - - '0' - name: "vsqadd{type[0]}" doc: "Unsigned saturating accumulate of signed value" @@ -9026,14 +9011,14 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [float64x2_t, float64x1_t, '0', '1', 'transmute::(simd_extract!(b, LANE2 as u32))'] - - [poly64x2_t, poly64x1_t, '0', '1', 'transmute::(simd_extract!(b, LANE2 as u32))'] - - [uint64x2_t, uint64x1_t, '0', '1', 'transmute::(simd_extract!(b, LANE2 as u32))'] - - [int64x2_t, int64x1_t, '0', '1', 'transmute::(simd_extract!(b, LANE2 as u32))'] + - [float64x2_t, float64x1_t, '0', '1'] + - [poly64x2_t, poly64x1_t, '0', '1'] + - [uint64x2_t, uint64x1_t, '0', '1'] + - [int64x2_t, int64x1_t, '0', '1'] compose: - FnCall: [static_assert!, ['LANE1 == {type[2]}']] - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[3]}']] - - Identifier: ["{type[4]}", UnsafeSymbol] + - FnCall: [transmute, [{FnCall: ['vget{neon_type[0].lane_nox}', [b], [LANE2]]}]] - name: "vcopy{neon_type[0].laneq_nox}" doc: "Insert vector element from another vector element" @@ -9566,7 +9551,7 @@ intrinsics: - transmute - - FnCall: - _vrnd32x_f64 - - - FnCall: [simd_extract!, [a, 0]] + - - FnCall: ['vget{neon_type.lane_nox}', [a], [0]] - name: "vrnd32z{neon_type.no}" doc: "Floating-point round to 32-bit integer toward zero" @@ -9610,7 +9595,7 @@ intrinsics: arch: aarch64,arm64ec - FnCall: - transmute - - - FnCall: [_vrnd32z_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + - - FnCall: [_vrnd32z_f64, [{FnCall: ['vget{neon_type.lane_nox}', [a], [0]]}]] - name: "vrnd64x{neon_type.no}" doc: "Floating-point round to 64-bit integer, using current rounding mode" @@ -9654,7 +9639,7 @@ intrinsics: arch: aarch64,arm64ec - FnCall: - transmute - - - FnCall: [_vrnd64x_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + - - FnCall: [_vrnd64x_f64, [{FnCall: ['vget{neon_type.lane_nox}', [a], [0]]}]] - name: "vrnd64z{neon_type.no}" doc: "Floating-point round to 64-bit integer toward zero" @@ -9698,7 +9683,7 @@ intrinsics: arch: aarch64,arm64ec - FnCall: - transmute - - - FnCall: [_vrnd64z_f64, [{FnCall: [simd_extract!, [a, 0]]}]] + - - FnCall: [_vrnd64z_f64, [{FnCall: ['vget{neon_type.lane_nox}', [a], [0]]}]] - name: "vtrn1{neon_type[0].no}" doc: Transpose vectors @@ -10323,7 +10308,7 @@ intrinsics: - "vfma{neon_type[0].no}" - - a - b - - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vfma{type[3]}" @@ -10349,7 +10334,7 @@ intrinsics: - "vfma{neon_type[0].no}" - - a - b - - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] # vfms lane f16 @@ -10376,7 +10361,7 @@ intrinsics: - "vfms{neon_type[0].no}" - - a - b - - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vfms{type[1]}" @@ -10413,7 +10398,7 @@ intrinsics: - "vfma{neon_type.no}" - - a - b - - FnCall: ["vdup{neon_type.N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type.N}", [{FnCall: ['vget{neon_type.lane_nox}', [c], [LANE]]}]] - name: "vfma_laneq_f64" doc: "Floating-point fused multiply-add to accumulator" @@ -10433,7 +10418,7 @@ intrinsics: - "vfma{neon_type[0].no}" - - a - b - - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vfmaq_lane_f64" doc: "Floating-point fused multiply-add to accumulator" @@ -10453,7 +10438,7 @@ intrinsics: - "vfma{neon_type[0].no}" - - a - b - - FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]] + - FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]] - name: "vfma{type[2]}" doc: "Floating-point fused multiply-add to accumulator" @@ -10471,7 +10456,7 @@ intrinsics: - ["f64", float64x2_t, "d_laneq_f64", '1'] compose: - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] - - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [c, 'LANE as u32']]}] + - Let: [c, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}] - FnCall: ["fma{type[0]}", [b, c, a]] - name: "vfmad_lane_f64" @@ -10488,7 +10473,7 @@ intrinsics: - ["f64", float64x1_t] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [c, 'LANE as u32']]}] + - Let: [c, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}] - FnCall: [fmaf64, [b, c, a]] @@ -10525,7 +10510,7 @@ intrinsics: - ["f16", float16x8_t, 'q_f16', '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [v, 'LANE as u32']]}] + - Let: [c, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [v], [LANE]]}] - FnCall: ["vfmah_{type[0]}", [a, b, c]] - name: "vfmsh_lane{type[2]}" @@ -10545,7 +10530,7 @@ intrinsics: - ["f16", float16x8_t, 'q_f16', '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - - Let: [c, "{type[0]}", {FnCall: [simd_extract!, [v, 'LANE as u32']]}] + - Let: [c, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [v], [LANE]]}] - FnCall: ["vfmsh_{type[0]}", [a, b, c]] - name: "vfms_f64" @@ -10608,7 +10593,7 @@ intrinsics: - [float64x2_t, float64x2_t, '1', q_laneq_f64] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[2]}']] - - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]]}]] - name: "vfms_lane_f64" doc: "Floating-point fused multiply-subtract to accumulator" @@ -10624,7 +10609,7 @@ intrinsics: - float64x1_t compose: - FnCall: [static_assert!, ['LANE == 0']] - - FnCall: ["vfms{neon_type.no}", [a, b, {FnCall: ["vdup{neon_type.N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + - FnCall: ["vfms{neon_type.no}", [a, b, {FnCall: ["vdup{neon_type.N}", [{FnCall: ['vget{neon_type.lane_nox}', [c], [LANE]]}]]}]] - name: "vfms_laneq_f64" doc: "Floating-point fused multiply-subtract to accumulator" @@ -10640,7 +10625,7 @@ intrinsics: - [float64x1_t, float64x2_t] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '1']] - - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]]}]] - name: "vfmsq_lane_f64" doc: "Floating-point fused multiply-subtract to accumulator" @@ -10656,7 +10641,7 @@ intrinsics: - [float64x2_t, float64x1_t] compose: - FnCall: [static_assert!, ['LANE == 0']] - - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: [simd_extract!, [c, 'LANE as u32']]}]]}]] + - FnCall: ["vfms{neon_type[0].no}", [a, b, {FnCall: ["vdup{neon_type[0].N}", [{FnCall: ['vget{neon_type[1].lane_nox}', [c], [LANE]]}]]}]] - name: "vfms{type[2]}" doc: "Floating-point fused multiply-subtract to accumulator" @@ -10724,11 +10709,11 @@ intrinsics: - ["f64", "u64", "d_f64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vceqz_{type[0]}" - - FnCall: ["vdup_n_{type[0]}", [a]] - - '0' + - - '0' - name: "vceqz{type[2]}" doc: "Floating-point compare bitwise equal to zero" @@ -10744,11 +10729,11 @@ intrinsics: - ["f16", "u16", "h_f16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vceqz_{type[0]}" - - FnCall: ["vdup_n_{type[0]}", [a]] - - '0' + - - '0' - name: "vceqzd_{type[2]}" doc: "Compare bitwise equal to zero" @@ -10847,12 +10832,12 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcge_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vcge{type[0]}" @@ -10869,12 +10854,12 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcge_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - FnCall: ["vdup_n_{type[1]}", [b]] - - '0' + - - '0' - name: "vcge{neon_type[0].no}" doc: "Floating-point compare greater than or equal" @@ -10991,11 +10976,11 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcgez_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcgez{type[0]}" @@ -11012,11 +10997,11 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vcgez_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vclezd_s64" doc: "Compare less than or equal to zero" @@ -11123,11 +11108,11 @@ intrinsics: - ["d_f64", "f64", "u64"] compose: - FnCall: - - "simd_extract!" + - 'vget_lane_{type[2]}' - - FnCall: - "vcgtz_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcgtz{type[0]}" doc: "Floating-point compare greater than zero" @@ -11143,11 +11128,11 @@ intrinsics: - ["h_f16", "f16", "u16"] compose: - FnCall: - - "simd_extract!" + - 'vget_lane_{type[2]}' - - FnCall: - "vcgtz_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - '0' + - - '0' - name: "vcvt{neon_type[1].no}_{neon_type[0]}" doc: "Floating-point convert to unsigned fixed-point, rounding toward zero" @@ -11203,7 +11188,7 @@ intrinsics: - - a - FnCall: - "transmute::" - - - FnCall: [simd_extract!, [b, 'LANE as u32']] + - - FnCall: ['vget{neon_type.lane_nox}', [b], [LANE]] - name: "vmulq_lane_f64" doc: "Floating-point multiply" @@ -11238,7 +11223,7 @@ intrinsics: - ["f64", float64x1_t] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Let: [b, '{type[0]}', {FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]]}] - Identifier: ['a * b', Symbol] - name: "vmul_laneq_f64" @@ -11260,7 +11245,7 @@ intrinsics: - - a - FnCall: - "transmute::" - - - FnCall: [simd_extract!, [b, 'LANE as u32']] + - - FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]] - name: "vmulq_laneq_f64" doc: "Floating-point multiply" @@ -11338,7 +11323,7 @@ intrinsics: - ["f64", float64x2_t, "d_laneq_f64", '1'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Let: [b, '{type[0]}', {FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]]}] - Identifier: ['a * b', Symbol] @@ -11359,7 +11344,7 @@ intrinsics: - ["f16", float16x8_t, "h_laneq_f16", '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - - Let: [b, '{type[0]}', {FnCall: [simd_extract!, [b, 'LANE as u32']]}] + - Let: [b, '{type[0]}', {FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]]}] - Identifier: ['a * b', Symbol] @@ -11481,12 +11466,12 @@ intrinsics: - ["f64", "u64", 'd_f64'] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vclt_{type[0]}" - - FnCall: ["vdup_n_{type[0]}", [a]] - FnCall: ["vdup_n_{type[0]}", [b]] - - '0' + - - '0' - name: "vclt{type[2]}" @@ -11503,12 +11488,12 @@ intrinsics: - ["f16", "u16", 'h_f16'] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vclt_{type[0]}" - - FnCall: ["vdup_n_{type[0]}", [a]] - FnCall: ["vdup_n_{type[0]}", [b]] - - '0' + - - '0' - name: "vabdl_high{neon_type[0].noq}" doc: Unsigned Absolute difference Long @@ -11748,7 +11733,7 @@ intrinsics: - - a - FnCall: - "vdup{neon_type[0].N}" - - - FnCall: [simd_extract!, [b, 'LANE as u32']] + - - FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]] - name: "vqabs{type[2]}" doc: "Signed saturating absolute value" @@ -11763,9 +11748,9 @@ intrinsics: - ["i16", "s16", 'h_s16'] compose: - FnCall: - - "simd_extract!" + - 'vget_lane_{type[0]}' - - FnCall: ["vqabs_{type[1]}", [{FnCall: ["vdup_n_{type[1]}", [a]]}]] - - '0' + - - '0' - name: "vqabs{type[1]}" doc: "Signed saturating absolute value" @@ -11957,11 +11942,11 @@ intrinsics: compose: - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]] - FnCall: - - simd_extract! + - 'vget_lane_{type[2]}' - - FnCall: - "vqshlu_n_{type[4]}::" - - FnCall: ["vdup_n_{type[4]}", [a]] - - '0' + - - '0' - name: "vcvta{neon_type[1].no}_{neon_type[0]}" doc: "Floating-point convert to unsigned integer, rounding to nearest with ties to away" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 68c92aa54d..08a80c3719 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -506,11 +506,11 @@ intrinsics: - ['h_f16', 'f16'] compose: - FnCall: - - simd_extract! + - 'vget_lane_{type[1]}' - - FnCall: - "vabs_{type[1]}" - - FnCall: ["vdup_n_{type[1]}", [a]] - - 0 + - - 0 - name: "vcgt{neon_type[0].no}" doc: "Compare signed greater than" @@ -1716,8 +1716,8 @@ intrinsics: compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - FnCall: - - "transmute{type[3]}" - - - FnCall: [simd_extract!, [a, 'N as u32']] + - "transmute" + - - FnCall: ['vget{neon_type[1].lane_nox}', [a], [N]] - name: "vext{neon_type[0].no}" doc: "Extract vector from pair of vectors" @@ -11461,7 +11461,7 @@ intrinsics: - - a - FnCall: - "vdup{neon_type[0].N}" - - - FnCall: [simd_extract!, [b, 'LANE as u32']] + - - FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE]] - name: "vrecpe{neon_type.no}" doc: "Unsigned reciprocal estimate" From aa9ed50428ce6bd90efafdc3911bef0830c63513 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 03:48:26 +0530 Subject: [PATCH 02/19] Change implementation of `vext` --- .../core_arch/src/aarch64/neon/generated.rs | 16 +- crates/core_arch/src/aarch64/neon/mod.rs | 1 - .../src/arm_shared/neon/generated.rs | 588 ++++++------------ crates/core_arch/src/arm_shared/neon/mod.rs | 2 +- .../spec/neon/aarch64.spec.yml | 12 +- .../spec/neon/arm_shared.spec.yml | 100 +-- 6 files changed, 235 insertions(+), 484 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 2624e2f22d..68320e916d 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -10344,13 +10344,7 @@ pub fn veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vextq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_p64)"] @@ -10361,13 +10355,7 @@ pub fn vextq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vextq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Floating-point fused Multiply-Add to accumulator(vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f64)"] diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index b140131012..7ce79671f7 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -12,7 +12,6 @@ pub use self::generated::*; use crate::{ core_arch::{arm_shared::*, simd::*}, - hint::unreachable_unchecked, intrinsics::{simd::*, *}, mem::transmute, }; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 1ae7f3f3b0..8f95952084 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -11956,15 +11956,7 @@ pub fn veorq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[cfg(not(target_arch = "arm64ec"))] pub fn vext_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_f32)"] @@ -11987,13 +11979,7 @@ pub fn vext_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { )] pub fn vext_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s32)"] @@ -12016,13 +12002,7 @@ pub fn vext_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { )] pub fn vext_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u32)"] @@ -12045,13 +12025,7 @@ pub fn vext_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { )] pub fn vext_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s64)"] @@ -12125,17 +12099,20 @@ pub unsafe fn vext_u64(a: uint64x1_t, _b: uint64x1_t) -> uint64x1_ pub fn vext_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12160,17 +12137,20 @@ pub fn vext_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { pub fn vextq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12195,17 +12175,20 @@ pub fn vextq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { pub fn vext_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12230,17 +12213,20 @@ pub fn vext_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { pub fn vextq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12265,17 +12251,20 @@ pub fn vextq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { pub fn vext_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12300,17 +12289,20 @@ pub fn vext_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { pub fn vextq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12336,17 +12328,20 @@ pub fn vextq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { pub fn vextq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { static_assert_uimm_bits!(N, 3); unsafe { - match N & 0b111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), - 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), - 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), - 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), - 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12370,15 +12365,7 @@ pub fn vextq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { )] pub fn vextq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_s16)"] @@ -12401,15 +12388,7 @@ pub fn vextq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { )] pub fn vext_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s32)"] @@ -12432,15 +12411,7 @@ pub fn vext_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { )] pub fn vextq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_u16)"] @@ -12463,15 +12434,7 @@ pub fn vextq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { )] pub fn vext_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u32)"] @@ -12494,15 +12457,7 @@ pub fn vext_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { )] pub fn vextq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vext_p16)"] @@ -12525,15 +12480,7 @@ pub fn vextq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { )] pub fn vext_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - match N & 0b11 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), - 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), - 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s64)"] @@ -12556,13 +12503,7 @@ pub fn vext_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { )] pub fn vextq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u64)"] @@ -12585,13 +12526,7 @@ pub fn vextq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { )] pub fn vextq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - match N & 0b1 { - 0 => simd_shuffle!(a, b, [0, 1]), - 1 => simd_shuffle!(a, b, [1, 2]), - _ => unreachable_unchecked(), - } - } + unsafe { simd_shuffle!(a, b, [N as u32, N as u32 + 1]) } } #[doc = "Extract vector from pair of vectors"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_s8)"] @@ -12615,85 +12550,28 @@ pub fn vextq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { pub fn vextq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { static_assert_uimm_bits!(N, 4); unsafe { - match N & 0b1111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!( - a, - b, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - ), - 2 => simd_shuffle!( - a, - b, - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] - ), - 3 => simd_shuffle!( - a, - b, - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] - ), - 4 => simd_shuffle!( - a, - b, - [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] - ), - 5 => simd_shuffle!( - a, - b, - [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] - ), - 6 => simd_shuffle!( - a, - b, - [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] - ), - 7 => simd_shuffle!( - a, - b, - [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] - ), - 8 => simd_shuffle!( - a, - b, - [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] - ), - 9 => simd_shuffle!( - a, - b, - [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] - ), - 10 => simd_shuffle!( - a, - b, - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] - ), - 11 => simd_shuffle!( - a, - b, - [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] - ), - 12 => simd_shuffle!( - a, - b, - [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] - ), - 13 => simd_shuffle!( - a, - b, - [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] - ), - 14 => simd_shuffle!( - a, - b, - [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] - ), - 15 => simd_shuffle!( - a, - b, - [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] - ), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7, + N as u32 + 8, + N as u32 + 9, + N as u32 + 10, + N as u32 + 11, + N as u32 + 12, + N as u32 + 13, + N as u32 + 14, + N as u32 + 15 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12718,85 +12596,28 @@ pub fn vextq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { pub fn vextq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { static_assert_uimm_bits!(N, 4); unsafe { - match N & 0b1111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!( - a, - b, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - ), - 2 => simd_shuffle!( - a, - b, - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] - ), - 3 => simd_shuffle!( - a, - b, - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] - ), - 4 => simd_shuffle!( - a, - b, - [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] - ), - 5 => simd_shuffle!( - a, - b, - [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] - ), - 6 => simd_shuffle!( - a, - b, - [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] - ), - 7 => simd_shuffle!( - a, - b, - [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] - ), - 8 => simd_shuffle!( - a, - b, - [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] - ), - 9 => simd_shuffle!( - a, - b, - [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] - ), - 10 => simd_shuffle!( - a, - b, - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] - ), - 11 => simd_shuffle!( - a, - b, - [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] - ), - 12 => simd_shuffle!( - a, - b, - [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] - ), - 13 => simd_shuffle!( - a, - b, - [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] - ), - 14 => simd_shuffle!( - a, - b, - [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] - ), - 15 => simd_shuffle!( - a, - b, - [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] - ), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7, + N as u32 + 8, + N as u32 + 9, + N as u32 + 10, + N as u32 + 11, + N as u32 + 12, + N as u32 + 13, + N as u32 + 14, + N as u32 + 15 + ] + ) } } #[doc = "Extract vector from pair of vectors"] @@ -12821,85 +12642,28 @@ pub fn vextq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { pub fn vextq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { static_assert_uimm_bits!(N, 4); unsafe { - match N & 0b1111 { - 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!( - a, - b, - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - ), - 2 => simd_shuffle!( - a, - b, - [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] - ), - 3 => simd_shuffle!( - a, - b, - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] - ), - 4 => simd_shuffle!( - a, - b, - [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] - ), - 5 => simd_shuffle!( - a, - b, - [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] - ), - 6 => simd_shuffle!( - a, - b, - [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] - ), - 7 => simd_shuffle!( - a, - b, - [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] - ), - 8 => simd_shuffle!( - a, - b, - [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] - ), - 9 => simd_shuffle!( - a, - b, - [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] - ), - 10 => simd_shuffle!( - a, - b, - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] - ), - 11 => simd_shuffle!( - a, - b, - [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] - ), - 12 => simd_shuffle!( - a, - b, - [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] - ), - 13 => simd_shuffle!( - a, - b, - [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] - ), - 14 => simd_shuffle!( - a, - b, - [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] - ), - 15 => simd_shuffle!( - a, - b, - [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] - ), - _ => unreachable_unchecked(), - } + simd_shuffle!( + a, + b, + [ + N as u32, + N as u32 + 1, + N as u32 + 2, + N as u32 + 3, + N as u32 + 4, + N as u32 + 5, + N as u32 + 6, + N as u32 + 7, + N as u32 + 8, + N as u32 + 9, + N as u32 + 10, + N as u32 + 11, + N as u32 + 12, + N as u32 + 13, + N as u32 + 14, + N as u32 + 15 + ] + ) } } #[doc = "Floating-point fused Multiply-Add to accumulator (vector)"] diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index c4249c2b3c..12d3c090ed 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -7,7 +7,7 @@ mod generated; #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub use self::generated::*; -use crate::{core_arch::simd::*, hint::unreachable_unchecked, intrinsics::simd::*, mem::transmute}; +use crate::{core_arch::simd::*, intrinsics::simd::*, mem::transmute}; #[cfg(test)] use stdarch_test::assert_instr; diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 75e7e88362..76a6b57410 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -2662,10 +2662,10 @@ intrinsics: - FnCall: [static_assert_uimm_bits!, [N, 2]] - FnCall: ['vget{neon_type[0].lane_nox}', [a], [N]] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [ext, 'N = 1']]}]] - FnCall: [rustc_legacy_const_generics, ['2']] @@ -2673,11 +2673,11 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [poly64x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] - - [float64x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] + - poly64x2_t + - float64x2_t compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1]']] - name: "vmla{neon_type.no}" doc: "Floating-point multiply-add to accumulator" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 08a80c3719..3ddd8db3b6 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -1719,10 +1719,10 @@ intrinsics: - "transmute" - - FnCall: ['vget{neon_type[1].lane_nox}', [a], [N]] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 7']]}]] @@ -1733,20 +1733,20 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] - - [int16x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] - - [uint8x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] - - [uint16x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] - - [poly8x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] - - [poly16x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] + - int8x8_t + - int16x8_t + - uint8x8_t + - uint16x8_t + - poly8x8_t + - poly16x8_t compose: - FnCall: [static_assert_uimm_bits!, [N, 3]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3, N as u32 + 4, N as u32 + 5, N as u32 + 6, N as u32 + 7]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 15']]}]] @@ -1757,17 +1757,17 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x16_t, 'match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => unreachable_unchecked(), }'] - - [uint8x16_t, 'match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => unreachable_unchecked(), }'] - - [poly8x16_t, 'match N & 0b1111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), 8 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 9 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]), 10 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), 11 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]), 12 => simd_shuffle!(a, b, [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), 13 => simd_shuffle!(a, b, [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]), 14 => simd_shuffle!(a, b, [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), 15 => simd_shuffle!(a, b, [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), _ => unreachable_unchecked(), }'] + - int8x16_t + - uint8x16_t + - poly8x16_t compose: - FnCall: [static_assert_uimm_bits!, [N, 4]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3, N as u32 + 4, N as u32 + 5, N as u32 + 6, N as u32 + 7, N as u32 + 8, N as u32 + 9, N as u32 + 10, N as u32 + 11, N as u32 + 12, N as u32 + 13, N as u32 + 14, N as u32 + 15]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 3']]}]] @@ -1778,21 +1778,21 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int16x4_t,'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] - - [int32x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] - - [uint16x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] - - [uint32x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] - - [poly16x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] - - [float32x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] + - int16x4_t + - int32x4_t + - uint16x4_t + - uint32x4_t + - poly16x4_t + - float32x4_t compose: - FnCall: [static_assert_uimm_bits!, [N, 2]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 3']]}]] @@ -1805,15 +1805,15 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [float16x4_t, 'match N & 0b11 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6]), _ => unreachable_unchecked(), }'] + - float16x4_t compose: - FnCall: [static_assert_uimm_bits!, [N, 2]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 7']]}]] @@ -1826,17 +1826,17 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [float16x8_t, 'match N & 0b111 { 0 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), 2 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), 3 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), 4 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), 5 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), 6 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), 7 => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), _ => unreachable_unchecked(), }'] + - float16x8_t compose: - FnCall: [static_assert_uimm_bits!, [N, 3]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1, N as u32 + 2, N as u32 + 3, N as u32 + 4, N as u32 + 5, N as u32 + 6, N as u32 + 7]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vext.8"', 'N = 1']]}]] @@ -1847,17 +1847,17 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int32x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] - - [uint32x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] - - [float32x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] + - int32x2_t + - uint32x2_t + - float32x2_t compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1]']] - - name: "vext{neon_type[0].no}" + - name: "vext{neon_type.no}" doc: "Extract vector from pair of vectors" - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[0]}" + arguments: ["a: {neon_type}", "b: {neon_type}"] + return_type: "{neon_type}" attr: - *neon-v7 - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vmov, 'N = 1']]}]] @@ -1868,11 +1868,11 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int64x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] - - [uint64x2_t, 'match N & 0b1 { 0 => simd_shuffle!(a, b, [0, 1]), 1 => simd_shuffle!(a, b, [1, 2]), _ => unreachable_unchecked(), }'] + - int64x2_t + - uint64x2_t compose: - FnCall: [static_assert_uimm_bits!, [N, 1]] - - Identifier: ["{type[1]}", UnsafeSymbol] + - FnCall: [simd_shuffle!, [a, b, '[N as u32, N as u32 + 1]']] - name: "vmla{neon_type[0].no}" doc: "Multiply-add to accumulator" From 5cda14d8126a23cf65b05158f1cccbf769817b38 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 03:50:54 +0530 Subject: [PATCH 03/19] Change implementation of `vcopy{q}_lane{q}` --- .../core_arch/src/aarch64/neon/generated.rs | 2588 +---------------- .../spec/neon/aarch64.spec.yml | 106 +- 2 files changed, 108 insertions(+), 2586 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 68320e916d..46aede98bb 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4131,13 +4131,7 @@ pub fn vcopy_lane_f32( ) -> float32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_f32::(vget_lane_f32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s8)"] @@ -4152,19 +4146,7 @@ pub fn vcopy_lane_f32( pub fn vcopy_lane_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s8::(vget_lane_s8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s16)"] @@ -4179,15 +4161,7 @@ pub fn vcopy_lane_s8(a: int8x8_t, b: int8x8_ pub fn vcopy_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s16::(vget_lane_s16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_s32)"] @@ -4202,13 +4176,7 @@ pub fn vcopy_lane_s16(a: int16x4_t, b: int16 pub fn vcopy_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s32::(vget_lane_s32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u8)"] @@ -4223,19 +4191,7 @@ pub fn vcopy_lane_s32(a: int32x2_t, b: int32 pub fn vcopy_lane_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u8::(vget_lane_u8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u16)"] @@ -4253,15 +4209,7 @@ pub fn vcopy_lane_u16( ) -> uint16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u16::(vget_lane_u16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_u32)"] @@ -4279,13 +4227,7 @@ pub fn vcopy_lane_u32( ) -> uint32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u32::(vget_lane_u32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_p8)"] @@ -4300,19 +4242,7 @@ pub fn vcopy_lane_u32( pub fn vcopy_lane_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_p8::(vget_lane_p8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_p16)"] @@ -4330,15 +4260,7 @@ pub fn vcopy_lane_p16( ) -> poly16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_p16::(vget_lane_p16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_lane_f64)"] @@ -4416,14 +4338,7 @@ pub fn vcopy_laneq_f32( ) -> float32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let a: float32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_f32::(vgetq_lane_f32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s8)"] @@ -4438,21 +4353,7 @@ pub fn vcopy_laneq_f32( pub fn vcopy_laneq_s8(a: int8x8_t, b: int8x16_t) -> int8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 4); - unsafe { - let a: int8x16_t = - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s8::(vgetq_lane_s8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s16)"] @@ -4470,16 +4371,7 @@ pub fn vcopy_laneq_s16( ) -> int16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let a: int16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s16::(vgetq_lane_s16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_s32)"] @@ -4497,14 +4389,7 @@ pub fn vcopy_laneq_s32( ) -> int32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let a: int32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_s32::(vgetq_lane_s32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u8)"] @@ -4522,21 +4407,7 @@ pub fn vcopy_laneq_u8( ) -> uint8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 4); - unsafe { - let a: uint8x16_t = - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u8::(vgetq_lane_u8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u16)"] @@ -4554,16 +4425,7 @@ pub fn vcopy_laneq_u16( ) -> uint16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let a: uint16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u16::(vgetq_lane_u16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_u32)"] @@ -4581,14 +4443,7 @@ pub fn vcopy_laneq_u32( ) -> uint32x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let a: uint32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_u32::(vgetq_lane_u32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_p8)"] @@ -4606,21 +4461,7 @@ pub fn vcopy_laneq_p8( ) -> poly8x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 4); - unsafe { - let a: poly8x16_t = - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_p8::(vgetq_lane_p8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_p16)"] @@ -4638,16 +4479,7 @@ pub fn vcopy_laneq_p16( ) -> poly16x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let a: poly16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vset_lane_p16::(vgetq_lane_p16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopy_laneq_f64)"] @@ -4722,16 +4554,7 @@ pub fn vcopyq_lane_f32( ) -> float32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 1); - unsafe { - let b: float32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_f32::(vget_lane_f32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_f64)"] @@ -4749,14 +4572,8 @@ pub fn vcopyq_lane_f64( ) -> float64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert!(LANE2 == 0); - unsafe { - let b: float64x2_t = simd_shuffle!(b, b, [0, 1]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: float64x2_t = vcombine_f64(b, b); + vsetq_lane_f64::(vgetq_lane_f64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s64)"] @@ -4774,14 +4591,8 @@ pub fn vcopyq_lane_s64( ) -> int64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert!(LANE2 == 0); - unsafe { - let b: int64x2_t = simd_shuffle!(b, b, [0, 1]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: int64x2_t = vcombine_s64(b, b); + vsetq_lane_s64::(vgetq_lane_s64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u64)"] @@ -4799,14 +4610,8 @@ pub fn vcopyq_lane_u64( ) -> uint64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert!(LANE2 == 0); - unsafe { - let b: uint64x2_t = simd_shuffle!(b, b, [0, 1]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: uint64x2_t = vcombine_u64(b, b); + vsetq_lane_u64::(vgetq_lane_u64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p64)"] @@ -4824,14 +4629,8 @@ pub fn vcopyq_lane_p64( ) -> poly64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert!(LANE2 == 0); - unsafe { - let b: poly64x2_t = simd_shuffle!(b, b, [0, 1]); - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: poly64x2_t = vcombine_p64(b, b); + vsetq_lane_p64::(vgetq_lane_p64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s8)"] @@ -4846,365 +4645,8 @@ pub fn vcopyq_lane_p64( pub fn vcopyq_lane_s8(a: int8x16_t, b: int8x8_t) -> int8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let b: int8x16_t = - simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + let b: int8x16_t = vcombine_s8(b, b); + vsetq_lane_s8::(vgetq_lane_s8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s16)"] @@ -5222,20 +4664,8 @@ pub fn vcopyq_lane_s16( ) -> int16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let b: int16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: int16x8_t = vcombine_s16(b, b); + vsetq_lane_s16::(vgetq_lane_s16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s32)"] @@ -5253,16 +4683,8 @@ pub fn vcopyq_lane_s32( ) -> int32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 1); - unsafe { - let b: int32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: int32x4_t = vcombine_s32(b, b); + vsetq_lane_s32::(vgetq_lane_s32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u8)"] @@ -5280,365 +4702,8 @@ pub fn vcopyq_lane_u8( ) -> uint8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let b: uint8x16_t = - simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + let b: uint8x16_t = vcombine_u8(b, b); + vsetq_lane_u8::(vgetq_lane_u8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u16)"] @@ -5656,20 +4721,8 @@ pub fn vcopyq_lane_u16( ) -> uint16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let b: uint16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: uint16x8_t = vcombine_u16(b, b); + vsetq_lane_u16::(vgetq_lane_u16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_u32)"] @@ -5687,16 +4740,8 @@ pub fn vcopyq_lane_u32( ) -> uint32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 1); - unsafe { - let b: uint32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]); - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: uint32x4_t = vcombine_u32(b, b); + vsetq_lane_u32::(vgetq_lane_u32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p8)"] @@ -5714,365 +4759,8 @@ pub fn vcopyq_lane_p8( ) -> poly8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 3); - unsafe { - let b: poly8x16_t = - simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + let b: poly8x16_t = vcombine_p8(b, b); + vsetq_lane_p8::(vgetq_lane_p8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_p16)"] @@ -6090,20 +4778,8 @@ pub fn vcopyq_lane_p16( ) -> poly16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 2); - unsafe { - let b: poly16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]); - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + let b: poly16x8_t = vcombine_p16(b, b); + vsetq_lane_p16::(vgetq_lane_p16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_f32)"] @@ -6121,15 +4797,7 @@ pub fn vcopyq_laneq_f32( ) -> float32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_f32::(vgetq_lane_f32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_f64)"] @@ -6147,13 +4815,7 @@ pub fn vcopyq_laneq_f64( ) -> float64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_f64::(vgetq_lane_f64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s8)"] @@ -6171,363 +4833,7 @@ pub fn vcopyq_laneq_s8( ) -> int8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 4); - unsafe { - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + vsetq_lane_s8::(vgetq_lane_s8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s16)"] @@ -6545,19 +4851,7 @@ pub fn vcopyq_laneq_s16( ) -> int16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_s16::(vgetq_lane_s16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s32)"] @@ -6575,15 +4869,7 @@ pub fn vcopyq_laneq_s32( ) -> int32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_s32::(vgetq_lane_s32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_s64)"] @@ -6601,13 +4887,7 @@ pub fn vcopyq_laneq_s64( ) -> int64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_s64::(vgetq_lane_s64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u8)"] @@ -6625,363 +4905,7 @@ pub fn vcopyq_laneq_u8( ) -> uint8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 4); - unsafe { - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + vsetq_lane_u8::(vgetq_lane_u8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u16)"] @@ -6999,19 +4923,7 @@ pub fn vcopyq_laneq_u16( ) -> uint16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_u16::(vgetq_lane_u16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u32)"] @@ -7029,15 +4941,7 @@ pub fn vcopyq_laneq_u32( ) -> uint32x4_t { static_assert_uimm_bits!(LANE1, 2); static_assert_uimm_bits!(LANE2, 2); - unsafe { - match LANE1 & 0b11 { - 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), - 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), - 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_u32::(vgetq_lane_u32::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_u64)"] @@ -7055,13 +4959,7 @@ pub fn vcopyq_laneq_u64( ) -> uint64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_u64::(vgetq_lane_u64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p8)"] @@ -7079,363 +4977,7 @@ pub fn vcopyq_laneq_p8( ) -> poly8x16_t { static_assert_uimm_bits!(LANE1, 4); static_assert_uimm_bits!(LANE2, 4); - unsafe { - match LANE1 & 0b1111 { - 0 => simd_shuffle!( - a, - b, - [ - 16 + LANE2 as u32, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 1 => simd_shuffle!( - a, - b, - [ - 0, - 16 + LANE2 as u32, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 2 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 16 + LANE2 as u32, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 3 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 16 + LANE2 as u32, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 4 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 16 + LANE2 as u32, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 5 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 16 + LANE2 as u32, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 6 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 16 + LANE2 as u32, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 7 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 16 + LANE2 as u32, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 8 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 16 + LANE2 as u32, - 9, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 9 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 16 + LANE2 as u32, - 10, - 11, - 12, - 13, - 14, - 15 - ] - ), - 10 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 16 + LANE2 as u32, - 11, - 12, - 13, - 14, - 15 - ] - ), - 11 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 16 + LANE2 as u32, - 12, - 13, - 14, - 15 - ] - ), - 12 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 16 + LANE2 as u32, - 13, - 14, - 15 - ] - ), - 13 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 16 + LANE2 as u32, - 14, - 15 - ] - ), - 14 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 16 + LANE2 as u32, - 15 - ] - ), - 15 => simd_shuffle!( - a, - b, - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 16 + LANE2 as u32 - ] - ), - _ => unreachable_unchecked(), - } - } + vsetq_lane_p8::(vgetq_lane_p8::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p16)"] @@ -7453,19 +4995,7 @@ pub fn vcopyq_laneq_p16( ) -> poly16x8_t { static_assert_uimm_bits!(LANE1, 3); static_assert_uimm_bits!(LANE2, 3); - unsafe { - match LANE1 & 0b111 { - 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), - 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), - 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), - 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), - 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), - 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_p16::(vgetq_lane_p16::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_laneq_p64)"] @@ -7483,13 +5013,7 @@ pub fn vcopyq_laneq_p64( ) -> poly64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - unsafe { - match LANE1 & 0b1 { - 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), - 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), - _ => unreachable_unchecked(), - } - } + vsetq_lane_p64::(vgetq_lane_p64::(b), a) } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 76a6b57410..e88ebc2b7f 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -8941,19 +8941,19 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [int8x8_t, int8x8_t, int8x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int16x4_t, int16x4_t, int16x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int32x2_t, int32x2_t, int32x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint8x8_t, uint8x8_t, uint8x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint16x4_t, uint16x4_t, uint16x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint32x2_t, uint32x2_t, uint32x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly8x8_t, poly8x8_t, poly8x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly16x4_t, poly16x4_t, poly16x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [float32x2_t, float32x2_t, float32x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [int8x8_t, int8x8_t, int8x8_t, '3', '3'] + - [int16x4_t, int16x4_t, int16x4_t, '2', '2'] + - [int32x2_t, int32x2_t, int32x2_t, '1', '1'] + - [uint8x8_t, uint8x8_t, uint8x8_t, '3', '3'] + - [uint16x4_t, uint16x4_t, uint16x4_t, '2', '2'] + - [uint32x2_t, uint32x2_t, uint32x2_t, '1', '1'] + - [poly8x8_t, poly8x8_t, poly8x8_t, '3', '3'] + - [poly16x4_t, poly16x4_t, poly16x4_t, '2', '2'] + - [float32x2_t, float32x2_t, float32x2_t, '1', '1'] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] - - Identifier: ["{type[5]}", UnsafeSymbol] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcopy{neon_type[0].lane_nox}" doc: "Insert vector element from another vector element" @@ -8966,19 +8966,19 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [int8x16_t, int8x8_t, int8x16_t, '4', '3', ' let b: int8x16_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int16x8_t, int16x4_t, int16x8_t, '3', '2', ' let b: int16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int32x4_t, int32x2_t, int32x4_t, '2', '1', ' let b: int32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint8x16_t, uint8x8_t, uint8x16_t, '4', '3', ' let b: uint8x16_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint16x8_t, uint16x4_t, uint16x8_t, '3', '2', ' let b: uint16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint32x4_t, uint32x2_t, uint32x4_t, '2', '1', ' let b: uint32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly8x16_t, poly8x8_t, poly8x16_t, '4', '3', ' let b: poly8x16_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly16x8_t, poly16x4_t, poly16x8_t, '3', '2', ' let b: poly16x8_t = simd_shuffle!(b, b, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [int8x16_t, int8x8_t, int8x16_t, '4', '3'] + - [int16x8_t, int16x4_t, int16x8_t, '3', '2'] + - [int32x4_t, int32x2_t, int32x4_t, '2', '1'] + - [uint8x16_t, uint8x8_t, uint8x16_t, '4', '3'] + - [uint16x8_t, uint16x4_t, uint16x8_t, '3', '2'] + - [uint32x4_t, uint32x2_t, uint32x4_t, '2', '1'] + - [poly8x16_t, poly8x8_t, poly8x16_t, '4', '3'] + - [poly16x8_t, poly16x4_t, poly16x8_t, '3', '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] - - Identifier: ["{type[5]}", UnsafeSymbol] - - Identifier: ["{type[6]}", UnsafeSymbol] + - Let: [b, '{neon_type[2]}', {FnCall: ['vcombine{neon_type[1].no}', [b, b]]}] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[2].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcopy_lane_{neon_type[0]}" doc: "Insert vector element from another vector element" @@ -9031,23 +9031,23 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [int8x16_t, int8x16_t, int8x16_t, '4', '4', ' match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int16x8_t, int16x8_t, int16x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int32x4_t, int32x4_t, int32x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int64x2_t, int64x2_t, int64x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint8x16_t, uint8x16_t, uint8x16_t, '4', '4', ' match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint16x8_t, uint16x8_t, uint16x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint32x4_t, uint32x4_t, uint32x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint64x2_t, uint64x2_t, uint64x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly8x16_t, poly8x16_t, poly8x16_t, '4', '4', ' match LANE1 & 0b1111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32, 8, 9, 10, 11, 12, 13, 14, 15]), 8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16 + LANE2 as u32, 9, 10, 11, 12, 13, 14, 15]), 9 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 16 + LANE2 as u32, 10, 11, 12, 13, 14, 15]), 10 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16 + LANE2 as u32, 11, 12, 13, 14, 15]), 11 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16 + LANE2 as u32, 12, 13, 14, 15]), 12 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16 + LANE2 as u32, 13, 14, 15]), 13 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16 + LANE2 as u32, 14, 15]), 14 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16 + LANE2 as u32, 15]), 15 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly16x8_t, poly16x8_t, poly16x8_t, '3', '3', ' match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 8 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 8 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly64x2_t, poly64x2_t, poly64x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [float32x4_t, float32x4_t, float32x4_t, '2', '2', ' match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [float64x2_t, float64x2_t, float64x2_t, '1', '1', ' match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [int8x16_t, int8x16_t, int8x16_t, '4', '4'] + - [int16x8_t, int16x8_t, int16x8_t, '3', '3'] + - [int32x4_t, int32x4_t, int32x4_t, '2', '2'] + - [int64x2_t, int64x2_t, int64x2_t, '1', '1'] + - [uint8x16_t, uint8x16_t, uint8x16_t, '4', '4'] + - [uint16x8_t, uint16x8_t, uint16x8_t, '3', '3'] + - [uint32x4_t, uint32x4_t, uint32x4_t, '2', '2'] + - [uint64x2_t, uint64x2_t, uint64x2_t, '1', '1'] + - [poly8x16_t, poly8x16_t, poly8x16_t, '4', '4'] + - [poly16x8_t, poly16x8_t, poly16x8_t, '3', '3'] + - [poly64x2_t, poly64x2_t, poly64x2_t, '1', '1'] + - [float32x4_t, float32x4_t, float32x4_t, '2', '2'] + - [float64x2_t, float64x2_t, float64x2_t, '1', '1'] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] - - Identifier: ["{type[5]}", UnsafeSymbol] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcopy{neon_type[0].laneq_nox}" doc: "Insert vector element from another vector element" @@ -9060,20 +9060,19 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [int8x8_t, int8x16_t, int8x8_t, '3', '4', ' let a: int8x16_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int16x4_t, int16x8_t, int16x4_t, '2', '3', ' let a: int16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [int32x2_t, int32x4_t, int32x2_t, '1', '2', ' let a: int32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint8x8_t, uint8x16_t, uint8x8_t, '3', '4', ' let a: uint8x16_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint16x4_t, uint16x8_t, uint16x4_t, '2', '3', ' let a: uint16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint32x2_t, uint32x4_t, uint32x2_t, '1', '2', 'let a: uint32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly8x8_t, poly8x16_t, poly8x8_t, '3', '4', ' let a: poly8x16_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);', 'match LANE1 & 0b111 { 0 => simd_shuffle!(a, b, [16 + LANE2 as u32, 1, 2, 3, 4, 5, 6, 7]), 1 => simd_shuffle!(a, b, [0, 16 + LANE2 as u32, 2, 3, 4, 5, 6, 7]), 2 => simd_shuffle!(a, b, [0, 1, 16 + LANE2 as u32, 3, 4, 5, 6, 7]), 3 => simd_shuffle!(a, b, [0, 1, 2, 16 + LANE2 as u32, 4, 5, 6, 7]), 4 => simd_shuffle!(a, b, [0, 1, 2, 3, 16 + LANE2 as u32, 5, 6, 7]), 5 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 16 + LANE2 as u32, 6, 7]), 6 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 16 + LANE2 as u32, 7]), 7 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 16 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly16x4_t, poly16x8_t, poly16x4_t, '2', '3', ' let a: poly16x8_t = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [8 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 8 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 8 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 8 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [float32x2_t, float32x4_t, float32x2_t, '1', '2', ' let a: float32x4_t = simd_shuffle!(a, a, [0, 1, 2, 3]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [int8x8_t, int8x16_t, int8x8_t, '3', '4'] + - [int16x4_t, int16x8_t, int16x4_t, '2', '3'] + - [int32x2_t, int32x4_t, int32x2_t, '1', '2'] + - [uint8x8_t, uint8x16_t, uint8x8_t, '3', '4'] + - [uint16x4_t, uint16x8_t, uint16x4_t, '2', '3'] + - [uint32x2_t, uint32x4_t, uint32x2_t, '1', '2'] + - [poly8x8_t, poly8x16_t, poly8x8_t, '3', '4'] + - [poly16x4_t, poly16x8_t, poly16x4_t, '2', '3'] + - [float32x2_t, float32x4_t, float32x2_t, '1', '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] - - Identifier: ["{type[5]}", UnsafeSymbol] - - Identifier: ["{type[6]}", UnsafeSymbol] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcopyq_lane_{neon_type[0]}" doc: "Insert vector element from another vector element" @@ -9086,15 +9085,15 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [int64x2_t, int64x1_t, ' let b: int64x2_t = simd_shuffle!(b, b, [0, 1]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [uint64x2_t, uint64x1_t, ' let b: uint64x2_t = simd_shuffle!(b, b, [0, 1]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [poly64x2_t, poly64x1_t, ' let b: poly64x2_t = simd_shuffle!(b, b, [0, 1]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] - - [float64x2_t, float64x1_t, ' let b: float64x2_t = simd_shuffle!(b, b, [0, 1]);', 'match LANE1 & 0b1 { 0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]), 1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [int64x2_t, int64x1_t] + - [uint64x2_t, uint64x1_t] + - [poly64x2_t, poly64x1_t] + - [float64x2_t, float64x1_t] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '1']] - FnCall: [static_assert!, ['LANE2 == 0']] - - Identifier: ['{type[2]}', UnsafeSymbol] - - Identifier: ['{type[3]}', UnsafeSymbol] + - Let: [b, '{neon_type[0]}', {FnCall: ['vcombine{neon_type[1].no}', [b, b]]}] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[0].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcopyq_lane_f32" doc: "Insert vector element from another vector element" @@ -9107,12 +9106,11 @@ intrinsics: static_defs: ['const LANE1: i32, const LANE2: i32'] safety: safe types: - - [float32x4_t, float32x2_t, ' let b: float32x4_t = simd_shuffle!(b, b, [0, 1, 2, 3]);', 'match LANE1 & 0b11 { 0 => simd_shuffle!(a, b, [4 + LANE2 as u32, 1, 2, 3]), 1 => simd_shuffle!(a, b, [0, 4 + LANE2 as u32, 2, 3]), 2 => simd_shuffle!(a, b, [0, 1, 4 + LANE2 as u32, 3]), 3 => simd_shuffle!(a, b, [0, 1, 2, 4 + LANE2 as u32]), _ => unreachable_unchecked(), }'] + - [float32x4_t, float32x2_t] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, 2]] - FnCall: [static_assert_uimm_bits!, [LANE2, 1]] - - Identifier: ["{type[2]}", UnsafeSymbol] - - Identifier: ["{type[3]}", UnsafeSymbol] + - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE2]]}, a], [LANE1]] - name: "vcreate_f64" doc: "Insert vector element from another vector element" From f50725352ff9690a4ecb3a89e35fafe71be65adc Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:04:28 +0530 Subject: [PATCH 04/19] Change implementation of `vab{a,d}l_high` --- .../core_arch/src/aarch64/neon/generated.rs | 114 ++++----- .../spec/neon/aarch64.spec.yml | 222 ++---------------- 2 files changed, 71 insertions(+), 265 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 46aede98bb..7059b0e4fa 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -70,10 +70,10 @@ pub fn __jcvt(a: f64) -> i32 { assert_instr(sabal2) )] pub fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + let d = vget_high_s8(b); + let e = vget_high_s8(c); + let f = vabd_s8(d, e); unsafe { - let d: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let e: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - let f: int8x8_t = vabd_s8(d, e); let f: uint8x8_t = simd_cast(f); simd_add(a, simd_cast(f)) } @@ -88,10 +88,10 @@ pub fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { assert_instr(sabal2) )] pub fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + let d = vget_high_s16(b); + let e = vget_high_s16(c); + let f = vabd_s16(d, e); unsafe { - let d: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let e: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - let f: int16x4_t = vabd_s16(d, e); let f: uint16x4_t = simd_cast(f); simd_add(a, simd_cast(f)) } @@ -106,10 +106,10 @@ pub fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { assert_instr(sabal2) )] pub fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + let d = vget_high_s32(b); + let e = vget_high_s32(c); + let f = vabd_s32(d, e); unsafe { - let d: int32x2_t = simd_shuffle!(b, b, [2, 3]); - let e: int32x2_t = simd_shuffle!(c, c, [2, 3]); - let f: int32x2_t = vabd_s32(d, e); let f: uint32x2_t = simd_cast(f); simd_add(a, simd_cast(f)) } @@ -124,12 +124,10 @@ pub fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { assert_instr(uabal2) )] pub fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { - unsafe { - let d: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let e: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - let f: uint8x8_t = vabd_u8(d, e); - simd_add(a, simd_cast(f)) - } + let d = vget_high_u8(b); + let e = vget_high_u8(c); + let f = vabd_u8(d, e); + unsafe { simd_add(a, simd_cast(f)) } } #[doc = "Unsigned Absolute difference and Accumulate Long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_u16)"] @@ -141,12 +139,10 @@ pub fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t assert_instr(uabal2) )] pub fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { - unsafe { - let d: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let e: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - let f: uint16x4_t = vabd_u16(d, e); - simd_add(a, simd_cast(f)) - } + let d = vget_high_u16(b); + let e = vget_high_u16(c); + let f = vabd_u16(d, e); + unsafe { simd_add(a, simd_cast(f)) } } #[doc = "Unsigned Absolute difference and Accumulate Long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabal_high_u32)"] @@ -158,12 +154,10 @@ pub fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t assert_instr(uabal2) )] pub fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { - unsafe { - let d: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - let e: uint32x2_t = simd_shuffle!(c, c, [2, 3]); - let f: uint32x2_t = vabd_u32(d, e); - simd_add(a, simd_cast(f)) - } + let d = vget_high_u32(b); + let e = vget_high_u32(c); + let f = vabd_u32(d, e); + unsafe { simd_add(a, simd_cast(f)) } } #[doc = "Absolute difference between the arguments of Floating"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabd_f64)"] @@ -226,85 +220,79 @@ pub fn vabdh_f16(a: f16, b: f16) -> f16 { vget_lane_f16::<0>(vabd_f16(vdup_n_f16(a), vdup_n_f16(b))) } #[doc = "Signed Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s16)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s8)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sabdl2))] -pub fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { +pub fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let c = vget_high_s8(a); + let d = vget_high_s8(b); unsafe { - let c: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let d: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let e: uint16x4_t = simd_cast(vabd_s16(c, d)); + let e: uint8x8_t = simd_cast(vabd_s8(c, d)); simd_cast(e) } } #[doc = "Signed Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s32)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s16)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sabdl2))] -pub fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { +pub fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let c = vget_high_s16(a); + let d = vget_high_s16(b); unsafe { - let c: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let d: int32x2_t = simd_shuffle!(b, b, [2, 3]); - let e: uint32x2_t = simd_cast(vabd_s32(c, d)); + let e: uint16x4_t = simd_cast(vabd_s16(c, d)); simd_cast(e) } } #[doc = "Signed Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s8)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_s32)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sabdl2))] -pub fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { +pub fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let c = vget_high_s32(a); + let d = vget_high_s32(b); unsafe { - let c: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let d: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let e: uint8x8_t = simd_cast(vabd_s8(c, d)); + let e: uint32x2_t = simd_cast(vabd_s32(c, d)); simd_cast(e) } } #[doc = "Unsigned Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u16)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u8)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uabdl2))] -pub fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { - unsafe { - let c: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let d: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - simd_cast(vabd_u16(c, d)) - } +pub fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let c = vget_high_u8(a); + let d = vget_high_u8(b); + unsafe { simd_cast(vabd_u8(c, d)) } } #[doc = "Unsigned Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u32)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u16)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uabdl2))] -pub fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { - unsafe { - let c: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - let d: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - simd_cast(vabd_u32(c, d)) - } +pub fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let c = vget_high_u16(a); + let d = vget_high_u16(b); + unsafe { simd_cast(vabd_u16(c, d)) } } #[doc = "Unsigned Absolute difference Long"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u8)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabdl_high_u32)"] #[inline] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uabdl2))] -pub fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { - unsafe { - let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let d: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - simd_cast(vabd_u8(c, d)) - } +pub fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let c = vget_high_u32(a); + let d = vget_high_u32(b); + unsafe { simd_cast(vabd_u32(c, d)) } } #[doc = "Floating-point absolute value"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vabs_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index e88ebc2b7f..26a542cdc3 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -220,103 +220,11 @@ intrinsics: safety: safe types: - [int8x16_t, int16x8_t, int8x8_t, uint8x8_t] - compose: - - Let: - - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [8, 9, 10, 11, 12, 13, 14, 15] - - Let: - - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [8, 9, 10, 11, 12, 13, 14, 15] - - Let: - - e - - "{neon_type[3]}" - - FnCall: - - simd_cast - - - FnCall: - - "vabd_{neon_type[0]}" - - - c - - d - - FnCall: - - simd_cast - - - e - - - name: "vabdl_high{neon_type[0].noq}" - doc: Signed Absolute difference Long - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[1]}" - attr: - - *neon-stable - - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [sabdl2]]}]] - safety: safe - types: - [int16x8_t, int32x4_t, int16x4_t, uint16x4_t] - compose: - - Let: - - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [4, 5, 6, 7] - - Let: - - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [4, 5, 6, 7] - - Let: - - e - - "{neon_type[3]}" - - FnCall: - - simd_cast - - - FnCall: - - "vabd_{neon_type[0]}" - - - c - - d - - FnCall: - - simd_cast - - - e - - - name: "vabdl_high{neon_type[0].noq}" - doc: Signed Absolute difference Long - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[1]}" - attr: - - *neon-stable - - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [sabdl2]]}]] - safety: safe - types: - [int32x4_t, int64x2_t, int32x2_t, uint32x2_t] compose: - - Let: - - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [2, 3] - - Let: - - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [2, 3] + - Let: [c, FnCall: ['vget_high_{neon_type[0]}', [a]]] + - Let: [d, FnCall: ['vget_high_{neon_type[0]}', [b]]] - Let: - e - "{neon_type[3]}" @@ -10050,19 +9958,17 @@ intrinsics: - FnCall: [cfg_attr, [*cfg-test-not-msvc-little-endian, {FnCall: [assert_instr, [uabal2]]}]] safety: safe types: - - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]', '[4, 5, 6, 7]'] - - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]', '[2, 3]'] + - [uint16x8_t, uint8x16_t, uint8x8_t] + - [uint32x4_t, uint16x8_t, uint16x4_t] + - [uint64x2_t, uint32x4_t, uint32x2_t] compose: - Let: - d - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: ['vget_high_{neon_type[1]}', [b]] - Let: - e - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] - - Let: [f, "{neon_type[2]}", {FnCall: ["vabd_{neon_type[2]}", [d, e]]}] + - FnCall: ['vget_high_{neon_type[1]}', [c]] + - Let: [f, {FnCall: ["vabd_{neon_type[2]}", [d, e]]}] - FnCall: - simd_add - - a @@ -10077,36 +9983,25 @@ intrinsics: - FnCall: [cfg_attr, [*cfg-test-not-msvc-little-endian, {FnCall: [assert_instr, [sabal2]]}]] safety: safe types: - - [int16x8_t, int8x16_t, int8x16_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t, uint8x8_t] - - [int32x4_t, int16x8_t, int16x8_t, '[4, 5, 6, 7]', int16x4_t, uint16x4_t] - - [int64x2_t, int32x4_t, int32x4_t, '[2, 3]', int32x2_t, uint32x2_t] + - [int16x8_t, int8x16_t, int8x16_t, int8x8_t, uint8x8_t] + - [int32x4_t, int16x8_t, int16x8_t, int16x4_t, uint16x4_t] + - [int64x2_t, int32x4_t, int32x4_t, int32x2_t, uint32x2_t] compose: - Let: - d - - "{neon_type[4]}" - - FnCall: - - simd_shuffle! - - - b - - b - - "{type[3]}" + - FnCall: ['vget_high_{neon_type[1]}', [b]] - Let: - e - - "{neon_type[4]}" - - FnCall: - - simd_shuffle! - - - c - - c - - "{type[3]}" + - FnCall: ['vget_high_{neon_type[2]}', [c]] - Let: - f - - "{neon_type[4]}" - FnCall: - - "vabd{neon_type[4].no}" + - "vabd{neon_type[3].no}" - - d - e - Let: - f - - "{neon_type[5]}" + - "{neon_type[4]}" - FnCall: - simd_cast - - f @@ -11502,93 +11397,16 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [uabdl2]]}]] safety: safe types: - - [uint8x16_t, uint16x8_t, uint8x8_t] + - [uint8x16_t, uint16x8_t] + - [uint16x8_t, uint32x4_t] + - [uint32x4_t, uint64x2_t] compose: - Let: - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [8, 9, 10, 11, 12, 13, 14, 15] + - FnCall: ['vget_high_{neon_type[0]}', [a]] - Let: - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [8, 9, 10, 11, 12, 13, 14, 15] - - FnCall: - - simd_cast - - - FnCall: - - "vabd_{neon_type[0]}" - - - c - - d - - - name: "vabdl_high{neon_type[0].noq}" - doc: Unsigned Absolute difference Long - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[1]}" - attr: - - *neon-stable - - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [uabdl2]]}]] - safety: safe - types: - - [uint16x8_t, uint32x4_t, uint16x4_t] - compose: - - Let: - - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [4, 5, 6, 7] - - Let: - - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [4, 5, 6, 7] - - - FnCall: - - simd_cast - - - FnCall: - - "vabd_{neon_type[0]}" - - - c - - d - - - name: "vabdl_high{neon_type[0].noq}" - doc: Unsigned Absolute difference Long - arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[1]}" - attr: - - *neon-stable - - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [uabdl2]]}]] - safety: safe - types: - - [uint32x4_t, uint64x2_t, uint32x2_t, int32x2_t] - compose: - - Let: - - c - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - a - - a - - [2, 3] - - Let: - - d - - "{neon_type[2]}" - - FnCall: - - simd_shuffle! - - - b - - b - - [2, 3] + - FnCall: ['vget_high_{neon_type[0]}', [b]] - FnCall: - simd_cast - - FnCall: From d67c66f70109ca1e48d020e62dffe02846603721 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:14:58 +0530 Subject: [PATCH 05/19] Change implementation of `vcvt{x}_f32_f64` --- crates/core_arch/src/aarch64/neon/generated.rs | 9 +++------ .../stdarch-gen-arm/spec/neon/aarch64.spec.yml | 18 ++++-------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 7059b0e4fa..448c4aacda 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5093,7 +5093,7 @@ pub fn vcvt_high_f32_f16(a: float16x8_t) -> float32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(fcvtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t { - unsafe { simd_shuffle!(a, simd_cast(b), [0, 1, 2, 3]) } + vcombine_f32(a, vcvt_f32_f64(b)) } #[doc = "Floating-point convert to higher precision long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_high_f64_f32)"] @@ -5102,10 +5102,7 @@ pub fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(fcvtl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t { - unsafe { - let b: float32x2_t = simd_shuffle!(a, a, [2, 3]); - simd_cast(b) - } + unsafe { simd_cast(vget_high_f32(a)) } } #[doc = "Fixed-point convert to floating-point"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f64_s64)"] @@ -7266,7 +7263,7 @@ pub fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(fcvtxn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t { - unsafe { simd_shuffle!(a, vcvtx_f32_f64(b), [0, 1, 2, 3]) } + vcombine_f32(a, vcvtx_f32_f64(b)) } #[doc = "Floating-point convert to lower precision narrow, rounding to odd"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtxd_f32_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 26a542cdc3..ac30a65127 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -1417,15 +1417,7 @@ intrinsics: types: - [float32x4_t, float64x2_t] compose: - - Let: - - b - - float32x2_t - - FnCall: - - simd_shuffle! - - - a - - a - - '[2, 3]' - - FnCall: [simd_cast, [b]] + - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [a]]}]] - name: "vcvt_high_f16_f32" doc: "Floating-point convert to lower precision" @@ -1486,10 +1478,9 @@ intrinsics: - [float32x2_t, float64x2_t, float32x4_t] compose: - FnCall: - - simd_shuffle! + - vcombine_f32 - - a - - FnCall: [simd_cast, [b]] - - '[0, 1, 2, 3]' + - FnCall: [vcvt_f32_f64, [b]] - name: "vcvtx_f32_f64" doc: "Floating-point convert to lower precision narrow, rounding to odd" @@ -1538,10 +1529,9 @@ intrinsics: - [float32x2_t, float64x2_t, float32x4_t] compose: - FnCall: - - simd_shuffle! + - vcombine_f32 - - a - FnCall: [vcvtx_f32_f64, [b]] - - '[0, 1, 2, 3]' - name: "vcvt{type[2]}" doc: "Floating-point convert to fixed-point, rounding toward zero" From 65fd2fe62b080ff9e58422a545710deaf3abdb0d Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:15:47 +0530 Subject: [PATCH 06/19] Change implementation of `vml{a,s}{l}_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 96 ++-- .../src/arm_shared/neon/generated.rs | 432 +++--------------- .../spec/neon/aarch64.spec.yml | 52 +-- .../spec/neon/arm_shared.spec.yml | 138 +++--- 4 files changed, 181 insertions(+), 537 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 448c4aacda..72462dcb60 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -12147,11 +12147,9 @@ pub fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { - unsafe { - let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let c: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - vmlal_s8(a, b, c) - } + let b = vget_high_s8(b); + let c = vget_high_s8(c); + vmlal_s8(a, b, c) } #[doc = "Signed multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_s16)"] @@ -12160,11 +12158,9 @@ pub fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { - unsafe { - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let c: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - vmlal_s16(a, b, c) - } + let b = vget_high_s16(b); + let c = vget_high_s16(c); + vmlal_s16(a, b, c) } #[doc = "Signed multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_s32)"] @@ -12173,11 +12169,9 @@ pub fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); - let c: int32x2_t = simd_shuffle!(c, c, [2, 3]); - vmlal_s32(a, b, c) - } + let b = vget_high_s32(b); + let c = vget_high_s32(c); + vmlal_s32(a, b, c) } #[doc = "Unsigned multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u8)"] @@ -12186,11 +12180,9 @@ pub fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { - unsafe { - let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let c: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - vmlal_u8(a, b, c) - } + let b = vget_high_u8(b); + let c = vget_high_u8(c); + vmlal_u8(a, b, c) } #[doc = "Unsigned multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u16)"] @@ -12199,11 +12191,9 @@ pub fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { - unsafe { - let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let c: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - vmlal_u16(a, b, c) - } + let b = vget_high_u16(b); + let c = vget_high_u16(c); + vmlal_u16(a, b, c) } #[doc = "Unsigned multiply-add long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_u32)"] @@ -12212,11 +12202,9 @@ pub fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlal2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { - unsafe { - let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - let c: uint32x2_t = simd_shuffle!(c, c, [2, 3]); - vmlal_u32(a, b, c) - } + let b = vget_high_u32(b); + let c = vget_high_u32(c); + vmlal_u32(a, b, c) } #[doc = "Floating-point multiply-subtract from accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_f64)"] @@ -12391,11 +12379,9 @@ pub fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { - unsafe { - let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let c: int8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - vmlsl_s8(a, b, c) - } + let b = vget_high_s8(b); + let c = vget_high_s8(c); + vmlsl_s8(a, b, c) } #[doc = "Signed multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_s16)"] @@ -12404,11 +12390,9 @@ pub fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { - unsafe { - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let c: int16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - vmlsl_s16(a, b, c) - } + let b = vget_high_s16(b); + let c = vget_high_s16(c); + vmlsl_s16(a, b, c) } #[doc = "Signed multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_s32)"] @@ -12417,11 +12401,9 @@ pub fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(smlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); - let c: int32x2_t = simd_shuffle!(c, c, [2, 3]); - vmlsl_s32(a, b, c) - } + let b = vget_high_s32(b); + let c = vget_high_s32(c); + vmlsl_s32(a, b, c) } #[doc = "Unsigned multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u8)"] @@ -12430,11 +12412,9 @@ pub fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { - unsafe { - let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let c: uint8x8_t = simd_shuffle!(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); - vmlsl_u8(a, b, c) - } + let b = vget_high_u8(b); + let c = vget_high_u8(c); + vmlsl_u8(a, b, c) } #[doc = "Unsigned multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u16)"] @@ -12443,11 +12423,9 @@ pub fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { - unsafe { - let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let c: uint16x4_t = simd_shuffle!(c, c, [4, 5, 6, 7]); - vmlsl_u16(a, b, c) - } + let b = vget_high_u16(b); + let c = vget_high_u16(c); + vmlsl_u16(a, b, c) } #[doc = "Unsigned multiply-subtract long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_u32)"] @@ -12456,11 +12434,9 @@ pub fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t #[cfg_attr(all(test, target_endian = "little"), assert_instr(umlsl2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { - unsafe { - let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - let c: uint32x2_t = simd_shuffle!(c, c, [2, 3]); - vmlsl_u32(a, b, c) - } + let b = vget_high_u32(b); + let c = vget_high_u32(c); + vmlsl_u32(a, b, c) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s8)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 8f95952084..134a549daf 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -26170,7 +26170,7 @@ pub fn vmla_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmla_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_f32(a, b, vdup_lane_f32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_f32)"] @@ -26197,7 +26197,7 @@ pub fn vmla_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmla_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_f32(a, b, vdup_laneq_f32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_f32)"] @@ -26224,13 +26224,7 @@ pub fn vmlaq_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlaq_f32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_f32(a, b, vdupq_lane_f32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_f32)"] @@ -26257,13 +26251,7 @@ pub fn vmlaq_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlaq_f32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_f32(a, b, vdupq_laneq_f32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_s16)"] @@ -26286,13 +26274,7 @@ pub fn vmlaq_laneq_f32( )] pub fn vmla_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmla_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmla_s16(a, b, vdup_lane_s16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_u16)"] @@ -26315,13 +26297,7 @@ pub fn vmla_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) )] pub fn vmla_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmla_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmla_u16(a, b, vdup_lane_u16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_s16)"] @@ -26344,13 +26320,7 @@ pub fn vmla_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_ )] pub fn vmla_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmla_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmla_s16(a, b, vdup_laneq_s16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_u16)"] @@ -26373,13 +26343,7 @@ pub fn vmla_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) )] pub fn vmla_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmla_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmla_u16(a, b, vdup_laneq_u16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_s16)"] @@ -26402,26 +26366,7 @@ pub fn vmla_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8 )] pub fn vmlaq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlaq_s16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlaq_s16(a, b, vdupq_lane_s16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_u16)"] @@ -26444,26 +26389,7 @@ pub fn vmlaq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) )] pub fn vmlaq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlaq_u16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlaq_u16(a, b, vdupq_lane_u16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_s16)"] @@ -26486,26 +26412,7 @@ pub fn vmlaq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4 )] pub fn vmlaq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlaq_s16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlaq_s16(a, b, vdupq_laneq_s16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_u16)"] @@ -26528,26 +26435,7 @@ pub fn vmlaq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t )] pub fn vmlaq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlaq_u16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlaq_u16(a, b, vdupq_laneq_u16::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_s32)"] @@ -26570,7 +26458,7 @@ pub fn vmlaq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x )] pub fn vmla_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmla_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_s32(a, b, vdup_lane_s32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_lane_u32)"] @@ -26593,7 +26481,7 @@ pub fn vmla_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) )] pub fn vmla_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmla_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_u32(a, b, vdup_lane_u32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_s32)"] @@ -26616,7 +26504,7 @@ pub fn vmla_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_ )] pub fn vmla_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmla_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_s32(a, b, vdup_laneq_s32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_laneq_u32)"] @@ -26639,7 +26527,7 @@ pub fn vmla_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) )] pub fn vmla_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmla_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmla_u32(a, b, vdup_laneq_u32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_s32)"] @@ -26662,13 +26550,7 @@ pub fn vmla_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4 )] pub fn vmlaq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlaq_s32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_s32(a, b, vdupq_lane_s32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_lane_u32)"] @@ -26691,13 +26573,7 @@ pub fn vmlaq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) )] pub fn vmlaq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlaq_u32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_u32(a, b, vdupq_lane_u32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_s32)"] @@ -26720,13 +26596,7 @@ pub fn vmlaq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2 )] pub fn vmlaq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlaq_s32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_s32(a, b, vdupq_laneq_s32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlaq_laneq_u32)"] @@ -26749,13 +26619,7 @@ pub fn vmlaq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t )] pub fn vmlaq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlaq_u32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlaq_u32(a, b, vdupq_laneq_u32::(c)) } #[doc = "Vector multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_n_f32)"] @@ -27240,13 +27104,7 @@ pub fn vmlaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { )] pub fn vmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlal_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlal_s16(a, b, vdup_lane_s16::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_s16)"] @@ -27269,13 +27127,7 @@ pub fn vmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) )] pub fn vmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlal_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlal_s16(a, b, vdup_laneq_s16::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_s32)"] @@ -27298,7 +27150,7 @@ pub fn vmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t )] pub fn vmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmlal_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlal_s32(a, b, vdup_lane_s32::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_s32)"] @@ -27321,7 +27173,7 @@ pub fn vmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) )] pub fn vmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmlal_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlal_s32(a, b, vdup_laneq_s32::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_u16)"] @@ -27344,13 +27196,7 @@ pub fn vmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t )] pub fn vmlal_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlal_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlal_u16(a, b, vdup_lane_u16::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_u16)"] @@ -27373,13 +27219,7 @@ pub fn vmlal_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4 )] pub fn vmlal_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlal_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlal_u16(a, b, vdup_laneq_u16::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_lane_u32)"] @@ -27402,7 +27242,7 @@ pub fn vmlal_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x )] pub fn vmlal_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmlal_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlal_u32(a, b, vdup_lane_u32::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_laneq_u32)"] @@ -27425,7 +27265,7 @@ pub fn vmlal_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2 )] pub fn vmlal_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmlal_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlal_u32(a, b, vdup_laneq_u32::(c)) } #[doc = "Vector widening multiply accumulate with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_n_s16)"] @@ -27704,7 +27544,7 @@ pub fn vmls_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmls_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_f32(a, b, vdup_lane_f32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_f32)"] @@ -27731,7 +27571,7 @@ pub fn vmls_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmls_f32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_f32(a, b, vdup_laneq_f32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_f32)"] @@ -27758,13 +27598,7 @@ pub fn vmlsq_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlsq_f32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_f32(a, b, vdupq_lane_f32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_f32)"] @@ -27791,13 +27625,7 @@ pub fn vmlsq_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsq_f32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_f32(a, b, vdupq_laneq_f32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_s16)"] @@ -27820,13 +27648,7 @@ pub fn vmlsq_laneq_f32( )] pub fn vmls_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmls_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmls_s16(a, b, vdup_lane_s16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_u16)"] @@ -27849,13 +27671,7 @@ pub fn vmls_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) )] pub fn vmls_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmls_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmls_u16(a, b, vdup_lane_u16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_s16)"] @@ -27878,13 +27694,7 @@ pub fn vmls_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_ )] pub fn vmls_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmls_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmls_s16(a, b, vdup_laneq_s16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_u16)"] @@ -27907,13 +27717,7 @@ pub fn vmls_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) )] pub fn vmls_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmls_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmls_u16(a, b, vdup_laneq_u16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_s16)"] @@ -27936,26 +27740,7 @@ pub fn vmls_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8 )] pub fn vmlsq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsq_s16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlsq_s16(a, b, vdupq_lane_s16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_u16)"] @@ -27978,26 +27763,7 @@ pub fn vmlsq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) )] pub fn vmlsq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsq_u16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlsq_u16(a, b, vdupq_lane_u16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_s16)"] @@ -28020,26 +27786,7 @@ pub fn vmlsq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4 )] pub fn vmlsq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlsq_s16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlsq_s16(a, b, vdupq_laneq_s16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_u16)"] @@ -28062,26 +27809,7 @@ pub fn vmlsq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t )] pub fn vmlsq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlsq_u16( - a, - b, - simd_shuffle!( - c, - c, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + vmlsq_u16(a, b, vdupq_laneq_u16::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_s32)"] @@ -28104,7 +27832,7 @@ pub fn vmlsq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x )] pub fn vmls_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmls_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_s32(a, b, vdup_lane_s32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_lane_u32)"] @@ -28127,7 +27855,7 @@ pub fn vmls_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) )] pub fn vmls_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmls_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_u32(a, b, vdup_lane_u32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_s32)"] @@ -28150,7 +27878,7 @@ pub fn vmls_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_ )] pub fn vmls_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmls_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_s32(a, b, vdup_laneq_s32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_laneq_u32)"] @@ -28173,7 +27901,7 @@ pub fn vmls_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) )] pub fn vmls_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmls_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmls_u32(a, b, vdup_laneq_u32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_s32)"] @@ -28196,13 +27924,7 @@ pub fn vmls_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4 )] pub fn vmlsq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlsq_s32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_s32(a, b, vdupq_lane_s32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_lane_u32)"] @@ -28225,13 +27947,7 @@ pub fn vmlsq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) )] pub fn vmlsq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - vmlsq_u32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_u32(a, b, vdupq_lane_u32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_s32)"] @@ -28254,13 +27970,7 @@ pub fn vmlsq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2 )] pub fn vmlsq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsq_s32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_s32(a, b, vdupq_laneq_s32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsq_laneq_u32)"] @@ -28283,13 +27993,7 @@ pub fn vmlsq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t )] pub fn vmlsq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsq_u32( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsq_u32(a, b, vdupq_laneq_u32::(c)) } #[doc = "Vector multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmls_n_f32)"] @@ -28774,13 +28478,7 @@ pub fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { )] pub fn vmlsl_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsl_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsl_s16(a, b, vdup_lane_s16::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_s16)"] @@ -28803,13 +28501,7 @@ pub fn vmlsl_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) )] pub fn vmlsl_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlsl_s16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsl_s16(a, b, vdup_laneq_s16::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_s32)"] @@ -28832,7 +28524,7 @@ pub fn vmlsl_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t )] pub fn vmlsl_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmlsl_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlsl_s32(a, b, vdup_lane_s32::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_s32)"] @@ -28855,7 +28547,7 @@ pub fn vmlsl_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) )] pub fn vmlsl_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmlsl_s32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlsl_s32(a, b, vdup_laneq_s32::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_u16)"] @@ -28878,13 +28570,7 @@ pub fn vmlsl_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t )] pub fn vmlsl_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmlsl_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsl_u16(a, b, vdup_lane_u16::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_u16)"] @@ -28907,13 +28593,7 @@ pub fn vmlsl_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4 )] pub fn vmlsl_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmlsl_u16( - a, - b, - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmlsl_u16(a, b, vdup_laneq_u16::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_lane_u32)"] @@ -28936,7 +28616,7 @@ pub fn vmlsl_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x )] pub fn vmlsl_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmlsl_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlsl_u32(a, b, vdup_lane_u32::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_laneq_u32)"] @@ -28959,7 +28639,7 @@ pub fn vmlsl_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2 )] pub fn vmlsl_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmlsl_u32(a, b, simd_shuffle!(c, c, [LANE as u32, LANE as u32])) } + vmlsl_u32(a, b, vdup_laneq_u32::(c)) } #[doc = "Vector widening multiply subtract with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_n_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index ac30a65127..37f28ca129 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -2600,12 +2600,12 @@ intrinsics: - *neon-stable safety: safe types: - - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]', '[4, 5, 6, 7]'] - - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]', '[2, 3]'] + - [int16x8_t, int8x16_t, int8x8_t] + - [int32x4_t, int16x8_t, int16x4_t] + - [int64x2_t, int32x4_t, int32x2_t] compose: - - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] - - Let: [c, "{neon_type[2]}", {FnCall: [simd_shuffle!, [c, c, "{type[4]}"]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[1]}', [b]]}] + - Let: [c, {FnCall: ['vget_high_{neon_type[1]}', [c]]}] - FnCall: ["vmlal_{neon_type[2]}", [a, b, c]] - name: "vmlal_high_{neon_type[1]}" @@ -2617,18 +2617,12 @@ intrinsics: - *neon-stable safety: safe types: - - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] - - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + - [uint16x8_t, uint8x16_t, uint8x8_t] + - [uint32x4_t, uint16x8_t, uint16x4_t] + - [uint64x2_t, uint32x4_t, uint32x2_t] compose: - - Let: - - b - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] - - Let: - - c - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - Let: [b, {FnCall: ['vget_high_{neon_type[1]}', [b]]}] + - Let: [c, {FnCall: ['vget_high_{neon_type[1]}', [c]]}] - FnCall: ["vmlal_{neon_type[1]}", [a, b, c]] - name: "vmlsl_high_{neon_type[1]}" @@ -2640,18 +2634,12 @@ intrinsics: - *neon-stable safety: safe types: - - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]'] - - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]'] + - [int16x8_t, int8x16_t, int8x8_t] + - [int32x4_t, int16x8_t, int16x4_t] + - [int64x2_t, int32x4_t, int32x2_t] compose: - - Let: - - b - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] - - Let: - - c - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - Let: [b, {FnCall: ['vget_high_{neon_type[1]}', [b]]}] + - Let: [c, {FnCall: ['vget_high_{neon_type[1]}', [c]]}] - FnCall: ["vmlsl_{neon_type[1]}", [a, b, c]] - name: "vmlsl_high_{neon_type[1]}" @@ -2663,12 +2651,12 @@ intrinsics: - *neon-stable safety: safe types: - - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] - - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + - [uint16x8_t, uint8x16_t, uint8x8_t] + - [uint32x4_t, uint16x8_t, uint16x4_t] + - [uint64x2_t, uint32x4_t, uint32x2_t] compose: - - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] - - Let: [c, "{neon_type[2]}", {FnCall: [simd_shuffle!, [c, c, "{type[3]}"]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[1]}', [b]]}] + - Let: [c, {FnCall: ['vget_high_{neon_type[1]}', [c]]}] - FnCall: ["vmlsl_{neon_type[1]}", [a, b, c]] - name: "vmovn_high{neon_type[1].noq}" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 3ddd8db3b6..a56c6079dc 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -1992,17 +1992,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [int32x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int32x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int64x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [int64x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [int32x4_t, int16x4_t, int16x4_t, '2'] + - [int32x4_t, int16x4_t, int16x8_t, '3'] + - [int64x2_t, int32x2_t, int32x2_t, '1'] + - [int64x2_t, int32x2_t, int32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmlal_{neon_type[1]}" - - a - b - - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: ['vdup_lane{neon_type[2].no}', [c], [LANE]] - name: "vmlal_lane{neon_type[2].no}" doc: "Vector widening multiply accumulate with scalar" @@ -2018,17 +2018,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [uint32x4_t, uint16x4_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint32x4_t, uint16x4_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint64x2_t, uint32x2_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32]'] - - [uint64x2_t, uint32x2_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x4_t, uint16x4_t, '2'] + - [uint32x4_t, uint16x4_t, uint16x8_t, '3'] + - [uint64x2_t, uint32x2_t, uint32x2_t, '1'] + - [uint64x2_t, uint32x2_t, uint32x4_t, '2'] compose: - - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmlal_{neon_type[1]}" - - a - b - - FnCall: [simd_shuffle!, [c, c, '{type[5]}']] + - FnCall: ['vdup_lane{neon_type[2].no}', [c], [LANE]] - name: "vmlal_{neon_type[1]}" doc: "Unsigned multiply-add long" @@ -2147,15 +2147,15 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [int32x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int32x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int32x4_t, int16x4_t, int16x4_t, '2'] + - [int32x4_t, int16x4_t, int16x8_t, '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmlsl_{neon_type[1]}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup_lane{neon_type[2].no}', [c], [LANE]] - name: "vmlsl_lane{neon_type[2].no}" doc: "Vector widening multiply subtract with scalar" @@ -2171,15 +2171,15 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [int64x2_t, int32x2_t, int32x2_t, '[LANE as u32, LANE as u32]', '1'] - - [int64x2_t, int32x2_t, int32x4_t, '[LANE as u32, LANE as u32]', '2'] + - [int64x2_t, int32x2_t, int32x2_t, '1'] + - [int64x2_t, int32x2_t, int32x4_t, '2'] compose: - - FnCall: [static_assert_uimm_bits!, [LANE, "{type[4]}"]] + - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmlsl_{neon_type[1]}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ['vdup_lane{neon_type[2].no}', [c], [LANE]] - name: "vmlsl_lane{neon_type[2].no}" doc: "Vector widening multiply subtract with scalar" @@ -2195,17 +2195,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [uint32x4_t, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint32x4_t, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint64x2_t, uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [uint64x2_t, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] + - [uint32x4_t, uint16x4_t, uint16x4_t, '2'] + - [uint32x4_t, uint16x4_t, uint16x8_t, '3'] + - [uint64x2_t, uint32x2_t, uint32x2_t, '1'] + - [uint64x2_t, uint32x2_t, uint32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmlsl_{neon_type[1]}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup_lane{neon_type[2].no}', [c], [LANE]] - name: "vmlsl_{neon_type[1]}" doc: "Unsigned multiply-subtract long" @@ -10901,21 +10901,21 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_laneq_u16, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_lane_u16, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s16, int16x4_t, int16x4_t, '2'] + - [_laneq_s16, int16x4_t, int16x8_t, '3'] + - [q_lane_s16, int16x8_t, int16x4_t, '2'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3'] + - [_lane_u16, uint16x4_t, uint16x4_t, '2'] + - [_laneq_u16, uint16x4_t, uint16x8_t, '3'] + - [q_lane_u16, uint16x8_t, uint16x4_t, '2'] + - [q_laneq_u16, uint16x8_t, uint16x8_t, '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmla{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmla{type[0]}" doc: "Vector multiply accumulate with scalar" @@ -10931,21 +10931,21 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_lane_u32, uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_u32, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_u32, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_u32, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s32, int32x2_t, int32x2_t, '1'] + - [_laneq_s32, int32x2_t, int32x4_t, '2'] + - [q_lane_s32, int32x4_t, int32x2_t, '1'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2'] + - [_lane_u32, uint32x2_t, uint32x2_t, '1'] + - [_laneq_u32, uint32x2_t, uint32x4_t, '2'] + - [q_lane_u32, uint32x4_t, uint32x2_t, '1'] + - [q_laneq_u32, uint32x4_t, uint32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmla{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmla{type[0]}" doc: "Vector multiply accumulate with scalar" @@ -10961,17 +10961,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_f32, float32x2_t, float32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_f32, float32x2_t, float32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_f32, float32x4_t, float32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_f32, float32x4_t, float32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_f32, float32x2_t, float32x2_t, '1'] + - [_laneq_f32, float32x2_t, float32x4_t, '2'] + - [q_lane_f32, float32x4_t, float32x2_t, '1'] + - [q_laneq_f32, float32x4_t, float32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmla{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmls{neon_type[0].N}" doc: "Vector multiply subtract with scalar" @@ -11050,21 +11050,21 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_laneq_u16, uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_lane_u16, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s16, int16x4_t, int16x4_t, '2'] + - [_laneq_s16, int16x4_t, int16x8_t, '3'] + - [q_lane_s16, int16x8_t, int16x4_t, '2'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3'] + - [_lane_u16, uint16x4_t, uint16x4_t, '2'] + - [_laneq_u16, uint16x4_t, uint16x8_t, '3'] + - [q_lane_u16, uint16x8_t, uint16x4_t, '2'] + - [q_laneq_u16, uint16x8_t, uint16x8_t, '3'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmls{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmls{type[0]}" doc: "Vector multiply subtract with scalar" @@ -11080,21 +11080,21 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_lane_u32, uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_u32, uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_u32, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_u32, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s32, int32x2_t, int32x2_t, '1'] + - [_laneq_s32, int32x2_t, int32x4_t, '2'] + - [q_lane_s32, int32x4_t, int32x2_t, '1'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2'] + - [_lane_u32, uint32x2_t, uint32x2_t, '1'] + - [_laneq_u32, uint32x2_t, uint32x4_t, '2'] + - [q_lane_u32, uint32x4_t, uint32x2_t, '1'] + - [q_laneq_u32, uint32x4_t, uint32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmls{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmls{type[0]}" doc: "Vector multiply subtract with scalar" @@ -11110,17 +11110,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_f32, float32x2_t, float32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_f32, float32x2_t, float32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_f32, float32x4_t, float32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_f32, float32x4_t, float32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_f32, float32x2_t, float32x2_t, '1'] + - [_laneq_f32, float32x2_t, float32x4_t, '2'] + - [q_lane_f32, float32x4_t, float32x2_t, '1'] + - [q_laneq_f32, float32x4_t, float32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - "vmls{neon_type[1].no}" - - a - b - - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]] + - FnCall: ['vdup{type[0]}', [c], [LANE]] - name: "vmul{neon_type[0].N}" doc: "Vector multiply by scalar" From 5e2812c60b0face0076b1ab22c18ae01384d0f99 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:23:22 +0530 Subject: [PATCH 07/19] Change implementation of `vmov{n,l}_high` --- .../core_arch/src/aarch64/neon/generated.rs | 66 +++++-------------- .../spec/neon/aarch64.spec.yml | 41 +++++------- 2 files changed, 34 insertions(+), 73 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 72462dcb60..fc6747b24d 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -12445,10 +12445,8 @@ pub fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sxtl2))] pub fn vmovl_high_s8(a: int8x16_t) -> int16x8_t { - unsafe { - let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - vmovl_s8(a) - } + let a = vget_high_s8(a); + vmovl_s8(a) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s16)"] @@ -12457,10 +12455,8 @@ pub fn vmovl_high_s8(a: int8x16_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sxtl2))] pub fn vmovl_high_s16(a: int16x8_t) -> int32x4_t { - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - vmovl_s16(a) - } + let a = vget_high_s16(a); + vmovl_s16(a) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_s32)"] @@ -12469,10 +12465,8 @@ pub fn vmovl_high_s16(a: int16x8_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(sxtl2))] pub fn vmovl_high_s32(a: int32x4_t) -> int64x2_t { - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - vmovl_s32(a) - } + let a = vget_high_s32(a); + vmovl_s32(a) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u8)"] @@ -12481,10 +12475,8 @@ pub fn vmovl_high_s32(a: int32x4_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uxtl2))] pub fn vmovl_high_u8(a: uint8x16_t) -> uint16x8_t { - unsafe { - let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - vmovl_u8(a) - } + let a = vget_high_u8(a); + vmovl_u8(a) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u16)"] @@ -12493,10 +12485,8 @@ pub fn vmovl_high_u8(a: uint8x16_t) -> uint16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uxtl2))] pub fn vmovl_high_u16(a: uint16x8_t) -> uint32x4_t { - unsafe { - let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - vmovl_u16(a) - } + let a = vget_high_u16(a); + vmovl_u16(a) } #[doc = "Vector move"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovl_high_u32)"] @@ -12505,10 +12495,8 @@ pub fn vmovl_high_u16(a: uint16x8_t) -> uint32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(uxtl2))] pub fn vmovl_high_u32(a: uint32x4_t) -> uint64x2_t { - unsafe { - let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - vmovl_u32(a) - } + let a = vget_high_u32(a); + vmovl_u32(a) } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s16)"] @@ -12517,10 +12505,7 @@ pub fn vmovl_high_u32(a: uint32x4_t) -> uint64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { - unsafe { - let c: int8x8_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) - } + unsafe { vcombine_s8(a, simd_cast(b)) } } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s32)"] @@ -12529,10 +12514,7 @@ pub fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { - unsafe { - let c: int16x4_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7]) - } + unsafe { vcombine_s16(a, simd_cast(b)) } } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_s64)"] @@ -12541,10 +12523,7 @@ pub fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { - unsafe { - let c: int32x2_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3]) - } + unsafe { vcombine_s32(a, simd_cast(b)) } } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u16)"] @@ -12553,10 +12532,7 @@ pub fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { - unsafe { - let c: uint8x8_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) - } + unsafe { vcombine_u8(a, simd_cast(b)) } } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u32)"] @@ -12565,10 +12541,7 @@ pub fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { - unsafe { - let c: uint16x4_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7]) - } + unsafe { vcombine_u16(a, simd_cast(b)) } } #[doc = "Extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmovn_high_u64)"] @@ -12577,10 +12550,7 @@ pub fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(xtn2))] pub fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { - unsafe { - let c: uint32x2_t = simd_cast(b); - simd_shuffle!(a, c, [0, 1, 2, 3]) - } + unsafe { vcombine_u32(a, simd_cast(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 37f28ca129..074e2af492 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -2668,24 +2668,17 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [xtn2]]}]] safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int8x16_t] + - [int16x4_t, int32x4_t, int16x8_t] + - [int32x2_t, int64x2_t, int32x4_t] + - [uint8x8_t, uint16x8_t, uint8x16_t] + - [uint16x4_t, uint32x4_t, uint16x8_t] + - [uint32x2_t, uint64x2_t, uint32x4_t] compose: - - Let: - - c - - "{neon_type[0]}" - FnCall: - - simd_cast - - - b - - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - - c - - "{type[3]}" + - FnCall: ['simd_cast', [b]] - name: "vneg{neon_type.no}" doc: Negate @@ -6788,14 +6781,13 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [sxtl2]]}]] safety: safe types: - - [int8x16_t, int16x8_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]'] - - [int32x4_t, int64x2_t, int32x2_t, '[2, 3]'] + - [int8x16_t, int16x8_t] + - [int16x8_t, int32x4_t] + - [int32x4_t, int64x2_t] compose: - Let: - a - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + - FnCall: ['vget_high_{neon_type[0]}', [a]] - FnCall: ["vmovl{neon_type[0].noq}", [a]] - name: "vmovl_high{neon_type[0].noq}" @@ -6807,14 +6799,13 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [uxtl2]]}]] safety: safe types: - - [uint8x16_t, uint16x8_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x8_t, uint32x4_t, uint16x4_t, '[4, 5, 6, 7]'] - - [uint32x4_t, uint64x2_t, uint32x2_t, '[2, 3]'] + - [uint8x16_t, uint16x8_t] + - [uint16x8_t, uint32x4_t] + - [uint32x4_t, uint64x2_t] compose: - Let: - a - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]] + - FnCall: ['vget_high_{neon_type[0]}', [a]] - FnCall: ["vmovl{neon_type[0].noq}", [a]] - name: "vpadd{neon_type[0].no}" From 580da003fd08c8fa6655b95c3732bbea747bda35 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:23:51 +0530 Subject: [PATCH 08/19] Change implementation of `vmull_{high,lane}` --- .../core_arch/src/aarch64/neon/generated.rs | 56 +++++++------------ .../src/arm_shared/neon/generated.rs | 36 +++--------- .../spec/neon/aarch64.spec.yml | 52 ++++++----------- .../spec/neon/arm_shared.spec.yml | 20 +++---- 4 files changed, 56 insertions(+), 108 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index fc6747b24d..7f53a0c628 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -12822,11 +12822,9 @@ pub fn vmull_high_p64(a: poly64x2_t, b: poly64x2_t) -> p128 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(pmull2))] pub fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t { - unsafe { - let a: poly8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let b: poly8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - vmull_p8(a, b) - } + let a = vget_high_p8(a); + let b = vget_high_p8(b); + vmull_p8(a, b) } #[doc = "Signed multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s8)"] @@ -12835,11 +12833,9 @@ pub fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(smull2))] pub fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { - unsafe { - let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - vmull_s8(a, b) - } + let a = vget_high_s8(a); + let b = vget_high_s8(b); + vmull_s8(a, b) } #[doc = "Signed multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s16)"] @@ -12848,11 +12844,9 @@ pub fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(smull2))] pub fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - vmull_s16(a, b) - } + let a = vget_high_s16(a); + let b = vget_high_s16(b); + vmull_s16(a, b) } #[doc = "Signed multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_s32)"] @@ -12861,11 +12855,9 @@ pub fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(smull2))] pub fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); - vmull_s32(a, b) - } + let a = vget_high_s32(a); + let b = vget_high_s32(b); + vmull_s32(a, b) } #[doc = "Unsigned multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u8)"] @@ -12874,11 +12866,9 @@ pub fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(umull2))] pub fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { - unsafe { - let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - vmull_u8(a, b) - } + let a = vget_high_u8(a); + let b = vget_high_u8(b); + vmull_u8(a, b) } #[doc = "Unsigned multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u16)"] @@ -12887,11 +12877,9 @@ pub fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(umull2))] pub fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { - unsafe { - let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - vmull_u16(a, b) - } + let a = vget_high_u16(a); + let b = vget_high_u16(b); + vmull_u16(a, b) } #[doc = "Unsigned multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_u32)"] @@ -12900,11 +12888,9 @@ pub fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(umull2))] pub fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { - unsafe { - let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - vmull_u32(a, b) - } + let a = vget_high_u32(a); + let b = vget_high_u32(b); + vmull_u32(a, b) } #[doc = "Polynomial multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_p64)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 134a549daf..111397752b 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -30965,12 +30965,7 @@ pub fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { )] pub fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmull_s16( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmull_s16(a, vdup_lane_s16::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_s16)"] @@ -30993,12 +30988,7 @@ pub fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t )] pub fn vmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmull_s16( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmull_s16(a, vdup_laneq_s16::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_s32)"] @@ -31021,7 +31011,7 @@ pub fn vmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t )] pub fn vmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmull_s32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + vmull_s32(a, vdup_lane_s32::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_s32)"] @@ -31044,7 +31034,7 @@ pub fn vmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t )] pub fn vmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmull_s32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + vmull_s32(a, vdup_laneq_s32::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_u16)"] @@ -31067,12 +31057,7 @@ pub fn vmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t )] pub fn vmull_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - vmull_u16( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmull_u16(a, vdup_lane_u16::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_u16)"] @@ -31095,12 +31080,7 @@ pub fn vmull_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4 )] pub fn vmull_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - vmull_u16( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + vmull_u16(a, vdup_laneq_u16::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_lane_u32)"] @@ -31123,7 +31103,7 @@ pub fn vmull_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint32x )] pub fn vmull_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmull_u32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + vmull_u32(a, vdup_lane_u32::(b)) } #[doc = "Vector long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_laneq_u32)"] @@ -31146,7 +31126,7 @@ pub fn vmull_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2 )] pub fn vmull_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmull_u32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + vmull_u32(a, vdup_laneq_u32::(b)) } #[doc = "Vector long multiply with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_n_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 074e2af492..35116dd3a0 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -2675,7 +2675,7 @@ intrinsics: - [uint16x4_t, uint32x4_t, uint16x8_t] - [uint32x2_t, uint64x2_t, uint32x4_t] compose: - - FnCall: + - FnCall: - 'vcombine_{neon_type[0]}' - - a - FnCall: ['simd_cast', [b]] @@ -5035,47 +5035,35 @@ intrinsics: - name: "vmull_high{neon_type[0].noq}" doc: Signed multiply long arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[3]}" + return_type: "{neon_type[1]}" attr: - *neon-stable - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [smull2]]}]] safety: safe types: - - [int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int16x8_t] - - [int16x8_t, int16x4_t, '[4, 5, 6, 7]', int32x4_t] - - [int32x4_t, int32x2_t, '[2, 3]', int64x2_t] + - [int8x16_t, int16x8_t] + - [int16x8_t, int32x4_t] + - [int32x4_t, int64x2_t] compose: - - Let: - - a - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] - - Let: - - b - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[0]}', [b]]}] - FnCall: ["vmull_{neon_type[0]}", [a, b]] - name: "vmull_high{neon_type[0].noq}" doc: "Unsigned multiply long" arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[3]}" + return_type: "{neon_type[1]}" attr: - *neon-stable - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [umull2]]}]] safety: safe types: - - [uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint16x8_t] - - [uint16x8_t, uint16x4_t, '[4, 5, 6, 7]', uint32x4_t] - - [uint32x4_t, uint32x2_t, '[2, 3]', uint64x2_t] + - [uint8x16_t, uint16x8_t] + - [uint16x8_t, uint32x4_t] + - [uint32x4_t, uint64x2_t] compose: - - Let: - - a - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] - - Let: - - b - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[0]}', [b]]}] - FnCall: ["vmull_{neon_type[0]}", [a, b]] - name: "vmull_p64" @@ -5101,22 +5089,16 @@ intrinsics: - name: "vmull_high{neon_type[0].noq}" doc: "Polynomial multiply long" arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"] - return_type: "{neon_type[3]}" + return_type: "{neon_type[1]}" attr: - *neon-stable - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [pmull2]]}]] safety: safe types: - - [poly8x16_t, poly8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', poly16x8_t] + - [poly8x16_t, poly16x8_t] compose: - - Let: - - a - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] - - Let: - - b - - "{neon_type[1]}" - - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[0]}', [b]]}] - FnCall: ["vmull_{neon_type[0]}", [a, b]] - name: "vmull_high{neon_type[0].noq}" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index a56c6079dc..47ec2d0956 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -11357,16 +11357,16 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [int16x4_t, int16x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int16x4_t, int16x8_t, int32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int32x2_t, int32x2_t, int64x2_t, '1', '[LANE as u32, LANE as u32]'] - - [int32x2_t, int32x4_t, int64x2_t, '2', '[LANE as u32, LANE as u32]'] + - [int16x4_t, int16x4_t, int32x4_t, '2'] + - [int16x4_t, int16x8_t, int32x4_t, '3'] + - [int32x2_t, int32x2_t, int64x2_t, '1'] + - [int32x2_t, int32x4_t, int64x2_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmull_{neon_type[0]}" - - a - - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + - FnCall: ['vdup_lane{neon_type[1].nox}', [b], [LANE]] - name: "vmull_lane{neon_type[1].no}" doc: "Vector long multiply by scalar" @@ -11382,16 +11382,16 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [uint16x4_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint16x4_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint32x2_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32]'] - - [uint32x2_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32]'] + - [uint16x4_t, uint16x4_t, uint32x4_t, '2'] + - [uint16x4_t, uint16x8_t, uint32x4_t, '3'] + - [uint32x2_t, uint32x2_t, uint64x2_t, '1'] + - [uint32x2_t, uint32x4_t, uint64x2_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - FnCall: - "vmull_{neon_type[0]}" - - a - - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + - FnCall: ['vdup_lane{neon_type[1].nox}', [b], [LANE]] - name: "vfms{neon_type[0].N}" doc: "Floating-point fused Multiply-subtract to accumulator(vector)" From c44826108a10fd1bddf0b4afa538bd0e3a9bbb47 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:31:21 +0530 Subject: [PATCH 09/19] Change implementation of `vmulx_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 20 +++++------ .../spec/neon/aarch64.spec.yml | 36 +++++++------------ 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 7f53a0c628..b7db2f0c9a 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -13074,7 +13074,7 @@ pub fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[cfg(not(target_arch = "arm64ec"))] pub fn vmulx_lane_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulx_f16(a, simd_shuffle!(b, b, [LANE as u32; 4])) } + vmulx_f16(a, vdup_lane_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f16)"] @@ -13086,7 +13086,7 @@ pub fn vmulx_lane_f16(a: float16x4_t, b: float16x4_t) -> float1 #[cfg(not(target_arch = "arm64ec"))] pub fn vmulx_laneq_f16(a: float16x4_t, b: float16x8_t) -> float16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vmulx_f16(a, simd_shuffle!(b, b, [LANE as u32; 4])) } + vmulx_f16(a, vdup_laneq_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f16)"] @@ -13098,7 +13098,7 @@ pub fn vmulx_laneq_f16(a: float16x4_t, b: float16x8_t) -> float #[cfg(not(target_arch = "arm64ec"))] pub fn vmulxq_lane_f16(a: float16x8_t, b: float16x4_t) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulxq_f16(a, simd_shuffle!(b, b, [LANE as u32; 8])) } + vmulxq_f16(a, vdupq_lane_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f16)"] @@ -13110,7 +13110,7 @@ pub fn vmulxq_lane_f16(a: float16x8_t, b: float16x4_t) -> float #[cfg(not(target_arch = "arm64ec"))] pub fn vmulxq_laneq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { vmulxq_f16(a, simd_shuffle!(b, b, [LANE as u32; 8])) } + vmulxq_f16(a, vdupq_laneq_f16::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f32)"] @@ -13121,7 +13121,7 @@ pub fn vmulxq_laneq_f16(a: float16x8_t, b: float16x8_t) -> floa #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulx_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + vmulx_f32(a, vdup_lane_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f32)"] @@ -13132,7 +13132,7 @@ pub fn vmulx_lane_f32(a: float32x2_t, b: float32x2_t) -> float3 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulx_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + vmulx_f32(a, vdup_laneq_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f32)"] @@ -13143,7 +13143,7 @@ pub fn vmulx_laneq_f32(a: float32x2_t, b: float32x4_t) -> float #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulxq_f32(a, simd_shuffle!(b, b, [LANE as u32; 4])) } + vmulxq_f32(a, vdupq_lane_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f32)"] @@ -13154,7 +13154,7 @@ pub fn vmulxq_lane_f32(a: float32x4_t, b: float32x2_t) -> float #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { vmulxq_f32(a, simd_shuffle!(b, b, [LANE as u32; 4])) } + vmulxq_f32(a, vdupq_laneq_f32::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f64)"] @@ -13165,7 +13165,7 @@ pub fn vmulxq_laneq_f32(a: float32x4_t, b: float32x4_t) -> floa #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + vmulxq_f64(a, vdupq_laneq_f64::(b)) } #[doc = "Floating-point multiply extended"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f64)"] @@ -13335,7 +13335,7 @@ pub fn vmulxh_laneq_f16(a: f16, b: float16x8_t) -> f16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulxq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t { static_assert!(LANE == 0); - unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + vmulxq_f64(a, vdupq_lane_f64::(b)) } #[doc = "Negate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 35116dd3a0..ba248f4f07 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -5235,11 +5235,7 @@ intrinsics: - FnCall: - "vmulx{type[3]}" - - a - - FnCall: - - "simd_shuffle!" - - - b - - b - - "{type[4]}" + - FnCall: ['vdup{type[0]}', [b], [LANE]] - name: "vmulx{type[0]}" doc: Floating-point multiply extended @@ -5292,21 +5288,17 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - ['_lane_f32', float32x2_t, float32x2_t, '1', '_f32', '[LANE as u32; 2]'] - - ['_laneq_f32', float32x2_t, float32x4_t, '2', '_f32', '[LANE as u32; 2]'] - - ['q_lane_f32', float32x4_t, float32x2_t, '1', 'q_f32', '[LANE as u32; 4]'] - - ['q_laneq_f32', float32x4_t, float32x4_t, '2', 'q_f32', '[LANE as u32; 4]'] - - ['q_laneq_f64', float64x2_t, float64x2_t, '1', 'q_f64', '[LANE as u32; 2]'] + - ['_lane_f32', float32x2_t, float32x2_t, '1', '_f32'] + - ['_laneq_f32', float32x2_t, float32x4_t, '2', '_f32'] + - ['q_lane_f32', float32x4_t, float32x2_t, '1', 'q_f32'] + - ['q_laneq_f32', float32x4_t, float32x4_t, '2', 'q_f32'] + - ['q_laneq_f64', float64x2_t, float64x2_t, '1', 'q_f64'] compose: - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] - FnCall: - "vmulx{type[4]}" - - a - - FnCall: - - "simd_shuffle!" - - - b - - b - - "{type[5]}" + - FnCall: ['vdup{type[0]}', [b], [LANE]] - name: "vmulx{type[0]}" @@ -5322,20 +5314,16 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - ['_lane_f16', float16x4_t, float16x4_t, '2', '_f16', '[LANE as u32; 4]'] - - ['_laneq_f16', float16x4_t, float16x8_t, '3', '_f16', '[LANE as u32; 4]'] - - ['q_lane_f16', float16x8_t, float16x4_t, '2', 'q_f16', '[LANE as u32; 8]'] - - ['q_laneq_f16', float16x8_t, float16x8_t, '3', 'q_f16', '[LANE as u32; 8]'] + - ['_lane_f16', float16x4_t, float16x4_t, '2', '_f16'] + - ['_laneq_f16', float16x4_t, float16x8_t, '3', '_f16'] + - ['q_lane_f16', float16x8_t, float16x4_t, '2', 'q_f16'] + - ['q_laneq_f16', float16x8_t, float16x8_t, '3', 'q_f16'] compose: - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]] - FnCall: - "vmulx{type[4]}" - - a - - FnCall: - - "simd_shuffle!" - - - b - - b - - "{type[5]}" + - FnCall: ['vdup{type[0]}', [b], [LANE]] - name: "vmulx{type[0]}" From 781f23f437756120d75de4d7a9dfb97395f59cc5 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 04:39:21 +0530 Subject: [PATCH 10/19] Change implementation of `v{add,sub}{l,w}_high` --- .../core_arch/src/aarch64/neon/generated.rs | 84 +++++++------------ .../src/arm_shared/neon/generated.rs | 36 ++++---- .../spec/neon/aarch64.spec.yml | 62 +++++--------- .../spec/neon/arm_shared.spec.yml | 31 ++++--- 4 files changed, 85 insertions(+), 128 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index b7db2f0c9a..eb590b3af3 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -23299,11 +23299,9 @@ pub fn vsubh_f16(a: f16, b: f16) -> f16 { #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))] pub fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { unsafe { - let c: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let d: int16x8_t = simd_cast(c); - let e: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let f: int16x8_t = simd_cast(e); - simd_sub(d, f) + let c: int16x8_t = simd_cast(vget_high_s8(a)); + let d: int16x8_t = simd_cast(vget_high_s8(b)); + simd_sub(c, d) } } #[doc = "Signed Subtract Long"] @@ -23314,11 +23312,9 @@ pub fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))] pub fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { unsafe { - let c: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let d: int32x4_t = simd_cast(c); - let e: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let f: int32x4_t = simd_cast(e); - simd_sub(d, f) + let c: int32x4_t = simd_cast(vget_high_s16(a)); + let d: int32x4_t = simd_cast(vget_high_s16(b)); + simd_sub(c, d) } } #[doc = "Signed Subtract Long"] @@ -23329,11 +23325,9 @@ pub fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))] pub fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { unsafe { - let c: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let d: int64x2_t = simd_cast(c); - let e: int32x2_t = simd_shuffle!(b, b, [2, 3]); - let f: int64x2_t = simd_cast(e); - simd_sub(d, f) + let c: int64x2_t = simd_cast(vget_high_s32(a)); + let d: int64x2_t = simd_cast(vget_high_s32(b)); + simd_sub(c, d) } } #[doc = "Unsigned Subtract Long"] @@ -23344,11 +23338,9 @@ pub fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))] pub fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { unsafe { - let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let d: uint16x8_t = simd_cast(c); - let e: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - let f: uint16x8_t = simd_cast(e); - simd_sub(d, f) + let c: uint16x8_t = simd_cast(vget_high_u8(a)); + let d: uint16x8_t = simd_cast(vget_high_u8(b)); + simd_sub(c, d) } } #[doc = "Unsigned Subtract Long"] @@ -23359,11 +23351,9 @@ pub fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))] pub fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { unsafe { - let c: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let d: uint32x4_t = simd_cast(c); - let e: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - let f: uint32x4_t = simd_cast(e); - simd_sub(d, f) + let c: uint32x4_t = simd_cast(vget_high_u16(a)); + let d: uint32x4_t = simd_cast(vget_high_u16(b)); + simd_sub(c, d) } } #[doc = "Unsigned Subtract Long"] @@ -23374,11 +23364,9 @@ pub fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))] pub fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { unsafe { - let c: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - let d: uint64x2_t = simd_cast(c); - let e: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - let f: uint64x2_t = simd_cast(e); - simd_sub(d, f) + let c: uint64x2_t = simd_cast(vget_high_u32(a)); + let d: uint64x2_t = simd_cast(vget_high_u32(b)); + simd_sub(c, d) } } #[doc = "Signed Subtract Wide"] @@ -23388,10 +23376,8 @@ pub fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))] pub fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { - unsafe { - let c: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_s8(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Signed Subtract Wide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s16)"] @@ -23400,10 +23386,8 @@ pub fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))] pub fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { - unsafe { - let c: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_s16(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Signed Subtract Wide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s32)"] @@ -23412,10 +23396,8 @@ pub fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))] pub fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { - unsafe { - let c: int32x2_t = simd_shuffle!(b, b, [2, 3]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_s32(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Unsigned Subtract Wide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u8)"] @@ -23424,10 +23406,8 @@ pub fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))] pub fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { - unsafe { - let c: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_u8(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Unsigned Subtract Wide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u16)"] @@ -23436,10 +23416,8 @@ pub fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))] pub fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { - unsafe { - let c: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_u16(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Unsigned Subtract Wide"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u32)"] @@ -23448,10 +23426,8 @@ pub fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))] pub fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { - unsafe { - let c: uint32x2_t = simd_shuffle!(b, b, [2, 3]); - simd_sub(a, simd_cast(c)) - } + let c = vget_high_u32(b); + unsafe { simd_sub(a, simd_cast(c)) } } #[doc = "Table look-up"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_s8)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 111397752b..6d54e84f25 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -2499,9 +2499,9 @@ pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = vget_high_s16(a); + let b: int16x4_t = vget_high_s16(b); unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); let a: int32x4_t = simd_cast(a); let b: int32x4_t = simd_cast(b); simd_add(a, b) @@ -2530,9 +2530,9 @@ pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = vget_high_s32(a); + let b: int32x2_t = vget_high_s32(b); unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); let a: int64x2_t = simd_cast(a); let b: int64x2_t = simd_cast(b); simd_add(a, b) @@ -2561,9 +2561,9 @@ pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = vget_high_s8(a); + let b: int8x8_t = vget_high_s8(b); unsafe { - let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); let a: int16x8_t = simd_cast(a); let b: int16x8_t = simd_cast(b); simd_add(a, b) @@ -2592,9 +2592,9 @@ pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = vget_high_u16(a); + let b: uint16x4_t = vget_high_u16(b); unsafe { - let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); let a: uint32x4_t = simd_cast(a); let b: uint32x4_t = simd_cast(b); simd_add(a, b) @@ -2623,9 +2623,9 @@ pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = vget_high_u32(a); + let b: uint32x2_t = vget_high_u32(b); unsafe { - let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); let a: uint64x2_t = simd_cast(a); let b: uint64x2_t = simd_cast(b); simd_add(a, b) @@ -2654,9 +2654,9 @@ pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = vget_high_u8(a); + let b: uint8x8_t = vget_high_u8(b); unsafe { - let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); let a: uint16x8_t = simd_cast(a); let b: uint16x8_t = simd_cast(b); simd_add(a, b) @@ -2856,8 +2856,8 @@ pub fn vaddq_p128(a: p128, b: p128) -> p128 { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b = vget_high_s16(b); unsafe { - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); let b: int32x4_t = simd_cast(b); simd_add(a, b) } @@ -2885,8 +2885,8 @@ pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b = vget_high_s32(b); unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); let b: int64x2_t = simd_cast(b); simd_add(a, b) } @@ -2914,8 +2914,8 @@ pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + let b = vget_high_s8(b); unsafe { - let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); let b: int16x8_t = simd_cast(b); simd_add(a, b) } @@ -2943,8 +2943,8 @@ pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b = vget_high_u16(b); unsafe { - let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); let b: uint32x4_t = simd_cast(b); simd_add(a, b) } @@ -2972,8 +2972,8 @@ pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b = vget_high_u32(b); unsafe { - let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]); let b: uint64x2_t = simd_cast(b); simd_add(a, b) } @@ -3001,8 +3001,8 @@ pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b = vget_high_u8(b); unsafe { - let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); let b: uint16x8_t = simd_cast(b); simd_add(a, b) } diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index ba248f4f07..a6208fac7c 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -5667,14 +5667,13 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [ssubw2]]}]] safety: safe types: - - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]'] - - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]'] + - [int16x8_t, int8x16_t] + - [int32x4_t, int16x8_t] + - [int64x2_t, int32x4_t] compose: - Let: - c - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: ['vget_high_{neon_type[1]}', [b]] - FnCall: - simd_sub - - a @@ -5689,14 +5688,13 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [usubw2]]}]] safety: safe types: - - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]'] - - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]'] + - [uint16x8_t, uint8x16_t] + - [uint32x4_t, uint16x8_t] + - [uint64x2_t, uint32x4_t] compose: - Let: - c - - "{neon_type[2]}" - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: ['vget_high_{neon_type[1]}', [b]] - FnCall: - simd_sub - - a @@ -5711,27 +5709,19 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [ssubl2]]}]] safety: safe types: - - [int8x16_t, int16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t] - - [int16x8_t, int32x4_t, '[4, 5, 6, 7]', int16x4_t] - - [int32x4_t, int64x2_t, '[2, 3]', int32x2_t] + - [int8x16_t, int16x8_t] + - [int32x4_t, int64x2_t] + - [int16x8_t, int32x4_t] compose: - Let: - c - - "{neon_type[3]}" - - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] - - Let: - - d - "{neon_type[1]}" - - FnCall: [simd_cast, [c]] - - Let: - - e - - "{neon_type[3]}" - - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [a]]}]] - Let: - - f + - d - "{neon_type[1]}" - - FnCall: [simd_cast, [e]] - - FnCall: [simd_sub, [d, f]] + - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [b]]}]] + - FnCall: [simd_sub, [c, d]] - name: "vsubl_high{neon_type[0].noq}" doc: "Unsigned Subtract Long" @@ -5742,27 +5732,19 @@ intrinsics: - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [usubl2]]}]] safety: safe types: - - [uint8x16_t, uint16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint8x8_t] - - [uint16x8_t, uint32x4_t, '[4, 5, 6, 7]', uint16x4_t] - - [uint32x4_t, uint64x2_t, '[2, 3]', uint32x2_t] + - [uint8x16_t, uint16x8_t] + - [uint16x8_t, uint32x4_t] + - [uint32x4_t, uint64x2_t] compose: - Let: - c - - "{neon_type[3]}" - - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]] - - Let: - - d - "{neon_type[1]}" - - FnCall: [simd_cast, [c]] - - Let: - - e - - "{neon_type[3]}" - - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]] + - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [a]]}]] - Let: - - f + - d - "{neon_type[1]}" - - FnCall: [simd_cast, [e]] - - FnCall: [simd_sub, [d, f]] + - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [b]]}]] + - FnCall: [simd_sub, [c, d]] - name: "vbcax{neon_type.no}" doc: Bit clear and exclusive OR diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 47ec2d0956..4024e624ca 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -14478,21 +14478,21 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - ['vaddl_high_s8', 'int8x16_t', 'int16x8_t', 'vaddl', 'saddl2', 'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddl_high_s16', 'int16x8_t', 'int32x4_t', 'vaddl', 'saddl2', 'int16x4_t', '[4, 5, 6, 7]'] - - ['vaddl_high_s32', 'int32x4_t', 'int64x2_t', 'vaddl', 'saddl2', 'int32x2_t', '[2, 3]'] - - ['vaddl_high_u8', 'uint8x16_t', 'uint16x8_t', 'vaddl', 'uaddl2', 'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddl_high_u16', 'uint16x8_t', 'uint32x4_t', 'vaddl', 'uaddl2', 'uint16x4_t', '[4, 5, 6, 7]'] - - ['vaddl_high_u32', 'uint32x4_t', 'uint64x2_t', 'vaddl', 'uaddl2', 'uint32x2_t', '[2, 3]'] + - ['vaddl_high_s8', 'int8x16_t', 'int16x8_t', 'vaddl', 'saddl2', 'int8x8_t'] + - ['vaddl_high_s16', 'int16x8_t', 'int32x4_t', 'vaddl', 'saddl2', 'int16x4_t'] + - ['vaddl_high_s32', 'int32x4_t', 'int64x2_t', 'vaddl', 'saddl2', 'int32x2_t'] + - ['vaddl_high_u8', 'uint8x16_t', 'uint16x8_t', 'vaddl', 'uaddl2', 'uint8x8_t'] + - ['vaddl_high_u16', 'uint16x8_t', 'uint32x4_t', 'vaddl', 'uaddl2', 'uint16x4_t'] + - ['vaddl_high_u32', 'uint32x4_t', 'uint64x2_t', 'vaddl', 'uaddl2', 'uint32x2_t'] compose: - Let: - a - '{neon_type[5]}' - - FnCall: ['simd_shuffle!', [a, a, '{type[6]}']] + - FnCall: ['vget_high_{neon_type[1]}', [a]] - Let: - b - '{neon_type[5]}' - - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']] + - FnCall: ['vget_high_{neon_type[1]}', [b]] - Let: [a, '{neon_type[2]}', {FnCall: [simd_cast, [a]]}] - Let: [b, '{neon_type[2]}', {FnCall: [simd_cast, [b]]}] - FnCall: [simd_add, [a, b]] @@ -14534,17 +14534,16 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - ['vaddw_high_s8', 'int16x8_t', 'int8x16_t', 'vaddw', 'saddw2', 'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddw_high_s16', 'int32x4_t', 'int16x8_t', 'vaddw', 'saddw2', 'int16x4_t', '[4, 5, 6, 7]'] - - ['vaddw_high_s32', 'int64x2_t', 'int32x4_t', 'vaddw', 'saddw2', 'int32x2_t', '[2, 3]'] - - ['vaddw_high_u8', 'uint16x8_t', 'uint8x16_t', 'vaddw', 'uaddw2', 'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddw_high_u16', 'uint32x4_t', 'uint16x8_t', 'vaddw', 'uaddw2', 'uint16x4_t', '[4, 5, 6, 7]'] - - ['vaddw_high_u32', 'uint64x2_t', 'uint32x4_t', 'vaddw', 'uaddw2', 'uint32x2_t', '[2, 3]'] + - ['vaddw_high_s8', 'int16x8_t', 'int8x16_t', 'vaddw', 'saddw2', 'int8x8_t'] + - ['vaddw_high_s16', 'int32x4_t', 'int16x8_t', 'vaddw', 'saddw2', 'int16x4_t'] + - ['vaddw_high_s32', 'int64x2_t', 'int32x4_t', 'vaddw', 'saddw2', 'int32x2_t'] + - ['vaddw_high_u8', 'uint16x8_t', 'uint8x16_t', 'vaddw', 'uaddw2', 'uint8x8_t'] + - ['vaddw_high_u16', 'uint32x4_t', 'uint16x8_t', 'vaddw', 'uaddw2', 'uint16x4_t'] + - ['vaddw_high_u32', 'uint64x2_t', 'uint32x4_t', 'vaddw', 'uaddw2', 'uint32x2_t'] compose: - Let: - b - - '{neon_type[5]}' - - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']] + - FnCall: ['vget_high_{neon_type[2]}', [b]] - Let: - b - '{neon_type[1]}' From a8039144205f1042db8a96bfda08cb5c02fc75b2 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 05:03:26 +0530 Subject: [PATCH 11/19] Change implementation of `vqdmull{_high}{_lane}` and `vqrdmulh_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 76 ++++++---------- .../src/arm_shared/neon/generated.rs | 90 +++++-------------- .../spec/neon/aarch64.spec.yml | 46 +++++----- .../spec/neon/arm_shared.spec.yml | 24 ++--- 4 files changed, 83 insertions(+), 153 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index eb590b3af3..2c49925431 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -14837,11 +14837,9 @@ pub fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_lane_s16(a: int16x8_t, b: int16x4_t) -> int32x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); - vqdmull_s16(a, b) - } + let a = vget_high_s16(a); + let b = vdup_lane_s16::(b); + vqdmull_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_laneq_s32)"] @@ -14852,11 +14850,9 @@ pub fn vqdmull_high_lane_s16(a: int16x8_t, b: int16x4_t) -> int32x #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_laneq_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { static_assert_uimm_bits!(N, 2); - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]); - vqdmull_s32(a, b) - } + let a = vget_high_s32(a); + let b = vdup_laneq_s32::(b); + vqdmull_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_lane_s32)"] @@ -14867,11 +14863,9 @@ pub fn vqdmull_high_laneq_s32(a: int32x4_t, b: int32x4_t) -> int64 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_lane_s32(a: int32x4_t, b: int32x2_t) -> int64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]); - vqdmull_s32(a, b) - } + let a = vget_high_s32(a); + let b = vdup_lane_s32::(b); + vqdmull_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_laneq_s16)"] @@ -14882,11 +14876,9 @@ pub fn vqdmull_high_lane_s32(a: int32x4_t, b: int32x2_t) -> int64x #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_laneq_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { static_assert_uimm_bits!(N, 3); - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); - vqdmull_s16(a, b) - } + let a = vget_high_s16(a); + let b = vdup_laneq_s16::(b); + vqdmull_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_n_s16)"] @@ -14895,11 +14887,9 @@ pub fn vqdmull_high_laneq_s16(a: int16x8_t, b: int16x8_t) -> int32 #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqdmull2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t { - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = vdup_n_s16(b); - vqdmull_s16(a, b) - } + let a = vget_high_s16(a); + let b = vdup_n_s16(b); + vqdmull_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_n_s32)"] @@ -14908,11 +14898,9 @@ pub fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqdmull2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t { - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = vdup_n_s32(b); - vqdmull_s32(a, b) - } + let a = vget_high_s32(a); + let b = vdup_n_s32(b); + vqdmull_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_s16)"] @@ -14921,11 +14909,9 @@ pub fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqdmull2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { - unsafe { - let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]); - vqdmull_s16(a, b) - } + let a = vget_high_s16(a); + let b = vget_high_s16(b); + vqdmull_s16(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_high_s32)"] @@ -14934,11 +14920,9 @@ pub fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqdmull2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { - unsafe { - let a: int32x2_t = simd_shuffle!(a, a, [2, 3]); - let b: int32x2_t = simd_shuffle!(b, b, [2, 3]); - vqdmull_s32(a, b) - } + let a = vget_high_s32(a); + let b = vget_high_s32(b); + vqdmull_s32(a, b) } #[doc = "Vector saturating doubling long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_laneq_s16)"] @@ -14949,10 +14933,8 @@ pub fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t { static_assert_uimm_bits!(N, 3); - unsafe { - let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]); - vqdmull_s16(a, b) - } + let b = vdup_laneq_s16::(b); + vqdmull_s16(a, b) } #[doc = "Vector saturating doubling long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_laneq_s32)"] @@ -14963,10 +14945,8 @@ pub fn vqdmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqdmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t { static_assert_uimm_bits!(N, 2); - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]); - vqdmull_s32(a, b) - } + let b = vdup_laneq_s32::(b); + vqdmull_s32(a, b) } #[doc = "Signed saturating doubling multiply long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmullh_lane_s16)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 6d54e84f25..3f5b7073e2 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -35001,10 +35001,8 @@ pub fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { )] pub fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { static_assert_uimm_bits!(N, 2); - unsafe { - let b: int16x4_t = simd_shuffle!(b, b, [N as u32; 4]); - vqdmull_s16(a, b) - } + let b = vdup_lane_s16::(b); + vqdmull_s16(a, b) } #[doc = "Vector saturating doubling long multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_lane_s32)"] @@ -35027,10 +35025,8 @@ pub fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { )] pub fn vqdmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { static_assert_uimm_bits!(N, 1); - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [N as u32; 2]); - vqdmull_s32(a, b) - } + let b = vdup_lane_s32::(b); + vqdmull_s32(a, b) } #[doc = "Vector saturating doubling long multiply with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqdmull_n_s16)"] @@ -35588,11 +35584,8 @@ pub fn vqnegq_s32(a: int32x4_t) -> int32x4_t { )] pub fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: int16x4_t = - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulh_s16(a, b) - } + let b = vdup_lane_s16::(b); + vqrdmulh_s16(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_lane_s32)"] @@ -35615,10 +35608,8 @@ pub fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4 )] pub fn vqrdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [LANE as u32, LANE as u32]); - vqrdmulh_s32(a, b) - } + let b = vdup_lane_s32::(b); + vqrdmulh_s32(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_laneq_s16)"] @@ -35641,11 +35632,8 @@ pub fn vqrdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2 )] pub fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let b: int16x4_t = - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulh_s16(a, b) - } + let b = vdup_laneq_s16::(b); + vqrdmulh_s16(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_laneq_s32)"] @@ -35668,10 +35656,8 @@ pub fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x )] pub fn vqrdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: int32x2_t = simd_shuffle!(b, b, [LANE as u32, LANE as u32]); - vqrdmulh_s32(a, b) - } + let b = vdup_laneq_s32::(b); + vqrdmulh_s32(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_lane_s16)"] @@ -35694,23 +35680,8 @@ pub fn vqrdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x )] pub fn vqrdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: int16x8_t = simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ); - vqrdmulhq_s16(a, b) - } + let b = vdupq_lane_s16::(b); + vqrdmulhq_s16(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_lane_s32)"] @@ -35733,11 +35704,8 @@ pub fn vqrdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x )] pub fn vqrdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let b: int32x4_t = - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s32(a, b) - } + let b = vdupq_lane_s32::(b); + vqrdmulhq_s32(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_laneq_s16)"] @@ -35760,23 +35728,8 @@ pub fn vqrdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x )] pub fn vqrdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let b: int16x8_t = simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ); - vqrdmulhq_s16(a, b) - } + let b = vdupq_laneq_s16::(b); + vqrdmulhq_s16(a, b) } #[doc = "Vector rounding saturating doubling multiply high by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulhq_laneq_s32)"] @@ -35799,11 +35752,8 @@ pub fn vqrdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16 )] pub fn vqrdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let b: int32x4_t = - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vqrdmulhq_s32(a, b) - } + let b = vdupq_laneq_s32::(b); + vqrdmulhq_s32(a, b) } #[doc = "Vector saturating rounding doubling multiply high with scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmulh_n_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index a6208fac7c..11ba7c955d 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -6963,11 +6963,11 @@ intrinsics: - *neon-stable safety: safe types: - - [int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]'] - - [int32x4_t, int64x2_t, int32x2_t, '[2, 3]'] + - [int16x8_t, int32x4_t] + - [int32x4_t, int64x2_t] compose: - - Let: [a, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, '{type[3]}']]}] - - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [b, b, '{type[3]}']]}] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vget_high_{neon_type[0]}', [b]]}] - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] - name: "vqdmull_high_n_{type[1]}" @@ -6979,11 +6979,11 @@ intrinsics: - *neon-stable safety: safe types: - - [int16x8_t, "i16", int32x4_t, int16x4_t, '[4, 5, 6, 7]'] - - [int32x4_t, "i32", int64x2_t, int32x2_t, '[2, 3]'] + - [int16x8_t, "i16", int32x4_t] + - [int32x4_t, "i32", int64x2_t] compose: - - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - Let: [b, "{neon_type[3]}", {FnCall: ["vdup_n{neon_type[0].noq}", [b]]}] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ["vdup_n{neon_type[0].noq}", [b]]}] - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] - name: "vqdmull{type[3]}" @@ -7038,7 +7038,7 @@ intrinsics: - Let: [b, "{type[0]}", {FnCall: ['vget{neon_type[1].lane_nox}', [b], [N]]}] - FnCall: ["vqdmulls_s32", [a, b]] - - name: "vqdmull{type[6]}" + - name: "vqdmull{type[3]}" doc: "Signed saturating doubling multiply long" arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] return_type: "{neon_type[2]}" @@ -7049,12 +7049,12 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int16x8_t, int16x4_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]', '[N as u32, N as u32, N as u32, N as u32]', '_high_lane_s16'] - - [int32x4_t, int32x4_t, int64x2_t, int32x2_t, '[2, 3]', '[N as u32, N as u32]', '_high_laneq_s32'] + - [int16x8_t, int16x4_t, int32x4_t, '_high_lane_s16'] + - [int32x4_t, int32x4_t, int64x2_t, '_high_laneq_s32'] compose: - FnCall: [static_assert_uimm_bits!, [N, '2']] - - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vdup_lane{neon_type[1].nox}', [b], [N]]}] - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] - name: "vqdmull_high_lane_s32" @@ -7068,11 +7068,11 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int32x4_t, int32x2_t, int64x2_t, int32x2_t, '[2, 3]', '[N as u32, N as u32]'] + - [int32x4_t, int32x2_t, int64x2_t] compose: - FnCall: [static_assert_uimm_bits!, [N, '1']] - - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vdup_lane{neon_type[1].nox}', [b], [N]]}] - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] - name: "vqdmull_high_laneq_s16" @@ -7086,11 +7086,11 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int16x8_t, int16x8_t, int32x4_t, int16x4_t, '[4, 5, 6, 7]', '[N as u32, N as u32, N as u32, N as u32]'] + - [int16x8_t, int16x8_t, int32x4_t] compose: - FnCall: [static_assert_uimm_bits!, [N, '3']] - - Let: [a, "{neon_type[3]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - Let: [b, "{neon_type[3]}", {FnCall: [simd_shuffle!, [b, b, "{type[5]}"]]}] + - Let: [a, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - Let: [b, {FnCall: ['vdup_lane{neon_type[1].nox}', [b], [N]]}] - FnCall: ["vqdmull{neon_type[0].noq}", [a, b]] - name: "vqdmull_laneq_s16" @@ -7104,10 +7104,10 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int16x4_t, int16x8_t, int32x4_t, '[N as u32, N as u32, N as u32, N as u32]'] + - [int16x4_t, int16x8_t, int32x4_t] compose: - FnCall: [static_assert_uimm_bits!, [N, '3']] - - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [b, {FnCall: ['vdup_lane{neon_type[1].nox}', [b], [N]]}] - FnCall: [vqdmull_s16, [a, b]] - name: "vqdmull_laneq_s32" @@ -7121,10 +7121,10 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int32x2_t, int32x4_t, int64x2_t, '[N as u32, N as u32]'] + - [int32x2_t, int32x4_t, int64x2_t] compose: - FnCall: [static_assert_uimm_bits!, [N, '2']] - - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [b, {FnCall: ['vdup_lane{neon_type[1].nox}', [b], [N]]}] - FnCall: [vqdmull_s32, [a, b]] - name: "vqdmlal{type[4]}" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 4024e624ca..e646d22369 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -7590,7 +7590,7 @@ intrinsics: - [int16x4_t, int16x4_t, int32x4_t, '[N as u32; 4]'] compose: - FnCall: [static_assert_uimm_bits!, [N, '2']] - - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [b, {FnCall: ['vdup{neon_type[0].lane_nox}', [b], [N]]}] - FnCall: [vqdmull_s16, [a, b]] - name: "vqdmull_lane_s32" @@ -7607,10 +7607,10 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int32x2_t, int32x2_t, int64x2_t, '[N as u32; 2]'] + - [int32x2_t, int32x2_t, int64x2_t] compose: - FnCall: [static_assert_uimm_bits!, [N, '1']] - - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}] + - Let: [b, {FnCall: ['vdup{neon_type[0].lane_nox}', [b], [N]]}] - FnCall: [vqdmull_s32, [a, b]] - name: "vqdmlal{neon_type[1].noq}" @@ -11230,17 +11230,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s16, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_laneq_s16, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_lane_s16, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [_lane_s32, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [_laneq_s32, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [q_lane_s32, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [q_laneq_s32, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [_lane_s16, int16x4_t, int16x4_t, '2'] + - [_laneq_s16, int16x4_t, int16x8_t, '3'] + - [q_lane_s16, int16x8_t, int16x4_t, '2'] + - [q_laneq_s16, int16x8_t, int16x8_t, '3'] + - [_lane_s32, int32x2_t, int32x2_t, '1'] + - [_laneq_s32, int32x2_t, int32x4_t, '2'] + - [q_lane_s32, int32x4_t, int32x2_t, '1'] + - [q_laneq_s32, int32x4_t, int32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]] - - Let: [b, "{neon_type[1]}", {FnCall: [simd_shuffle!, [b, b, '{type[4]}']]}] + - Let: [b, {FnCall: ['vdup{type[0]}', [b], [LANE]]}] - FnCall: ["vqrdmulh{neon_type[1].no}", [a, b]] - name: "vqrdmulh{neon_type[0].N}" From e1217f2ea63640bb32f8a392b6d5c7bac432c7b4 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 05:11:39 +0530 Subject: [PATCH 12/19] Change implementation of `vqmov{u}n_high` --- .../core_arch/src/aarch64/neon/generated.rs | 36 +++++-------------- .../spec/neon/aarch64.spec.yml | 24 ++++++------- 2 files changed, 21 insertions(+), 39 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 2c49925431..60a9e99a3b 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -15030,13 +15030,7 @@ pub fn vqdmulls_s32(a: i32, b: i32) -> i64 { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { - unsafe { - simd_shuffle!( - a, - vqmovn_s16(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_s8(a, vqmovn_s16(b)) } #[doc = "Signed saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_s32)"] @@ -15045,7 +15039,7 @@ pub fn vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { - unsafe { simd_shuffle!(a, vqmovn_s32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vqmovn_s32(b)) } #[doc = "Signed saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_s64)"] @@ -15054,7 +15048,7 @@ pub fn vqmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { - unsafe { simd_shuffle!(a, vqmovn_s64(b), [0, 1, 2, 3]) } + vcombine_s32(a, vqmovn_s64(b)) } #[doc = "Signed saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u16)"] @@ -15063,13 +15057,7 @@ pub fn vqmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(uqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { - unsafe { - simd_shuffle!( - a, - vqmovn_u16(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqmovn_u16(b)) } #[doc = "Signed saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u32)"] @@ -15078,7 +15066,7 @@ pub fn vqmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(uqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { - unsafe { simd_shuffle!(a, vqmovn_u32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqmovn_u32(b)) } #[doc = "Signed saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovn_high_u64)"] @@ -15087,7 +15075,7 @@ pub fn vqmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(uqxtn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { - unsafe { simd_shuffle!(a, vqmovn_u64(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqmovn_u64(b)) } #[doc = "Saturating extract narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovnd_s64)"] @@ -15164,13 +15152,7 @@ pub fn vqmovns_u32(a: u32) -> u16 { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtun2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovun_high_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { - unsafe { - simd_shuffle!( - a, - vqmovun_s16(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqmovun_s16(b)) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s32)"] @@ -15179,7 +15161,7 @@ pub fn vqmovun_high_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtun2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovun_high_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { - unsafe { simd_shuffle!(a, vqmovun_s32(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqmovun_s32(b)) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovun_high_s64)"] @@ -15188,7 +15170,7 @@ pub fn vqmovun_high_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { #[cfg_attr(all(test, target_endian = "little"), assert_instr(sqxtun2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { - unsafe { simd_shuffle!(a, vqmovun_s64(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqmovun_s64(b)) } #[doc = "Signed saturating extract unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqmovunh_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 11ba7c955d..cb20ff24d2 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -7416,11 +7416,11 @@ intrinsics: - *neon-stable safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int8x16_t] + - [int16x4_t, int32x4_t, int16x8_t] + - [int32x2_t, int64x2_t, int32x4_t] compose: - - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}, "{type[3]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}]] - name: "vqmovn_high{neon_type[1].noq}" doc: "Signed saturating extract narrow" @@ -7431,11 +7431,11 @@ intrinsics: - *neon-stable safety: safe types: - - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t] + - [uint16x4_t, uint32x4_t, uint16x8_t] + - [uint32x2_t, uint64x2_t, uint32x4_t] compose: - - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}, "{type[3]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vqmovn{neon_type[1].noq}", [b]]}]] - name: "vqmovn{type[2]}" doc: "Saturating extract narrow" @@ -7523,11 +7523,11 @@ intrinsics: - *neon-stable safety: safe types: - - [uint8x8_t, int16x8_t, uint8x16_t, s16, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, int32x4_t, uint16x8_t, s32, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, int64x2_t, uint32x4_t, s64, '[0, 1, 2, 3]'] + - [uint8x8_t, int16x8_t, uint8x16_t, s16] + - [uint16x4_t, int32x4_t, uint16x8_t, s32] + - [uint32x2_t, int64x2_t, uint32x4_t, s64] compose: - - FnCall: [simd_shuffle!, [a, {FnCall: ["vqmovun_{type[3]}", [b]]}, "{type[4]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vqmovun_{type[3]}", [b]]}]] - name: "vqrdmulh{type[1]}" doc: "Signed saturating rounding doubling multiply returning high half" From f1156d8f34d6f7ee8bf9fb7e58ceedef38640c0e Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 05:15:21 +0530 Subject: [PATCH 13/19] Change implementation of `vqrdml{a,s}h_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 96 +++++++------------ .../spec/neon/aarch64.spec.yml | 36 +++---- 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 60a9e99a3b..112c84036f 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -15276,10 +15276,8 @@ pub fn vqnegd_s64(a: i64) -> i64 { #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlah_s16(a, b, c) - } + let c = vdup_lane_s16::(c); + vqrdmlah_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_lane_s32)"] @@ -15290,10 +15288,8 @@ pub fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]); - vqrdmlah_s32(a, b, c) - } + let c = vdup_lane_s32::(c); + vqrdmlah_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_laneq_s16)"] @@ -15304,10 +15300,8 @@ pub fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlah_s16(a, b, c) - } + let c = vdup_laneq_s16::(c); + vqrdmlah_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_laneq_s32)"] @@ -15318,10 +15312,8 @@ pub fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]); - vqrdmlah_s32(a, b, c) - } + let c = vdup_laneq_s32::(c); + vqrdmlah_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_lane_s16)"] @@ -15332,10 +15324,8 @@ pub fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]); - vqrdmlahq_s16(a, b, c) - } + let c = vdupq_lane_s16::(c); + vqrdmlahq_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_lane_s32)"] @@ -15346,10 +15336,8 @@ pub fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlahq_s32(a, b, c) - } + let c = vdupq_lane_s32::(c); + vqrdmlahq_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_laneq_s16)"] @@ -15360,10 +15348,8 @@ pub fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]); - vqrdmlahq_s16(a, b, c) - } + let c = vdupq_laneq_s16::(c); + vqrdmlahq_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlahq_laneq_s32)"] @@ -15374,10 +15360,8 @@ pub fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlahq_s32(a, b, c) - } + let c = vdupq_laneq_s32::(c); + vqrdmlahq_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply accumulate returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlah_s16)"] @@ -15520,10 +15504,8 @@ pub fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 { #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlsh_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlsh_s16(a, b, c) - } + let c = vdup_lane_s16::(c); + vqrdmlsh_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_lane_s32)"] @@ -15534,10 +15516,8 @@ pub fn vqrdmlsh_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlsh_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]); - vqrdmlsh_s32(a, b, c) - } + let c = vdup_lane_s32::(c); + vqrdmlsh_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_laneq_s16)"] @@ -15548,10 +15528,8 @@ pub fn vqrdmlsh_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlsh_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlsh_s16(a, b, c) - } + let c = vdup_laneq_s16::(c); + vqrdmlsh_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_laneq_s32)"] @@ -15562,10 +15540,8 @@ pub fn vqrdmlsh_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlsh_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]); - vqrdmlsh_s32(a, b, c) - } + let c = vdup_laneq_s32::(c); + vqrdmlsh_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_lane_s16)"] @@ -15576,10 +15552,8 @@ pub fn vqrdmlsh_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]); - vqrdmlshq_s16(a, b, c) - } + let c = vdupq_lane_s16::(c); + vqrdmlshq_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_lane_s32)"] @@ -15590,10 +15564,8 @@ pub fn vqrdmlshq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlshq_s32(a, b, c) - } + let c = vdupq_lane_s32::(c); + vqrdmlshq_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_laneq_s16)"] @@ -15604,10 +15576,8 @@ pub fn vqrdmlshq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]); - vqrdmlshq_s16(a, b, c) - } + let c = vdupq_laneq_s16::(c); + vqrdmlshq_s16(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlshq_laneq_s32)"] @@ -15618,10 +15588,8 @@ pub fn vqrdmlshq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16 #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub fn vqrdmlshq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]); - vqrdmlshq_s32(a, b, c) - } + let c = vdupq_laneq_s32::(c); + vqrdmlshq_s32(a, b, c) } #[doc = "Signed saturating rounding doubling multiply subtract returning high half"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrdmlsh_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index cb20ff24d2..a805fbd205 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -7613,17 +7613,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32; 4]'] - - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32; 4]'] - - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]'] - - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]'] - - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32; 2]'] - - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32; 2]'] - - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]'] - - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]'] + - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2'] + - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3'] + - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2'] + - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3'] + - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1'] + - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2'] + - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1'] + - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] - - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}] + - Let: [c, {FnCall: ['vdup{type[0]}', [c], [LANE]]}] - FnCall: ["vqrdmlah{neon_type[2].no}", [a, b, c]] - name: "vqrdmlah{type[4]}" @@ -7697,17 +7697,17 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32; 4]'] - - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32; 4]'] - - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]'] - - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]'] - - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32; 2]'] - - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32; 2]'] - - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]'] - - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]'] + - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2'] + - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3'] + - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2'] + - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3'] + - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1'] + - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2'] + - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1'] + - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']] - - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}] + - Let: [c, {FnCall: ['vdup{type[0]}', [c], [LANE]]}] - FnCall: ["vqrdmlsh{neon_type[2].no}", [a, b, c]] - name: "vqrdmlsh{type[3]}" From 705c2c84cfe74fb9599c16bd3b5b698629be25e9 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 05:49:07 +0530 Subject: [PATCH 14/19] Change implementation of `v{q}{r}shr{u}n_high_n` --- .../core_arch/src/aarch64/neon/generated.rs | 156 +++++------------- .../spec/neon/aarch64.spec.yml | 111 ++++++------- 2 files changed, 94 insertions(+), 173 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 112c84036f..1cbac3e284 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -15902,13 +15902,7 @@ pub fn vqrshld_u64(a: u64, b: i64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqrshrn_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_s8(a, vqrshrn_n_s16::(b)) } #[doc = "Signed saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_s32)"] @@ -15919,7 +15913,7 @@ pub fn vqrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqrshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vqrshrn_n_s32::(b)) } #[doc = "Signed saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_s64)"] @@ -15930,7 +15924,7 @@ pub fn vqrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqrshrn_n_s64::(b), [0, 1, 2, 3]) } + vcombine_s32(a, vqrshrn_n_s64::(b)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u16)"] @@ -15941,13 +15935,7 @@ pub fn vqrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqrshrn_n_u16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqrshrn_n_u16::(b)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u32)"] @@ -15958,7 +15946,7 @@ pub fn vqrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqrshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqrshrn_n_u32::(b)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_high_n_u64)"] @@ -15969,7 +15957,7 @@ pub fn vqrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqrshrn_n_u64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqrshrn_n_u64::(b)) } #[doc = "Unsigned saturating rounded shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrnd_n_u64)"] @@ -16052,13 +16040,7 @@ pub fn vqrshrnd_n_s64(a: i64) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqrshrun_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqrshrun_n_s16::(b)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s32)"] @@ -16069,7 +16051,7 @@ pub fn vqrshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqrshrun_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqrshrun_n_s32::(b)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_high_n_s64)"] @@ -16080,7 +16062,7 @@ pub fn vqrshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqrshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqrshrun_n_s64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqrshrun_n_s64::(b)) } #[doc = "Signed saturating rounded shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrund_n_s64)"] @@ -16351,13 +16333,7 @@ pub fn vqshlus_n_s32(a: i32) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqshrn_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_s8(a, vqshrn_n_s16::(b)) } #[doc = "Signed saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s32)"] @@ -16368,7 +16344,7 @@ pub fn vqshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vqshrn_n_s32::(b)) } #[doc = "Signed saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_s64)"] @@ -16379,7 +16355,7 @@ pub fn vqshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqshrn_n_s64::(b), [0, 1, 2, 3]) } + vcombine_s32(a, vqshrn_n_s64::(b)) } #[doc = "Unsigned saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u16)"] @@ -16390,13 +16366,7 @@ pub fn vqshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqshrn_n_u16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqshrn_n_u16::(b)) } #[doc = "Unsigned saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u32)"] @@ -16407,7 +16377,7 @@ pub fn vqshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqshrn_n_u32::(b)) } #[doc = "Unsigned saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_high_n_u64)"] @@ -16418,7 +16388,7 @@ pub fn vqshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqshrn_n_u64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqshrn_n_u64::(b)) } #[doc = "Signed saturating shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrnd_n_s64)"] @@ -16509,13 +16479,7 @@ pub fn vqshrns_n_u32(a: u32) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vqshrun_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vqshrun_n_s16::(b)) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s32)"] @@ -16526,7 +16490,7 @@ pub fn vqshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vqshrun_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vqshrun_n_s32::(b)) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_high_n_s64)"] @@ -16537,7 +16501,7 @@ pub fn vqshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vqshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vqshrun_n_s64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vqshrun_n_s64::(b)) } #[doc = "Signed saturating shift right unsigned narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrund_n_s64)"] @@ -20128,13 +20092,7 @@ pub fn vrshrd_n_u64(a: u64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vrshrn_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_s8(a, vrshrn_n_s16::(b)) } #[doc = "Rounding shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_s32)"] @@ -20145,7 +20103,7 @@ pub fn vrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vrshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vrshrn_n_s32::(b)) } #[doc = "Rounding shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_s64)"] @@ -20156,7 +20114,7 @@ pub fn vrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vrshrn_n_s64::(b), [0, 1, 2, 3]) } + vcombine_s32(a, vrshrn_n_s64::(b)) } #[doc = "Rounding shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u16)"] @@ -20167,13 +20125,7 @@ pub fn vrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vrshrn_n_u16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vrshrn_n_u16::(b)) } #[doc = "Rounding shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u32)"] @@ -20184,7 +20136,7 @@ pub fn vrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vrshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vrshrn_n_u32::(b)) } #[doc = "Rounding shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_high_n_u64)"] @@ -20195,7 +20147,7 @@ pub fn vrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vrshrn_n_u64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vrshrn_n_u64::(b)) } #[doc = "Reciprocal square-root estimate."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsqrte_f64)"] @@ -20708,10 +20660,8 @@ pub fn vshld_u64(a: u64, b: i64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_s8(a: int8x16_t) -> int16x8_t { static_assert!(N >= 0 && N <= 8); - unsafe { - let b: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - vshll_n_s8::(b) - } + let b = vget_high_s8(a); + vshll_n_s8::(b) } #[doc = "Signed shift left long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_s16)"] @@ -20722,10 +20672,8 @@ pub fn vshll_high_n_s8(a: int8x16_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_s16(a: int16x8_t) -> int32x4_t { static_assert!(N >= 0 && N <= 16); - unsafe { - let b: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - vshll_n_s16::(b) - } + let b = vget_high_s16(a); + vshll_n_s16::(b) } #[doc = "Signed shift left long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_s32)"] @@ -20736,10 +20684,8 @@ pub fn vshll_high_n_s16(a: int16x8_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_s32(a: int32x4_t) -> int64x2_t { static_assert!(N >= 0 && N <= 32); - unsafe { - let b: int32x2_t = simd_shuffle!(a, a, [2, 3]); - vshll_n_s32::(b) - } + let b = vget_high_s32(a); + vshll_n_s32::(b) } #[doc = "Signed shift left long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u8)"] @@ -20750,10 +20696,8 @@ pub fn vshll_high_n_s32(a: int32x4_t) -> int64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_u8(a: uint8x16_t) -> uint16x8_t { static_assert!(N >= 0 && N <= 8); - unsafe { - let b: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - vshll_n_u8::(b) - } + let b: uint8x8_t = vget_high_u8(a); + vshll_n_u8::(b) } #[doc = "Signed shift left long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u16)"] @@ -20764,10 +20708,8 @@ pub fn vshll_high_n_u8(a: uint8x16_t) -> uint16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_u16(a: uint16x8_t) -> uint32x4_t { static_assert!(N >= 0 && N <= 16); - unsafe { - let b: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]); - vshll_n_u16::(b) - } + let b: uint16x4_t = vget_high_u16(a); + vshll_n_u16::(b) } #[doc = "Signed shift left long"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshll_high_n_u32)"] @@ -20778,10 +20720,8 @@ pub fn vshll_high_n_u16(a: uint16x8_t) -> uint32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshll_high_n_u32(a: uint32x4_t) -> uint64x2_t { static_assert!(N >= 0 && N <= 32); - unsafe { - let b: uint32x2_t = simd_shuffle!(a, a, [2, 3]); - vshll_n_u32::(b) - } + let b: uint32x2_t = vget_high_u32(a); + vshll_n_u32::(b) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s16)"] @@ -20792,13 +20732,7 @@ pub fn vshll_high_n_u32(a: uint32x4_t) -> uint64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vshrn_n_s16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_s8(a, vshrn_n_s16::(b)) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s32)"] @@ -20809,7 +20743,7 @@ pub fn vshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vshrn_n_s32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vshrn_n_s32::(b)) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_s64)"] @@ -20820,7 +20754,7 @@ pub fn vshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vshrn_n_s64::(b), [0, 1, 2, 3]) } + vcombine_s32(a, vshrn_n_s64::(b)) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u16)"] @@ -20831,13 +20765,7 @@ pub fn vshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - unsafe { - simd_shuffle!( - a, - vshrn_n_u16::(b), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - ) - } + vcombine_u8(a, vshrn_n_u16::(b)) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u32)"] @@ -20848,7 +20776,7 @@ pub fn vshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - unsafe { simd_shuffle!(a, vshrn_n_u32::(b), [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vshrn_n_u32::(b)) } #[doc = "Shift right narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vshrn_high_n_u64)"] @@ -20859,7 +20787,7 @@ pub fn vshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_ #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - unsafe { simd_shuffle!(a, vshrn_n_u64::(b), [0, 1, 2, 3]) } + vcombine_u32(a, vshrn_n_u64::(b)) } #[doc = "Shift Left and Insert (immediate)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsli_n_s8)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index a805fbd205..f6705033f9 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -7828,12 +7828,12 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, '_high_n_s16', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]', 'N >= 1 && N <= 8'] - - [int16x4_t, int32x4_t, int16x8_t, '_high_n_s32', '[0, 1, 2, 3, 4, 5, 6, 7]', 'N >= 1 && N <= 16'] - - [int32x2_t, int64x2_t, int32x4_t, '_high_n_s64', '[0, 1, 2, 3]', 'N >= 1 && N <= 32'] + - [int8x8_t, int16x8_t, int8x16_t, '_high_n_s16', 'N >= 1 && N <= 8'] + - [int16x4_t, int32x4_t, int16x8_t, '_high_n_s32', 'N >= 1 && N <= 16'] + - [int32x2_t, int64x2_t, int32x4_t, '_high_n_s64', 'N >= 1 && N <= 32'] compose: - - FnCall: [static_assert!, ["{type[5]}"]] - - FnCall: [simd_shuffle!, [a, {FnCall: ["vqrshrn_n{neon_type[1].noq}::", [b]]}, "{type[4]}"]] + - FnCall: [static_assert!, ["{type[4]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vqrshrn_n{neon_type[1].noq}::", [b]]}]] - name: "vqrshrn{type[0]}" doc: "Unsigned saturating rounded shift right narrow" @@ -7865,18 +7865,17 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ['{type[3]}']] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - FnCall: - "vqrshrn_n{neon_type[1].noq}::" - - b - - "{type[4]}" - name: "vqrshrun{type[0]}" doc: "Signed saturating rounded shift right unsigned narrow" @@ -7916,18 +7915,17 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8', s16, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16', s32, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32', s64, '[0, 1, 2, 3]'] + - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - FnCall: - - "vqrshrun_n_{type[4]}::" + - "vqrshrun_n_{neon_type[1]}::" - - b - - "{type[5]}" - name: "vqshld_{type}" doc: "Signed saturating shift left" @@ -8110,16 +8108,15 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [_high_n_s16, int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]', s16] - - [_high_n_s32, int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]', s32] - - [_high_n_s64, int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]', s64] + - [_high_n_s16, int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8'] + - [_high_n_s32, int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16'] + - [_high_n_s64, int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[4]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[1]}' - - a - - FnCall: ["vqshrn_n_{type[6]}::", [b]] - - "{type[5]}" + - FnCall: ["vqshrn_n_{neon_type[2]}::", [b]] - name: "vqshrnd_n_u64" doc: "Unsigned saturating shift right narrow" @@ -8178,16 +8175,15 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [_high_n_u16, uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [_high_n_u32, uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [_high_n_u64, uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [_high_n_u16, uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [_high_n_u32, uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [_high_n_u64, uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[4]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[1]}' - - a - FnCall: ["vqshrn_n_{neon_type[2]}::", [b]] - - "{type[5]}" - name: "vqshrun{type[0]}" doc: "Signed saturating shift right unsigned narrow" @@ -8223,16 +8219,15 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [uint8x8_t, int16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, int32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, int64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - FnCall: ["vqshrun_n_{neon_type[1]}::", [b]] - - "{type[4]}" - name: "vsqadd{type[0]}" doc: "Unsigned saturating accumulate of signed value" @@ -8699,19 +8694,18 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8'] + - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32'] + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - FnCall: ["vrshrn_n_{neon_type[1]}::", [b]] - - "{type[4]}" - name: "vrsubhn_high_{neon_type[1]}" doc: "Rounding subtract returning high narrow" @@ -9033,13 +9027,13 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x16_t, int16x8_t, int8x8_t, 'N >= 0 && N <= 8', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x8_t, int32x4_t, int16x4_t, 'N >= 0 && N <= 16', '[4, 5, 6, 7]'] - - [int32x4_t, int64x2_t, int32x2_t, 'N >= 0 && N <= 32', '[2, 3]'] + - [int8x16_t, int16x8_t, int8x8_t, 'N >= 0 && N <= 8'] + - [int16x8_t, int32x4_t, int16x4_t, 'N >= 0 && N <= 16'] + - [int32x4_t, int64x2_t, int32x2_t, 'N >= 0 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - FnCall: ["vshll_n_{neon_type[2]}::", [b]] + - Let: [b, {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - FnCall: ["vshll_n_{neon_type[2]}", [b], [N]] - name: "vshll_high_n_{neon_type[0]}" doc: "Signed shift left long" @@ -9052,13 +9046,13 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [uint8x16_t, uint16x8_t, uint8x8_t, 'N >= 0 && N <= 8', '[8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x8_t, uint32x4_t, uint16x4_t, 'N >= 0 && N <= 16', '[4, 5, 6, 7]'] - - [uint32x4_t, uint64x2_t, uint32x2_t, 'N >= 0 && N <= 32', '[2, 3]'] + - [uint8x16_t, uint16x8_t, uint8x8_t, 'N >= 0 && N <= 8'] + - [uint16x8_t, uint32x4_t, uint16x4_t, 'N >= 0 && N <= 16'] + - [uint32x4_t, uint64x2_t, uint32x2_t, 'N >= 0 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - - Let: [b, "{neon_type[2]}", {FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]}] - - FnCall: ["vshll_n_{neon_type[2]}::", [b]] + - Let: [b, "{neon_type[2]}", {FnCall: ['vget_high_{neon_type[0]}', [a]]}] + - FnCall: ["vshll_n_{neon_type[2]}", [b], [N]] - name: "vshrn_high_n_{neon_type[1]}" doc: "Shift right narrow" @@ -9071,19 +9065,18 @@ intrinsics: static_defs: ['const N: i32'] safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32', '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int8x16_t, 'N >= 1 && N <= 8'] + - [int16x4_t, int32x4_t, int16x8_t, 'N >= 1 && N <= 16'] + - [int32x2_t, int64x2_t, int32x4_t, 'N >= 1 && N <= 32'] + - [uint8x8_t, uint16x8_t, uint8x16_t, 'N >= 1 && N <= 8'] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'N >= 1 && N <= 16'] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'N >= 1 && N <= 32'] compose: - FnCall: [static_assert!, ["{type[3]}"]] - FnCall: - - simd_shuffle! + - 'vcombine_{neon_type[0]}' - - a - - FnCall: ["vshrn_n_{neon_type[1]}::", [b]] - - "{type[4]}" + - FnCall: ["vshrn_n_{neon_type[1]}", [b], [N]] - name: "vsm3partw1{neon_type.no}" doc: "SM3PARTW1" From 8c61739f310109f138688c2ea57ba5221b699450 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 05:53:35 +0530 Subject: [PATCH 15/19] Change implementation of `vr{add,sub}hn_high` --- .../core_arch/src/aarch64/neon/generated.rs | 36 +++++--------- .../src/arm_shared/neon/generated.rs | 48 +++++++++---------- .../spec/neon/aarch64.spec.yml | 36 ++++++-------- .../spec/neon/arm_shared.spec.yml | 45 +++++++++-------- 4 files changed, 72 insertions(+), 93 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 1cbac3e284..d09171a753 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -20343,8 +20343,7 @@ pub fn vrsrad_n_u64(a: u64, b: u64) -> u64 { #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { - let x: int8x8_t = vrsubhn_s16(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + vcombine_s8(a, vrsubhn_s16(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"] @@ -20354,8 +20353,7 @@ pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { - let x: int16x4_t = vrsubhn_s32(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vrsubhn_s32(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"] @@ -20365,8 +20363,7 @@ pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { - let x: int32x2_t = vrsubhn_s64(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } + vcombine_s32(a, vrsubhn_s64(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"] @@ -20376,8 +20373,7 @@ pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { - let x: uint8x8_t = vrsubhn_u16(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + vcombine_u8(a, vrsubhn_u16(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"] @@ -20387,8 +20383,7 @@ pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_ #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { - let x: uint16x4_t = vrsubhn_u32(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vrsubhn_u32(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"] @@ -20398,8 +20393,7 @@ pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8 #[cfg_attr(test, assert_instr(rsubhn2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { - let x: uint32x2_t = vrsubhn_u64(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } + vcombine_u32(a, vrsubhn_u64(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s16)"] @@ -20409,8 +20403,7 @@ pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4 #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { - let x: int8x8_t = vrsubhn_s16(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + vcombine_s8(a, vrsubhn_s16(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"] @@ -20420,8 +20413,7 @@ pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { - let x: int16x4_t = vrsubhn_s32(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, vrsubhn_s32(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"] @@ -20431,8 +20423,7 @@ pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { - let x: int32x2_t = vrsubhn_s64(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } + vcombine_s32(a, vrsubhn_s64(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"] @@ -20442,8 +20433,7 @@ pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { - let x: uint8x8_t = vrsubhn_u16(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + vcombine_u8(a, vrsubhn_u16(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"] @@ -20453,8 +20443,7 @@ pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_ #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { - let x: uint16x4_t = vrsubhn_u32(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_u16(a, vrsubhn_u32(b, c)) } #[doc = "Rounding subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"] @@ -20464,8 +20453,7 @@ pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8 #[cfg_attr(test, assert_instr(rsubhn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { - let x: uint32x2_t = vrsubhn_u64(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } + vcombine_u32(a, vrsubhn_u64(b, c)) } #[doc = "Multi-vector floating-point adjust exponent"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vscale_f16)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 3f5b7073e2..6c0dd2af94 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -2207,7 +2207,7 @@ pub fn vaddh_f16(a: f16, b: f16) -> f16 { pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); - simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + vcombine_s8(r, x) } } #[doc = "Add returning High Narrow (high half)."] @@ -2235,7 +2235,7 @@ pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); - simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) + vcombine_s16(r, x) } } #[doc = "Add returning High Narrow (high half)."] @@ -2263,7 +2263,7 @@ pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); - simd_shuffle!(r, x, [0, 1, 2, 3]) + vcombine_s32(r, x) } } #[doc = "Add returning High Narrow (high half)."] @@ -2291,7 +2291,7 @@ pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); - simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + vcombine_u8(r, x) } } #[doc = "Add returning High Narrow (high half)."] @@ -2319,7 +2319,7 @@ pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); - simd_shuffle!(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) + vcombine_u16(r, x) } } #[doc = "Add returning High Narrow (high half)."] @@ -2347,7 +2347,7 @@ pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_ pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { unsafe { let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); - simd_shuffle!(r, x, [0, 1, 2, 3]) + vcombine_u32(r, x) } } #[doc = "Add returning High Narrow."] @@ -38521,7 +38521,7 @@ pub fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { )] pub fn vraddhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { let x = vraddhn_s16(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + vcombine_s8(a, x) } #[doc = "Rounding Add returning High Narrow (high half)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_s32)"] @@ -38547,7 +38547,7 @@ pub fn vraddhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { )] pub fn vraddhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { let x = vraddhn_s32(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) } + vcombine_s16(a, x) } #[doc = "Rounding Add returning High Narrow (high half)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_s64)"] @@ -38573,7 +38573,7 @@ pub fn vraddhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { )] pub fn vraddhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { let x = vraddhn_s64(b, c); - unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) } + vcombine_s32(a, x) } #[doc = "Rounding Add returning High Narrow (high half)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vraddhn_high_u16)"] @@ -38600,7 +38600,7 @@ pub fn vraddhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { pub fn vraddhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { unsafe { let x: uint8x8_t = transmute(vraddhn_s16(transmute(b), transmute(c))); - simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + vcombine_u8(a, x) } } #[doc = "Rounding Add returning High Narrow (high half)."] @@ -38628,7 +38628,7 @@ pub fn vraddhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_ pub fn vraddhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { unsafe { let x: uint16x4_t = transmute(vraddhn_s32(transmute(b), transmute(c))); - simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) + vcombine_u16(a, x) } } #[doc = "Rounding Add returning High Narrow (high half)."] @@ -38656,7 +38656,7 @@ pub fn vraddhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8 pub fn vraddhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { unsafe { let x: uint32x2_t = transmute(vraddhn_s64(transmute(b), transmute(c))); - simd_shuffle!(a, x, [0, 1, 2, 3]) + vcombine_u32(a, x) } } #[doc = "Rounding Add returning High Narrow."] @@ -69225,8 +69225,8 @@ pub fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { - let d: int8x8_t = vsubhn_s16(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + let d = vsubhn_s16(b, c); + vcombine_s8(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_s32)"] @@ -69251,8 +69251,8 @@ pub fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { - let d: int16x4_t = vsubhn_s32(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } + let d = vsubhn_s32(b, c); + vcombine_s16(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_s64)"] @@ -69277,8 +69277,8 @@ pub fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { - let d: int32x2_t = vsubhn_s64(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3]) } + let d = vsubhn_s64(b, c); + vcombine_s32(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u16)"] @@ -69303,8 +69303,8 @@ pub fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t { - let d: uint8x8_t = vsubhn_u16(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } + let d = vsubhn_u16(b, c); + vcombine_u8(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u32)"] @@ -69329,8 +69329,8 @@ pub fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t { - let d: uint16x4_t = vsubhn_u32(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3, 4, 5, 6, 7]) } + let d = vsubhn_u32(b, c); + vcombine_u16(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_high_u64)"] @@ -69355,8 +69355,8 @@ pub fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_ unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t { - let d: uint32x2_t = vsubhn_u64(b, c); - unsafe { simd_shuffle!(a, d, [0, 1, 2, 3]) } + let d = vsubhn_u64(b, c); + vcombine_u32(a, d) } #[doc = "Subtract returning high narrow"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubhn_s16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index f6705033f9..912eff7ce4 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -8717,18 +8717,14 @@ intrinsics: - *neon-stable safety: safe types: - - [int8x8_t, int16x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int16x8_t, int8x16_t] + - [int16x4_t, int32x4_t, int32x4_t, int16x8_t] + - [int32x2_t, int64x2_t, int64x2_t, int32x4_t] + - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t] + - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t] + - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t] compose: - - Let: - - x - - "{neon_type[0]}" - - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]] - - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vrsubhn_{neon_type[1]}", [b, c]]}]] - name: "vrsubhn_high_{neon_type[1]}" doc: "Rounding subtract returning high narrow" @@ -8740,18 +8736,14 @@ intrinsics: - *neon-stable safety: safe types: - - [int8x8_t, int16x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int16x8_t, int8x16_t] + - [int16x4_t, int32x4_t, int32x4_t, int16x8_t] + - [int32x2_t, int64x2_t, int64x2_t, int32x4_t] + - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t] + - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t] + - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t] compose: - - Let: - - x - - "{neon_type[0]}" - - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]] - - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, {FnCall: ["vrsubhn_{neon_type[1]}", [b, c]]}]] - name: "vcopy{neon_type[0].lane_nox}" doc: "Insert vector element from another vector element" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index e646d22369..f22f503c56 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -6805,18 +6805,17 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - [int8x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]'] - - [uint8x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]'] + - [int8x8_t, int16x8_t, int8x16_t] + - [int16x4_t, int32x4_t, int16x8_t] + - [int32x2_t, int64x2_t, int32x4_t] + - [uint8x8_t, uint16x8_t, uint8x16_t] + - [uint16x4_t, uint32x4_t, uint16x8_t] + - [uint32x2_t, uint64x2_t, uint32x4_t] compose: - Let: - d - - "{neon_type[0]}" - FnCall: ["vsubhn{neon_type[1].noq}", [b, c]] - - FnCall: [simd_shuffle!, [a, d, "{type[3]}"]] + - FnCall: ['vcombine_{neon_type[0]}', [a, d]] - name: "vhsub{neon_type[1].no}" doc: "Signed halving subtract" @@ -13225,9 +13224,9 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - [uint8x8_t , uint16x8_t, uint8x16_t, 'vraddhn.i16', int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [uint16x4_t, uint32x4_t, uint16x8_t, 'vraddhn.i32', int32x4_t, '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [uint32x2_t, uint64x2_t, uint32x4_t, 'vraddhn.i64', int64x2_t, '[0, 1, 2, 3]'] + - [uint8x8_t , uint16x8_t, uint8x16_t, 'vraddhn.i16', int16x8_t] + - [uint16x4_t, uint32x4_t, uint16x8_t, 'vraddhn.i32', int32x4_t] + - [uint32x2_t, uint64x2_t, uint32x4_t, 'vraddhn.i64', int64x2_t] compose: - Let: - x @@ -13238,7 +13237,7 @@ intrinsics: - "vraddhn{neon_type[4].noq}" - - FnCall: [transmute, [b]] - FnCall: [transmute, [c]] - - FnCall: ["simd_shuffle!", [a, x, '{type[5]}']] + - FnCall: ['vcombine_{neon_type[0]}', [a, x]] - name: "vraddhn_high{neon_type[1].noq}" doc: "Rounding Add returning High Narrow (high half)." @@ -13252,9 +13251,9 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - [int8x8_t , int16x8_t, int8x16_t, 'vraddhn.i16', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - [int16x4_t, int32x4_t, int16x8_t, 'vraddhn.i32', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - [int32x2_t, int64x2_t, int32x4_t, 'vraddhn.i64', '[0, 1, 2, 3]'] + - [int8x8_t , int16x8_t, int8x16_t, 'vraddhn.i16'] + - [int16x4_t, int32x4_t, int16x8_t, 'vraddhn.i32'] + - [int32x2_t, int64x2_t, int32x4_t, 'vraddhn.i64'] compose: - Let: - x @@ -13262,7 +13261,7 @@ intrinsics: - "vraddhn{neon_type[1].noq}" - - b - c - - FnCall: ["simd_shuffle!", [a, x, '{type[4]}']] + - FnCall: ['vcombine_{neon_type[0]}', [a, x]] - name: "vpadd{neon_type.no}" doc: "Add pairwise." @@ -14591,12 +14590,12 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - ['vaddhn_high_s16', 'int8x8_t', 'int16x8_t', 'int8x16_t', 'int16x8_t::splat(8)', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddhn_high_s32', 'int16x4_t', 'int32x4_t', 'int16x8_t', 'int32x4_t::splat(16)', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - ['vaddhn_high_s64', 'int32x2_t', 'int64x2_t', 'int32x4_t', 'int64x2_t::splat(32)', '[0, 1, 2, 3]'] - - ['vaddhn_high_u16', 'uint8x8_t', 'uint16x8_t', 'uint8x16_t', 'uint16x8_t::splat(8)', '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]'] - - ['vaddhn_high_u32', 'uint16x4_t', 'uint32x4_t', 'uint16x8_t', 'uint32x4_t::splat(16)', '[0, 1, 2, 3, 4, 5, 6, 7]'] - - ['vaddhn_high_u64', 'uint32x2_t', 'uint64x2_t', 'uint32x4_t', 'uint64x2_t::splat(32)', '[0, 1, 2, 3]'] + - ['vaddhn_high_s16', 'int8x8_t', 'int16x8_t', 'int8x16_t', 'int16x8_t::splat(8)'] + - ['vaddhn_high_s32', 'int16x4_t', 'int32x4_t', 'int16x8_t', 'int32x4_t::splat(16)'] + - ['vaddhn_high_s64', 'int32x2_t', 'int64x2_t', 'int32x4_t', 'int64x2_t::splat(32)'] + - ['vaddhn_high_u16', 'uint8x8_t', 'uint16x8_t', 'uint8x16_t', 'uint16x8_t::splat(8)'] + - ['vaddhn_high_u32', 'uint16x4_t', 'uint32x4_t', 'uint16x8_t', 'uint32x4_t::splat(16)'] + - ['vaddhn_high_u64', 'uint32x2_t', 'uint64x2_t', 'uint32x4_t', 'uint64x2_t::splat(32)'] compose: - Let: - x @@ -14609,7 +14608,7 @@ intrinsics: - - a - b - '{type[4]}' - - FnCall: ['simd_shuffle!', [r, x, '{type[5]}']] + - FnCall: ['vcombine_{neon_type[1]}', [r, x]] - name: "{type[0]}" doc: "Vector narrow integer." From 89168fa1f4d81d48c7431185d97caadaf616b828 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 06:10:10 +0530 Subject: [PATCH 16/19] Change implementation of `vmul_lane` --- .../core_arch/src/aarch64/neon/generated.rs | 8 +- .../src/arm_shared/neon/generated.rs | 189 ++---------------- .../spec/neon/aarch64.spec.yml | 10 +- .../spec/neon/arm_shared.spec.yml | 52 ++--- 4 files changed, 57 insertions(+), 202 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index d09171a753..63ffa6765c 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -12591,7 +12591,7 @@ pub fn vmul_lane_f64(a: float64x1_t, b: float64x1_t) -> float64 #[cfg(not(target_arch = "arm64ec"))] pub fn vmul_laneq_f16(a: float16x4_t, b: float16x8_t) -> float16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 4])) } + unsafe { simd_mul(a, vdup_laneq_f16::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f16)"] @@ -12603,7 +12603,7 @@ pub fn vmul_laneq_f16(a: float16x4_t, b: float16x8_t) -> float1 #[cfg(not(target_arch = "arm64ec"))] pub fn vmulq_laneq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 8])) } + unsafe { simd_mul(a, vdupq_laneq_f16::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f64)"] @@ -12917,7 +12917,7 @@ pub fn vmull_p64(a: p64, b: p64) -> p128 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t { static_assert!(LANE == 0); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + unsafe { simd_mul(a, vdupq_lane_f64::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f64)"] @@ -12928,7 +12928,7 @@ pub fn vmulq_lane_f64(a: float64x2_t, b: float64x1_t) -> float6 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vmulq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 2])) } + unsafe { simd_mul(a, vdupq_laneq_f64::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuls_lane_f32)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 6c0dd2af94..2e3a24ee95 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -29763,12 +29763,7 @@ pub fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[cfg(not(target_arch = "arm64ec"))] pub fn vmul_lane_f16(a: float16x4_t, v: float16x4_t) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(v, v, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdup_lane_f16::(v)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_f16)"] @@ -29792,25 +29787,7 @@ pub fn vmul_lane_f16(a: float16x4_t, v: float16x4_t) -> float16 #[cfg(not(target_arch = "arm64ec"))] pub fn vmulq_lane_f16(a: float16x8_t, v: float16x4_t) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!( - v, - v, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + unsafe { simd_mul(a, vdupq_lane_f16::(v)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_f32)"] @@ -29833,7 +29810,7 @@ pub fn vmulq_lane_f16(a: float16x8_t, v: float16x4_t) -> float1 )] pub fn vmul_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_lane_f32::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f32)"] @@ -29856,7 +29833,7 @@ pub fn vmul_lane_f32(a: float32x2_t, b: float32x2_t) -> float32 )] pub fn vmul_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_laneq_f32::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_f32)"] @@ -29879,12 +29856,7 @@ pub fn vmul_laneq_f32(a: float32x2_t, b: float32x4_t) -> float3 )] pub fn vmulq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_lane_f32::(b)) } } #[doc = "Floating-point multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f32)"] @@ -29907,12 +29879,7 @@ pub fn vmulq_lane_f32(a: float32x4_t, b: float32x2_t) -> float3 )] pub fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_laneq_f32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_s16)"] @@ -29935,12 +29902,7 @@ pub fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float )] pub fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdup_lane_s16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_s16)"] @@ -29963,25 +29925,7 @@ pub fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { )] pub fn vmulq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + unsafe { simd_mul(a, vdupq_lane_s16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_s32)"] @@ -30004,7 +29948,7 @@ pub fn vmulq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t )] pub fn vmul_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_lane_s32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_s32)"] @@ -30027,12 +29971,7 @@ pub fn vmul_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { )] pub fn vmulq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_lane_s32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_u16)"] @@ -30055,12 +29994,7 @@ pub fn vmulq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t )] pub fn vmul_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdup_lane_u16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_u16)"] @@ -30083,25 +30017,7 @@ pub fn vmul_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_ )] pub fn vmulq_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + unsafe { simd_mul(a, vdupq_lane_u16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_lane_u32)"] @@ -30124,7 +30040,7 @@ pub fn vmulq_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint16x8 )] pub fn vmul_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_lane_u32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_lane_u32)"] @@ -30147,12 +30063,7 @@ pub fn vmul_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_ )] pub fn vmulq_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_lane_u32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_s16)"] @@ -30175,12 +30086,7 @@ pub fn vmulq_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint32x4 )] pub fn vmul_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdup_laneq_s16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_s16)"] @@ -30203,25 +30109,7 @@ pub fn vmul_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t )] pub fn vmulq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - simd_mul( - a, - simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + unsafe { simd_mul(a, vdupq_laneq_s16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_s32)"] @@ -30244,7 +30132,7 @@ pub fn vmulq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t )] pub fn vmul_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_laneq_s32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_s32)"] @@ -30267,12 +30155,7 @@ pub fn vmul_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t )] pub fn vmulq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_laneq_s32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_u16)"] @@ -30295,12 +30178,7 @@ pub fn vmulq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t )] pub fn vmul_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdup_laneq_u16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_u16)"] @@ -30323,25 +30201,7 @@ pub fn vmul_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4 )] pub fn vmulq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 3); - unsafe { - simd_mul( - a, - simd_shuffle!( - b, - b, - [ - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32, - LANE as u32 - ] - ), - ) - } + unsafe { simd_mul(a, vdupq_laneq_u16::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_u32)"] @@ -30364,7 +30224,7 @@ pub fn vmulq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x )] pub fn vmul_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) } + unsafe { simd_mul(a, vdup_laneq_u32::(b)) } } #[doc = "Multiply"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_u32)"] @@ -30387,12 +30247,7 @@ pub fn vmul_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2 )] pub fn vmulq_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - simd_mul( - a, - simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]), - ) - } + unsafe { simd_mul(a, vdupq_laneq_u32::(b)) } } #[doc = "Vector multiply by scalar"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_n_f16)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 912eff7ce4..7664bf3c10 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -11006,7 +11006,7 @@ intrinsics: - FnCall: - simd_mul - - a - - FnCall: ["simd_shuffle!", [b, b, '[LANE as u32; 2]']] + - FnCall: [vdupq_lane_f64, [b], [LANE]] - name: "vmuld_lane_f64" doc: "Floating-point multiply" @@ -11063,7 +11063,7 @@ intrinsics: - FnCall: - simd_mul - - a - - FnCall: [simd_shuffle!, [b, b, '[LANE as u32; 2]']] + - FnCall: [vdupq_laneq_f64, [b], [LANE]] # vmulq_laneq_f16 @@ -11080,14 +11080,14 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [float16x4_t, float16x8_t, '_lane', "[LANE as u32; 4]"] - - [float16x8_t, float16x8_t, 'q_lane', "[LANE as u32; 8]"] + - [float16x4_t, float16x8_t, '_lane'] + - [float16x8_t, float16x8_t, 'q_lane'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '3']] - FnCall: - simd_mul - - a - - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]] + - FnCall: ['vdup{neon_type[0].laneq_nox}', [b], [LANE]] - name: "vmul{type[1]}_{type[0]}" diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index f22f503c56..d20c94a2f8 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -6300,20 +6300,20 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint16x4_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint32x2_t, uint32x2_t, '1', '[LANE as u32, LANE as u32]'] - - [uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x4_t, int16x4_t, '2'] + - [int16x8_t, int16x4_t, '2'] + - [int32x2_t, int32x2_t, '1'] + - [int32x4_t, int32x2_t, '1'] + - [uint16x4_t, uint16x4_t, '2'] + - [uint16x8_t, uint16x4_t, '2'] + - [uint32x2_t, uint32x2_t, '1'] + - [uint32x4_t, uint32x2_t, '1'] compose: - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] - FnCall: - simd_mul - - a - - FnCall: ["simd_shuffle!", [b, b, "{type[3]}"]] + - FnCall: ["vdup{neon_type[0].lane_nox}", [b], [LANE]] - name: "vmul{neon_type[0].lane_nox}" @@ -6332,14 +6332,14 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [float16x8_t, float16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [float16x4_t, float16x4_t, '2'] + - [float16x8_t, float16x4_t, '2'] compose: - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] - FnCall: - simd_mul - - a - - FnCall: ["simd_shuffle!", [v, v, "{type[3]}"]] + - FnCall: ["vdup{neon_type[0].lane_nox}", [v], [LANE]] - name: "vmul{neon_type[0].laneq_nox}" @@ -6356,20 +6356,20 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint16x4_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [uint32x2_t, uint32x4_t, '2', '[LANE as u32, LANE as u32]'] - - [uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [int16x4_t, int16x8_t, '3'] + - [int16x8_t, int16x8_t, '3'] + - [int32x2_t, int32x4_t, '2'] + - [int32x4_t, int32x4_t, '2'] + - [uint16x4_t, uint16x8_t, '3'] + - [uint16x8_t, uint16x8_t, '3'] + - [uint32x2_t, uint32x4_t, '2'] + - [uint32x4_t, uint32x4_t, '2'] compose: - FnCall: ["static_assert_uimm_bits!", [LANE, "{type[2]}"]] - FnCall: - simd_mul - - a - - FnCall: ["simd_shuffle!", [b, b, "{type[3]}"]] + - FnCall: ["vdup{neon_type[0].laneq_nox}", [b], [LANE]] - name: "vmull{neon_type[1].no}" doc: Signed multiply long @@ -11204,16 +11204,16 @@ intrinsics: static_defs: ['const LANE: i32'] safety: safe types: - - [float32x2_t, float32x2_t, '_lane_f32', '1', '[LANE as u32, LANE as u32]'] - - [float32x2_t, float32x4_t, '_laneq_f32', '2', '[LANE as u32, LANE as u32]'] - - [float32x4_t, float32x2_t, 'q_lane_f32', '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] - - [float32x4_t, float32x4_t, 'q_laneq_f32', '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]'] + - [float32x2_t, float32x2_t, '_lane_f32', '1'] + - [float32x2_t, float32x4_t, '_laneq_f32', '2'] + - [float32x4_t, float32x2_t, 'q_lane_f32', '1'] + - [float32x4_t, float32x4_t, 'q_laneq_f32', '2'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: - simd_mul - - a - - FnCall: [simd_shuffle!, [b, b, "{type[4]}"]] + - FnCall: ['vdup{type[2]}', [b], [LANE]] - name: "vqrdmulh{type[0]}" doc: "Vector rounding saturating doubling multiply high by scalar" From 5f676a53f584e6304985345791525fa06bb9130e Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 06:37:24 +0530 Subject: [PATCH 17/19] Change implementation of `vld1_dup` --- crates/core_arch/src/arm_shared/neon/generated.rs | 8 ++++---- crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 2e3a24ee95..1d8484e47f 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -14634,8 +14634,8 @@ pub fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld1_dup_f16(ptr: *const f16) -> float16x4_t { - let x: float16x4_t = vld1_lane_f16::<0>(ptr, transmute(f16x4::splat(0.0))); - simd_shuffle!(x, x, [0, 0, 0, 0]) + let x = vld1_lane_f16::<0>(ptr, transmute(f16x4::splat(0.0))); + vdup_lane_f16::<0>(x) } #[doc = "Load one single-element structure and replicate to all lanes of one register"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_f16)"] @@ -14653,8 +14653,8 @@ pub unsafe fn vld1_dup_f16(ptr: *const f16) -> float16x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld1q_dup_f16(ptr: *const f16) -> float16x8_t { - let x: float16x8_t = vld1q_lane_f16::<0>(ptr, transmute(f16x8::splat(0.0))); - simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) + let x = vld1q_lane_f16::<0>(ptr, transmute(f16x8::splat(0.0))); + vdupq_laneq_f16::<0>(x) } #[doc = "Load one single-element structure and Replicate to all lanes (of one register)."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_f32)"] diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index d20c94a2f8..61979ecf0f 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -2835,7 +2835,7 @@ intrinsics: - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']] - FnCall: [simd_insert!, [src, "LANE as u32", "*ptr"]] - - name: "vld1{type[2]}_{neon_type[1]}" + - name: "vld1{neon_type[1].dup_nox}" doc: "Load one single-element structure and replicate to all lanes of one register" arguments: ["ptr: {type[0]}"] return_type: "{neon_type[1]}" @@ -2849,11 +2849,11 @@ intrinsics: safety: unsafe: [neon] types: - - ["*const f16", float16x4_t, '_dup', 'f16x4', "[0, 0, 0, 0]"] - - ["*const f16", float16x8_t, 'q_dup', 'f16x8', "[0, 0, 0, 0, 0, 0, 0, 0]"] + - ["*const f16", float16x4_t, '_lane', 'f16x4'] + - ["*const f16", float16x8_t, 'q_laneq', 'f16x8'] compose: - - Let: [x, "{neon_type[1]}", "vld1{neon_type[1].lane_nox}::<0>(ptr, transmute({type[3]}::splat(0.0)))"] - - FnCall: [simd_shuffle!, [x, x, "{type[4]}"]] + - Let: [x, {FnCall: ["vld1{neon_type[1].lane_nox}", [ptr, {FnCall: [transmute, ["{type[3]}::splat(0.0)"]]}], [0]]}] + - FnCall: ['vdup{type[2]}_{neon_type[1]}', [x], [0]] - name: "vld2{neon_type[1].nox}" From 962b1a603d5b3e96c233bd2429deba347a0dd783 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 07:01:28 +0530 Subject: [PATCH 18/19] Change implementation of `v{us,su,}dot_lane` --- .../src/arm_shared/neon/generated.rs | 136 +++++++----------- .../spec/neon/arm_shared.spec.yml | 96 ++++++------- 2 files changed, 89 insertions(+), 143 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 1d8484e47f..476d8e78a2 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -9663,11 +9663,9 @@ pub fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { )] pub fn vdot_lane_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - let c: int32x2_t = vreinterpret_s32_s8(c); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vdot_s32(a, b, vreinterpret_s8_s32(c)) - } + let c = vreinterpret_s32_s8(c); + let c = vdup_lane_s32::(c); + vdot_s32(a, b, vreinterpret_s8_s32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)"] @@ -9690,12 +9688,9 @@ pub fn vdot_lane_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> )] pub fn vdotq_lane_s32(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - let c: int32x2_t = vreinterpret_s32_s8(c); - unsafe { - let c: int32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vdotq_s32(a, b, vreinterpretq_s8_s32(c)) - } + let c = vreinterpret_s32_s8(c); + let c = vdupq_lane_s32::(c); + vdotq_s32(a, b, vreinterpretq_s8_s32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)"] @@ -9718,11 +9713,9 @@ pub fn vdotq_lane_s32(a: int32x4_t, b: int8x16_t, c: int8x8_t) )] pub fn vdot_lane_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - let c: uint32x2_t = vreinterpret_u32_u8(c); - unsafe { - let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vdot_u32(a, b, vreinterpret_u8_u32(c)) - } + let c = vreinterpret_u32_u8(c); + let c = vdup_lane_u32::(c); + vdot_u32(a, b, vreinterpret_u8_u32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)"] @@ -9745,12 +9738,9 @@ pub fn vdot_lane_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) )] pub fn vdotq_lane_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 1); - let c: uint32x2_t = vreinterpret_u32_u8(c); - unsafe { - let c: uint32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vdotq_u32(a, b, vreinterpretq_u8_u32(c)) - } + let c = vreinterpret_u32_u8(c); + let c = vdupq_lane_u32::(c); + vdotq_u32(a, b, vreinterpretq_u8_u32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32)"] @@ -9766,11 +9756,9 @@ pub fn vdotq_lane_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x8_ #[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] pub fn vdot_laneq_s32(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: int32x4_t = vreinterpretq_s32_s8(c); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vdot_s32(a, b, vreinterpret_s8_s32(c)) - } + let c = vreinterpretq_s32_s8(c); + let c = vdup_laneq_s32::(c); + vdot_s32(a, b, vreinterpret_s8_s32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32)"] @@ -9786,12 +9774,9 @@ pub fn vdot_laneq_s32(a: int32x2_t, b: int8x8_t, c: int8x16_t) #[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] pub fn vdotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: int32x4_t = vreinterpretq_s32_s8(c); - unsafe { - let c: int32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vdotq_s32(a, b, vreinterpretq_s8_s32(c)) - } + let c = vreinterpretq_s32_s8(c); + let c = vdupq_laneq_s32::(c); + vdotq_s32(a, b, vreinterpretq_s8_s32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32)"] @@ -9807,11 +9792,9 @@ pub fn vdotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t #[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] pub fn vdot_laneq_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: uint32x4_t = vreinterpretq_u32_u8(c); - unsafe { - let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vdot_u32(a, b, transmute(c)) - } + let c = vreinterpretq_u32_u8(c); + let c = vdup_laneq_u32::(c); + vdot_u32(a, b, vreinterpret_u8_u32(c)) } #[doc = "Dot product arithmetic (indexed)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32)"] @@ -9827,12 +9810,9 @@ pub fn vdot_laneq_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x16_ #[unstable(feature = "stdarch_neon_dotprod", issue = "117224")] pub fn vdotq_laneq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: uint32x4_t = vreinterpretq_u32_u8(c); - unsafe { - let c: uint32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vdotq_u32(a, b, transmute(c)) - } + let c = vreinterpretq_u32_u8(c); + let c = vdupq_laneq_u32::(c); + vdotq_u32(a, b, vreinterpretq_u8_u32(c)) } #[doc = "Dot product arithmetic (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)"] @@ -69646,11 +69626,9 @@ pub fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { )] pub fn vsudot_lane_s32(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - let c: uint32x2_t = vreinterpret_u32_u8(c); - unsafe { - let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vusdot_s32(a, vreinterpret_u8_u32(c), b) - } + let c = vreinterpret_u32_u8(c); + let c = vdup_lane_u32::(c); + vusdot_s32(a, vreinterpret_u8_u32(c), b) } #[doc = "Dot product index form with signed and unsigned integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32)"] @@ -69677,12 +69655,9 @@ pub fn vsudot_lane_s32(a: int32x2_t, b: int8x8_t, c: uint8x8_t) )] pub fn vsudotq_lane_s32(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - let c: uint32x2_t = vreinterpret_u32_u8(c); - unsafe { - let c: uint32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vusdotq_s32(a, vreinterpretq_u8_u32(c), b) - } + let c = vreinterpret_u32_u8(c); + let c = vdupq_lane_u32::(c); + vusdotq_s32(a, vreinterpretq_u8_u32(c), b) } #[doc = "Dot product index form with signed and unsigned integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32)"] @@ -69702,11 +69677,9 @@ pub fn vsudotq_lane_s32(a: int32x4_t, b: int8x16_t, c: uint8x8_ #[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] pub fn vsudot_laneq_s32(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: uint32x4_t = transmute(c); - let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vusdot_s32(a, transmute(c), b) - } + let c = vreinterpretq_u32_u8(c); + let c = vdup_laneq_u32::(c); + vusdot_s32(a, vreinterpret_u8_u32(c), b) } #[doc = "Dot product index form with signed and unsigned integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32)"] @@ -69726,12 +69699,9 @@ pub fn vsudot_laneq_s32(a: int32x2_t, b: int8x8_t, c: uint8x16_ #[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] pub fn vsudotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: uint32x4_t = transmute(c); - let c: uint32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vusdotq_s32(a, transmute(c), b) - } + let c = vreinterpretq_u32_u8(c); + let c = vdupq_laneq_u32::(c); + vusdotq_s32(a, vreinterpretq_u8_u32(c), b) } #[doc = "Table look-up"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1)"] @@ -71633,11 +71603,9 @@ pub fn vtstq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { )] pub fn vusdot_lane_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - let c: int32x2_t = vreinterpret_s32_s8(c); - unsafe { - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vusdot_s32(a, b, vreinterpret_s8_s32(c)) - } + let c = vreinterpret_s32_s8(c); + let c = vdup_lane_s32::(c); + vusdot_s32(a, b, vreinterpret_s8_s32(c)) } #[doc = "Dot product index form with unsigned and signed integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32)"] @@ -71660,12 +71628,9 @@ pub fn vusdot_lane_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) )] pub fn vusdotq_lane_s32(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - let c: int32x2_t = vreinterpret_s32_s8(c); - unsafe { - let c: int32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vusdotq_s32(a, b, vreinterpretq_s8_s32(c)) - } + let c = vreinterpret_s32_s8(c); + let c = vdupq_lane_s32::(c); + vusdotq_s32(a, b, vreinterpretq_s8_s32(c)) } #[doc = "Dot product index form with unsigned and signed integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32)"] @@ -71681,11 +71646,9 @@ pub fn vusdotq_lane_s32(a: int32x4_t, b: uint8x16_t, c: int8x8_ #[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] pub fn vusdot_laneq_s32(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x4_t = transmute(c); - let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); - vusdot_s32(a, b, vreinterpret_s8_s32(c)) - } + let c = vreinterpretq_s32_s8(c); + let c = vdup_laneq_s32::(c); + vusdot_s32(a, b, vreinterpret_s8_s32(c)) } #[doc = "Dot product index form with unsigned and signed integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32)"] @@ -71701,12 +71664,9 @@ pub fn vusdot_laneq_s32(a: int32x2_t, b: uint8x8_t, c: int8x16_ #[unstable(feature = "stdarch_neon_i8mm", issue = "117223")] pub fn vusdotq_laneq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: int32x4_t = transmute(c); - let c: int32x4_t = - simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); - vusdotq_s32(a, b, vreinterpretq_s8_s32(c)) - } + let c = vreinterpretq_s32_s8(c); + let c = vdupq_laneq_s32::(c); + vusdotq_s32(a, b, vreinterpretq_s8_s32(c)) } #[doc = "Dot product vector form with unsigned and signed integers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32)"] diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 61979ecf0f..1f7e1f6987 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -6159,10 +6159,10 @@ intrinsics: - link: "llvm.arm.neon.usdot.v{neon_type[0].lane}i32.v{neon_type[1].lane}i8" arch: arm - - name: "vusdot{type[0]}" + - name: "vusdot{neon_type[0].lane_nox}" doc: "Dot product index form with unsigned and signed integers" - arguments: ["a: {neon_type[1]}", "b: {neon_type[2]}", "c: int8x8_t"] - return_type: "{neon_type[1]}" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: int8x8_t"] + return_type: "{neon_type[0]}" attr: - *neon-i8mm - *neon-v8 @@ -6174,19 +6174,17 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - ['_lane_s32', int32x2_t, uint8x8_t, '[LANE as u32, LANE as u32]',''] - - ['q_lane_s32', int32x4_t, uint8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [int32x2_t, uint8x8_t, ''] + - [int32x4_t, uint8x16_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '1']] - Let: - c - - int32x2_t - FnCall: ['vreinterpret_s32_s8', [c]] - Let: - c - - "{type[1]}" - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] - - FnCall: ["vusdot{neon_type[1].no}", [a, b, {FnCall: ['vreinterpret{type[4]}_s8_s32', [c]]}]] + - FnCall: ['vdup{neon_type[0].lane_nox}', [c], [LANE]] + - FnCall: ["vusdot{neon_type[0].no}", [a, b, {FnCall: ['vreinterpret{type[2]}_s8_s32', [c]]}]] - name: "vsudot{neon_type[0].lane_nox}" doc: "Dot product index form with signed and unsigned integers" @@ -6203,22 +6201,20 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [int32x2_t, int8x8_t, uint8x8_t, '[LANE as u32, LANE as u32]', uint32x2_t,''] - - [int32x4_t, int8x16_t, uint8x8_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]', uint32x4_t,'q'] + - [int32x2_t, int8x8_t, uint8x8_t, uint32x2_t, ''] + - [int32x4_t, int8x16_t, uint8x8_t, uint32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '1']] - Let: - c - - uint32x2_t - FnCall: ['vreinterpret_u32_u8', [c]] - Let: - c - - "{type[4]}" - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ['vdup{neon_type[3].lane_nox}', [c], [LANE]] - FnCall: - "vusdot{neon_type[0].no}" - - a - - FnCall: ['vreinterpret{type[5]}_u8_u32', [c]] + - FnCall: ['vreinterpret{type[4]}_u8_u32', [c]] - b - name: "vmul{neon_type[1].no}" @@ -6979,13 +6975,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [int32x2_t, uint8x8_t, int8x16_t, '[LANE as u32, LANE as u32]',''] - - [int32x4_t, uint8x16_t, int8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [int32x2_t, uint8x8_t, int8x16_t, ''] + - [int32x4_t, uint8x16_t, int8x16_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '2']] - - Let: [c, int32x4_t, {FnCall: [transmute, [c]]}] - - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[3]}"]]}] - - FnCall: ["vusdot{neon_type[0].no}", [a, b, {FnCall: ['vreinterpret{type[4]}_s8_s32', [c]]}]] + - Let: [c, {FnCall: [vreinterpretq_s32_s8, [c]]}] + - Let: [c, {FnCall: ['vdup{neon_type[0].laneq_nox}', [c], [LANE]]}] + - FnCall: ["vusdot{neon_type[0].no}", [a, b, {FnCall: ['vreinterpret{type[3]}_s8_s32', [c]]}]] - name: "vsudot{neon_type[0].laneq_nox}" doc: "Dot product index form with signed and unsigned integers" @@ -7001,22 +6997,20 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [int32x2_t, int8x8_t, uint8x16_t, '[LANE as u32, LANE as u32]', uint32x2_t] - - [int32x4_t, int8x16_t, uint8x16_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]', uint32x4_t] + - [int32x2_t, int8x8_t, uint8x16_t, uint32x2_t, ''] + - [int32x4_t, int8x16_t, uint8x16_t, uint32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 2]] - Let: - c - - uint32x4_t - - FnCall: [transmute, [c]] + - FnCall: [vreinterpretq_u32_u8, [c]] - Let: - c - - "{type[4]}" - - FnCall: [simd_shuffle!, [c, c, "{type[3]}"]] + - FnCall: ['vdup{neon_type[3].laneq_nox}', [c], [LANE]] - FnCall: - "vusdot{neon_type[0].no}" - - a - - FnCall: [transmute, [c]] + - FnCall: ['vreinterpret{type[4]}_u8_u32', [c]] - b - name: "vdot{neon_type[0].laneq_nox}" @@ -7033,23 +7027,21 @@ intrinsics: - FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']] safety: safe types: - - [int32x2_t, int8x8_t, int8x16_t, int32x4_t, '[LANE as u32, LANE as u32]', ''] - - [int32x4_t, int8x16_t, int8x16_t, int32x4_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [int32x2_t, int8x8_t, int8x16_t, ''] + - [int32x4_t, int8x16_t, int8x16_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '2']] - Let: - c - - "{neon_type[3]}" - - FnCall: ['vreinterpretq_{neon_type[0]}_{neon_type[1]}', [c]] + - FnCall: [vreinterpretq_s32_s8, [c]] - Let: - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: ['vdup{neon_type[0].laneq_nox}', [c], [LANE]] - FnCall: - "vdot{neon_type[0].no}" - - a - b - - FnCall: ['vreinterpret{type[5]}_{neon_type[1]}_{neon_type[0]}', [c]] + - FnCall: ['vreinterpret{type[3]}_s8_s32', [c]] - name: "vdot{neon_type[0].laneq_nox}" doc: Dot product arithmetic (indexed) @@ -7065,23 +7057,21 @@ intrinsics: - FnCall: [unstable, ['feature = "stdarch_neon_dotprod"', 'issue = "117224"']] safety: safe types: - - [uint32x2_t, uint8x8_t, uint8x16_t, uint32x4_t, '[LANE as u32, LANE as u32]',''] - - [uint32x4_t, uint8x16_t, uint8x16_t, uint32x4_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [uint32x2_t, uint8x8_t, uint8x16_t, ''] + - [uint32x4_t, uint8x16_t, uint8x16_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '2']] - Let: - c - - "{neon_type[3]}" - - FnCall: ['vreinterpretq_{neon_type[0]}_{neon_type[1]}', [c]] + - FnCall: ['vreinterpretq_u32_u8', [c]] - Let: - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: ['vdup{neon_type[0].laneq_nox}', [c], [LANE]] - FnCall: - "vdot{neon_type[0].no}" - - a - b - - FnCall: [transmute, [c]] + - FnCall: ['vreinterpret{type[3]}_u8_u32', [c]] - name: "vdot{neon_type[0].no}" doc: Dot product arithmetic (vector) @@ -7146,23 +7136,21 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - [int32x2_t, int8x8_t, int8x8_t, int32x2_t, '[LANE as u32, LANE as u32]',''] - - [int32x4_t, int8x16_t, int8x8_t, int32x2_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [int32x2_t, int8x8_t, int8x8_t, ''] + - [int32x4_t, int8x16_t, int8x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '1']] - Let: - c - - "{neon_type[3]}" - - FnCall: ['vreinterpret_{neon_type[0]}_{neon_type[1]}', [c]] + - FnCall: ['vreinterpret_s32_s8', [c]] - Let: - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: ['vdup{neon_type[0].lane_nox}', [c], [LANE]] - FnCall: - "vdot{neon_type[0].no}" - - a - b - - FnCall: ['vreinterpret{type[5]}_{neon_type[1]}_{neon_type[0]}', [c]] + - FnCall: ['vreinterpret{type[3]}_s8_s32', [c]] - name: "vdot{neon_type[0].lane_nox}" doc: Dot product arithmetic (indexed) @@ -7179,23 +7167,21 @@ intrinsics: - *neon-cfg-arm-unstable safety: safe types: - - [uint32x2_t, uint8x8_t, uint8x8_t, uint32x2_t, '[LANE as u32, LANE as u32]',''] - - [uint32x4_t, uint8x16_t, uint8x8_t, uint32x2_t, '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]','q'] + - [uint32x2_t, uint8x8_t, uint8x8_t, ''] + - [uint32x4_t, uint8x16_t, uint8x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, '1']] - Let: - c - - "{neon_type[3]}" - - FnCall: ['vreinterpret_{neon_type[0]}_{neon_type[1]}', [c]] + - FnCall: ['vreinterpret_u32_u8', [c]] - Let: - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, '{type[4]}']] + - FnCall: ['vdup{neon_type[0].lane_nox}', [c], [LANE]] - FnCall: - "vdot{neon_type[0].no}" - - a - b - - FnCall: ['vreinterpret{type[5]}_{neon_type[1]}_{neon_type[0]}', [c]] + - FnCall: ['vreinterpret{type[3]}_u8_u32', [c]] - name: "vmax{neon_type.no}" doc: Maximum (vector) From 2ada72b455b2752fe39760256a21903d121e18d8 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 9 May 2026 09:54:33 +0530 Subject: [PATCH 19/19] Split `vcopyq_lane{q}_p64` due to `vsetq_lane_p64` requiring `aes` --- .../core_arch/src/aarch64/neon/generated.rs | 4 +- .../spec/neon/aarch64.spec.yml | 37 ++++++++++++++++++- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 63ffa6765c..fef672cab9 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4618,7 +4618,7 @@ pub fn vcopyq_lane_p64( static_assert_uimm_bits!(LANE1, 1); static_assert!(LANE2 == 0); let b: poly64x2_t = vcombine_p64(b, b); - vsetq_lane_p64::(vgetq_lane_p64::(b), a) + unsafe { simd_insert!(a, LANE1 as u32, simd_extract!(b, LANE2 as u32, p64)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcopyq_lane_s8)"] @@ -5001,7 +5001,7 @@ pub fn vcopyq_laneq_p64( ) -> poly64x2_t { static_assert_uimm_bits!(LANE1, 1); static_assert_uimm_bits!(LANE2, 1); - vsetq_lane_p64::(vgetq_lane_p64::(b), a) + unsafe { simd_insert!(a, LANE1 as u32, simd_extract!(b, LANE2 as u32, p64)) } } #[doc = "Insert vector element from another vector element"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcreate_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 7664bf3c10..401bb504b3 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -8856,7 +8856,6 @@ intrinsics: - [uint64x2_t, uint64x2_t, uint64x2_t, '1', '1'] - [poly8x16_t, poly8x16_t, poly8x16_t, '4', '4'] - [poly16x8_t, poly16x8_t, poly16x8_t, '3', '3'] - - [poly64x2_t, poly64x2_t, poly64x2_t, '1', '1'] - [float32x4_t, float32x4_t, float32x4_t, '2', '2'] - [float64x2_t, float64x2_t, float64x2_t, '1', '1'] compose: @@ -8864,6 +8863,23 @@ intrinsics: - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[1].lane_nox}', [b], [LANE2]]}, a], [LANE1]] + - name: "vcopy{neon_type[0].laneq_nox}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[2]}" + attr: + - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [mov, 'LANE1 = 0', 'LANE2 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - *neon-stable + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [poly64x2_t, poly64x2_t, poly64x2_t, '1', '1'] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '{type[3]}']] + - FnCall: [static_assert_uimm_bits!, [LANE2, '{type[4]}']] + - FnCall: [simd_insert!, [a, LANE1 as u32, {FnCall: [simd_extract!, [b, LANE2 as u32, p64]]}]] + - name: "vcopy{neon_type[0].laneq_nox}" doc: "Insert vector element from another vector element" arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] @@ -8902,7 +8918,6 @@ intrinsics: types: - [int64x2_t, int64x1_t] - [uint64x2_t, uint64x1_t] - - [poly64x2_t, poly64x1_t] - [float64x2_t, float64x1_t] compose: - FnCall: [static_assert_uimm_bits!, [LANE1, '1']] @@ -8910,6 +8925,24 @@ intrinsics: - Let: [b, '{neon_type[0]}', {FnCall: ['vcombine{neon_type[1].no}', [b, b]]}] - FnCall: ['vset{neon_type[0].lane_nox}', [{FnCall: ['vget{neon_type[0].lane_nox}', [b], [LANE2]]}, a], [LANE1]] + - name: "vcopyq_lane_{neon_type[0]}" + doc: "Insert vector element from another vector element" + arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"] + return_type: "{neon_type[0]}" + attr: + - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [mov, 'LANE1 = 1', 'LANE2 = 0']]}]] + - FnCall: [rustc_legacy_const_generics, ['1', '3']] + - *neon-stable + static_defs: ['const LANE1: i32, const LANE2: i32'] + safety: safe + types: + - [poly64x2_t, poly64x1_t] + compose: + - FnCall: [static_assert_uimm_bits!, [LANE1, '1']] + - FnCall: [static_assert!, ['LANE2 == 0']] + - Let: [b, '{neon_type[0]}', {FnCall: ['vcombine{neon_type[1].no}', [b, b]]}] + - FnCall: [simd_insert!, [a, LANE1 as u32, {FnCall: [simd_extract!, [b, LANE2 as u32, p64]]}]] + - name: "vcopyq_lane_f32" doc: "Insert vector element from another vector element" arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"]