Add vmla_n, vmla_lane, vmls_n, vmls_lane neon instructions (#1145)

Sparrow Li
2021-04-29 05:59:41 +08:00
committed by GitHub
parent 54a2d8b82a
commit 07f1d0cae3
6 changed files with 2528 additions and 6 deletions


@@ -2950,6 +2950,118 @@ pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
vmlal_u32(a, b, c)
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2))]
pub unsafe fn vmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
vmlal_high_s16(a, b, vdupq_n_s16(c))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2))]
pub unsafe fn vmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
vmlal_high_s32(a, b, vdupq_n_s32(c))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2))]
pub unsafe fn vmlal_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
vmlal_high_u16(a, b, vdupq_n_u16(c))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2))]
pub unsafe fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
vmlal_high_u32(a, b, vdupq_n_u32(c))
}
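For reference, the `_high_n` variants widen the upper half of `b`, multiply each widened lane by the scalar `c`, and accumulate into `a`. A minimal scalar sketch of that semantics (the model function and its array types are illustrative only, not part of the API):

fn vmlal_high_n_s16_model(a: [i32; 4], b: [i16; 8], c: i16) -> [i32; 4] {
    let mut r = a;
    for i in 0..4 {
        // The `_high` variants read lanes 4..8 of the eight-lane input.
        r[i] += b[4 + i] as i32 * c as i32;
    }
    r
}

fn main() {
    // Mirrors the test vectors further down in this diff.
    let r = vmlal_high_n_s16_model([8, 7, 6, 5], [3, 3, 0, 1, 0, 1, 2, 3], 2);
    assert_eq!(r, [8, 9, 10, 11]);
}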
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
static_assert_imm2!(LANE);
vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
static_assert_imm3!(LANE);
vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
static_assert_imm1!(LANE);
vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
static_assert_imm2!(LANE);
vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
static_assert_imm2!(LANE);
vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
static_assert_imm3!(LANE);
vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
static_assert_imm1!(LANE);
vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-add long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlal_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
static_assert_imm2!(LANE);
vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
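The `lane`/`laneq` forms differ from `_n` only in that the multiplier comes from lane `LANE` of a vector; the `simd_shuffle` calls above are splats of that one lane. A scalar sketch under the same illustrative naming as before:

fn vmlal_high_lane_s16_model<const LANE: usize>(a: [i32; 4], b: [i16; 8], c: [i16; 4]) -> [i32; 4] {
    let mut r = a;
    for i in 0..4 {
        // Splatting lane LANE of `c` and multiplying lane-wise is the same
        // as indexing that lane directly.
        r[i] += b[4 + i] as i32 * c[LANE] as i32;
    }
    r
}

fn main() {
    // LANE = 1 selects the value 2 from `c`, matching the tests below.
    let r = vmlal_high_lane_s16_model::<1>([8, 7, 6, 5], [3, 3, 0, 1, 0, 1, 2, 3], [0, 2, 0, 0]);
    assert_eq!(r, [8, 9, 10, 11]);
}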
/// Floating-point multiply-subtract from accumulator
#[inline]
#[target_feature(enable = "neon")]
@@ -3026,6 +3138,118 @@ pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
vmlsl_u32(a, b, c)
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2))]
pub unsafe fn vmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
vmlsl_high_s16(a, b, vdupq_n_s16(c))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2))]
pub unsafe fn vmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
vmlsl_high_s32(a, b, vdupq_n_s32(c))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2))]
pub unsafe fn vmlsl_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
vmlsl_high_u16(a, b, vdupq_n_u16(c))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2))]
pub unsafe fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
vmlsl_high_u32(a, b, vdupq_n_u32(c))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
static_assert_imm2!(LANE);
vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
static_assert_imm3!(LANE);
vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
static_assert_imm1!(LANE);
vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
static_assert_imm2!(LANE);
vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
static_assert_imm2!(LANE);
vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
static_assert_imm3!(LANE);
vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
static_assert_imm1!(LANE);
vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply-subtract long
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmlsl_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
static_assert_imm2!(LANE);
vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
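Throughout these intrinsics the `static_assert_imm*!` width tracks the lane count of the third argument: `imm1` permits 0..=1 for two-lane vectors, `imm2` permits 0..=3 for four lanes, and `imm3` permits 0..=7 for eight, so an out-of-range `LANE` fails to compile rather than misbehaving at run time. A standalone sketch of that mechanism (the `Assert` helper is ours, for illustration):

struct Assert<const LANE: i32>;

impl<const LANE: i32> Assert<LANE> {
    // Evaluated only when referenced, so the panic surfaces during
    // monomorphization, i.e. as a compile-time error.
    const LANE_IN_RANGE: () = assert!(LANE >= 0 && LANE < 4, "LANE out of range 0..=3");
}

fn lane_demo<const LANE: i32>() {
    let _ = Assert::<LANE>::LANE_IN_RANGE;
    // ...the intrinsic body would follow...
}

fn main() {
    lane_demo::<1>(); // compiles
    // lane_demo::<4>(); // rejected: the constant above panics during evaluation
}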
/// Extract narrow
#[inline]
#[target_feature(enable = "neon")]
@@ -9750,6 +9974,126 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_n_s16() {
let a: i32x4 = i32x4::new(8, 7, 6, 5);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16 = 2;
let e: i32x4 = i32x4::new(8, 9, 10, 11);
let r: i32x4 = transmute(vmlal_high_n_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_n_s32() {
let a: i64x2 = i64x2::new(8, 7);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32 = 2;
let e: i64x2 = i64x2::new(8, 9);
let r: i64x2 = transmute(vmlal_high_n_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_n_u16() {
let a: u32x4 = u32x4::new(8, 7, 6, 5);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16 = 2;
let e: u32x4 = u32x4::new(8, 9, 10, 11);
let r: u32x4 = transmute(vmlal_high_n_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_n_u32() {
let a: u64x2 = u64x2::new(8, 7);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32 = 2;
let e: u64x2 = u64x2::new(8, 9);
let r: u64x2 = transmute(vmlal_high_n_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_lane_s16() {
let a: i32x4 = i32x4::new(8, 7, 6, 5);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i32x4 = i32x4::new(8, 9, 10, 11);
let r: i32x4 = transmute(vmlal_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_laneq_s16() {
let a: i32x4 = i32x4::new(8, 7, 6, 5);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i32x4 = i32x4::new(8, 9, 10, 11);
let r: i32x4 = transmute(vmlal_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_lane_s32() {
let a: i64x2 = i64x2::new(8, 7);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32x2 = i32x2::new(0, 2);
let e: i64x2 = i64x2::new(8, 9);
let r: i64x2 = transmute(vmlal_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_laneq_s32() {
let a: i64x2 = i64x2::new(8, 7);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i64x2 = i64x2::new(8, 9);
let r: i64x2 = transmute(vmlal_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_lane_u16() {
let a: u32x4 = u32x4::new(8, 7, 6, 5);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u32x4 = u32x4::new(8, 9, 10, 11);
let r: u32x4 = transmute(vmlal_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_laneq_u16() {
let a: u32x4 = u32x4::new(8, 7, 6, 5);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u32x4 = u32x4::new(8, 9, 10, 11);
let r: u32x4 = transmute(vmlal_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_lane_u32() {
let a: u64x2 = u64x2::new(8, 7);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32x2 = u32x2::new(0, 2);
let e: u64x2 = u64x2::new(8, 9);
let r: u64x2 = transmute(vmlal_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_laneq_u32() {
let a: u64x2 = u64x2::new(8, 7);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u64x2 = u64x2::new(8, 9);
let r: u64x2 = transmute(vmlal_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmls_f64() {
let a: f64 = 6.;
@@ -9830,6 +10174,126 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_n_s16() {
let a: i32x4 = i32x4::new(14, 15, 16, 17);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16 = 2;
let e: i32x4 = i32x4::new(14, 13, 12, 11);
let r: i32x4 = transmute(vmlsl_high_n_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_n_s32() {
let a: i64x2 = i64x2::new(14, 15);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32 = 2;
let e: i64x2 = i64x2::new(14, 13);
let r: i64x2 = transmute(vmlsl_high_n_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_n_u16() {
let a: u32x4 = u32x4::new(14, 15, 16, 17);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16 = 2;
let e: u32x4 = u32x4::new(14, 13, 12, 11);
let r: u32x4 = transmute(vmlsl_high_n_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_n_u32() {
let a: u64x2 = u64x2::new(14, 15);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32 = 2;
let e: u64x2 = u64x2::new(14, 13);
let r: u64x2 = transmute(vmlsl_high_n_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_lane_s16() {
let a: i32x4 = i32x4::new(14, 15, 16, 17);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i32x4 = i32x4::new(14, 13, 12, 11);
let r: i32x4 = transmute(vmlsl_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_laneq_s16() {
let a: i32x4 = i32x4::new(14, 15, 16, 17);
let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i32x4 = i32x4::new(14, 13, 12, 11);
let r: i32x4 = transmute(vmlsl_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_lane_s32() {
let a: i64x2 = i64x2::new(14, 15);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32x2 = i32x2::new(0, 2);
let e: i64x2 = i64x2::new(14, 13);
let r: i64x2 = transmute(vmlsl_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_laneq_s32() {
let a: i64x2 = i64x2::new(14, 15);
let b: i32x4 = i32x4::new(3, 3, 0, 1);
let c: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i64x2 = i64x2::new(14, 13);
let r: i64x2 = transmute(vmlsl_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_lane_u16() {
let a: u32x4 = u32x4::new(14, 15, 16, 17);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u32x4 = u32x4::new(14, 13, 12, 11);
let r: u32x4 = transmute(vmlsl_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_laneq_u16() {
let a: u32x4 = u32x4::new(14, 15, 16, 17);
let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u32x4 = u32x4::new(14, 13, 12, 11);
let r: u32x4 = transmute(vmlsl_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_lane_u32() {
let a: u64x2 = u64x2::new(14, 15);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32x2 = u32x2::new(0, 2);
let e: u64x2 = u64x2::new(14, 13);
let r: u64x2 = transmute(vmlsl_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_laneq_u32() {
let a: u64x2 = u64x2::new(14, 15);
let b: u32x4 = u32x4::new(3, 3, 0, 1);
let c: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u64x2 = u64x2::new(14, 13);
let r: u64x2 = transmute(vmlsl_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_s16() {
let a: i8x8 = i8x8::new(0, 1, 2, 3, 2, 3, 4, 5);


@@ -37,7 +37,8 @@
external_doc,
allow_internal_unstable,
decl_macro,
extended_key_value_attributes
extended_key_value_attributes,
bench_black_box
)]
#![cfg_attr(test, feature(test, abi_vectorcall))]
#![cfg_attr(all(test, target_arch = "wasm32"), feature(wasm_simd))]


@@ -1222,6 +1222,68 @@ generate float64x*_t
arm = vmla.
generate float*_t
/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
aarch64 = mla
arm = vmla.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 3.
validate 6., 7., 8., 9.
aarch64 = fmul
arm = vmla.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
aarch64 = mla
arm = vmla.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
n = 1
validate 6., 7., 8., 9.
aarch64 = fmul
arm = vmla.
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
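Each `generate` line in these entries expands to one concrete intrinsic. For the plain `int16x4_t` case of the lane entry above, the emitted aarch64 function looks roughly like this (a sketch following the shape of the hand-written lane intrinsics earlier in this diff; the generator also emits the arm-side attributes, omitted here):

/// Vector multiply accumulate with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mla, LANE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vmla_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
    static_assert_imm2!(LANE);
    vmla_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}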
/// Signed multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
@@ -1246,6 +1308,41 @@ arm = vmlal.s
aarch64 = umlal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
/// Vector widening multiply accumulate with scalar
name = vmlal
n-suffix
multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
/// Vector widening multiply accumulate with scalar
name = vmlal_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
/// Signed multiply-add long
name = vmlal_high
no-q
@@ -1276,6 +1373,39 @@ validate 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = umlal2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Multiply-add long
name = vmlal_high_n
no-q
multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 2
validate 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = smlal2
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
/// Multiply-add long
name = vmlal_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = smlal2
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
@@ -1302,6 +1432,68 @@ generate float64x*_t
arm = vmls.
generate float*_t
/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = mls
arm = vmls.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 3.
validate 0., 1., 2., 3.
aarch64 = fmul
arm = vmls.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
/// Vector multiply subtract with scalar
name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = mls
arm = vmls.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
/// Vector multiply subtract with scalar
name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
n = 1
validate 0., 1., 2., 3.
aarch64 = fmul
arm = vmls.
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
/// Signed multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
@@ -1314,7 +1506,7 @@ arm = vmlsl.s
aarch64 = smlsl
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
/// Signed multiply-subtract long
/// Unsigned multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
@@ -1326,6 +1518,41 @@ arm = vmlsl.s
aarch64 = umlsl
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
/// Vector widening multiply subtract with scalar
name = vmlsl
n-suffix
multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
arm = vmlsl.s
aarch64 = smlsl
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
aarch64 = umlsl
generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
/// Vector widening multiply subtract with scalar
name = vmlsl_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
arm = vmlsl.s
aarch64 = smlsl
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
aarch64 = umlsl
generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
/// Signed multiply-subtract long
name = vmlsl_high
no-q
@@ -1356,6 +1583,39 @@ validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = umlsl2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Multiply-subtract long
name = vmlsl_high_n
no-q
multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 2
validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = smlsl2
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
aarch64 = umlsl2
generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
/// Multiply-subtract long
name = vmlsl_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = smlsl2
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
aarch64 = umlsl2
generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Extract narrow
name = vmovn_high
no-q


@@ -349,6 +349,7 @@ enum Suffix {
OutSuffix,
Lane,
In2,
In2Lane,
}
#[derive(Clone, Copy)]
@@ -847,6 +848,7 @@ fn gen_aarch64(
OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
};
let current_fn = if let Some(current_fn) = current_fn.clone() {
if link_aarch64.is_some() {
@@ -1259,6 +1261,7 @@ fn gen_arm(
OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
};
let current_aarch64 = current_aarch64
.clone()
@@ -2216,6 +2219,8 @@ mod test {
suffix = Lane;
} else if line.starts_with("in2-suffix") {
suffix = In2;
} else if line.starts_with("in2-lane-suffixes") {
suffix = In2Lane;
} else if line.starts_with("a = ") {
a = line[4..].split(',').map(|v| v.trim().to_string()).collect();
} else if line.starts_with("b = ") {


@@ -3,7 +3,7 @@
//! This basically just disassembles the current executable and then parses the
//! output once globally and then provides the `assert` function which makes
//! assertions about the disassembly of a function.
#![feature(test)] // For black_box
#![feature(bench_black_box)] // For black_box
#![deny(rust_2018_idioms)]
#![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]
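The `feature(test)` to `feature(bench_black_box)` switch here and in the crate root above tracks the nightly move of `black_box` into `core::hint` (it has since stabilized in `std::hint`). Its job in these tests is to keep the optimizer from folding the asserted function away; a minimal sketch of the idea, not this crate's actual harness:

use std::hint::black_box; // behind feature(bench_black_box) on 2021 nightlies

fn mul(a: i32, b: i32) -> i32 {
    a.wrapping_mul(b)
}

fn main() {
    // black_box hides the inputs and the result from the optimizer, so the
    // compiled `mul` keeps a real multiply instruction to assert against.
    let r = black_box(mul(black_box(3), black_box(4)));
    assert_eq!(r, 12);
}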