Add support for AArch64 i8mm *dot intrinsics.
This includes vsudot and vusdot, which perform mixed-signedness dot product operations.
This commit is contained in:
Committed by: Amanieu d'Antras
Parent commit: 55ef711226
This commit: 1e15fa3f0a
@@ -8353,6 +8353,62 @@ pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) {
|
||||
vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
|
||||
}
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(test, assert_instr(usdot, LANE = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vusdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t {
    // `c` is viewed as four 32-bit groups of four i8 elements; LANE must
    // index one of those groups (0..=3, hence a 2-bit immediate).
    static_assert_uimm_bits!(LANE, 2);
    let c: int32x4_t = transmute(c);
    // Broadcast the selected 32-bit group into both output lanes.
    let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
    // Delegate to the vector-form intrinsic with the duplicated group.
    vusdot_s32(a, b, transmute(c))
}
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(test, assert_instr(usdot, LANE = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vusdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
    // LANE selects one of the four 32-bit groups of `c` (2-bit immediate).
    static_assert_uimm_bits!(LANE, 2);
    let c: int32x4_t = transmute(c);
    // Broadcast the selected group to all four output lanes.
    let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vusdotq_s32(a, b, transmute(c))
}
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(test, assert_instr(sudot, LANE = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vsudot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t {
    // LANE selects one of the four 32-bit groups of unsigned `c`.
    static_assert_uimm_bits!(LANE, 2);
    let c: uint32x4_t = transmute(c);
    let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
    // SUDOT(signed b, unsigned c) == USDOT(unsigned c, signed b), so reuse
    // vusdot_s32 with the operands swapped.
    vusdot_s32(a, transmute(c), b)
}
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(test, assert_instr(sudot, LANE = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vsudotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t {
    // LANE selects one of the four 32-bit groups of unsigned `c`.
    static_assert_uimm_bits!(LANE, 2);
    let c: uint32x4_t = transmute(c);
    let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    // Swapped-operand USDOT implements SUDOT.
    vusdotq_s32(a, transmute(c), b)
}
|
||||
|
||||
/// Multiply
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64)
|
||||
@@ -22184,6 +22240,46 @@ mod test {
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdot_laneq_s32() {
    let a: i32x2 = i32x2::new(1000, -4200);
    let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
    // LANE = 3 selects c bytes (-8, -9, -10, -11), dotted against each
    // 4-byte group of b: 1000 - (800+990+1200+1430) = -3420, etc.
    let e: i32x2 = i32x2::new(-3420, -10140);
    let r: i32x2 = transmute(vusdot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdotq_laneq_s32() {
    let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
    let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
    let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
    // LANE = 3 selects c bytes (-8, -9, -10, -11) for every 4-byte group of b.
    let e: i32x4 = i32x4::new(-3420, -10140, -8460, -6980);
    let r: i32x4 = transmute(vusdotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vsudot_laneq_s32() {
    let a: i32x2 = i32x2::new(-2000, 4200);
    let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
    // LANE = 3 selects c bytes (220, 230, 240, 250) for every 4-byte group of b.
    let e: i32x2 = i32x2::new(300, 2740);
    let r: i32x2 = transmute(vsudot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vsudotq_laneq_s32() {
    let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
    let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
    let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
    // LANE = 3 selects c bytes (220, 230, 240, 250) for every 4-byte group of b.
    let e: i32x4 = i32x4::new(300, 2740, -6220, -6980);
    let r: i32x4 = transmute(vsudotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vmul_f64() {
|
||||
let a: f64 = 1.0;
|
||||
|
||||
@@ -16775,6 +16775,106 @@ pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
|
||||
vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
|
||||
}
|
||||
|
||||
/// Dot product vector form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
pub unsafe fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
    // Thin wrapper over the LLVM `usdot` intrinsic: each i32 lane of `a`
    // accumulates the dot product of a 4-byte group of `b` (unsigned) with
    // the corresponding group of `c` (signed).
    #[allow(improper_ctypes)]
    extern "unadjusted" {
        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")]
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8")]
        fn vusdot_s32_(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t;
    }
    vusdot_s32_(a, b, c)
}
|
||||
|
||||
/// Dot product vector form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
pub unsafe fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
    // 128-bit variant of vusdot_s32: four i32 accumulators, sixteen bytes
    // of unsigned `b` and signed `c`.
    #[allow(improper_ctypes)]
    extern "unadjusted" {
        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")]
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8")]
        fn vusdotq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t;
    }
    vusdotq_s32_(a, b, c)
}
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vusdot_lane_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
    // 64-bit `c` holds two 32-bit groups, so LANE is a 1-bit immediate.
    static_assert_uimm_bits!(LANE, 1);
    let c: int32x2_t = transmute(c);
    // Broadcast the selected group into both output lanes.
    let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
    vusdot_s32(a, b, transmute(c))
}
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vusdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t {
    // 64-bit `c` holds two 32-bit groups (1-bit LANE), broadcast to the
    // four output lanes of the 128-bit form.
    static_assert_uimm_bits!(LANE, 1);
    let c: int32x2_t = transmute(c);
    let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vusdotq_s32(a, b, transmute(c))
}
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vsudot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t {
    // 64-bit unsigned `c` holds two 32-bit groups (1-bit LANE).
    static_assert_uimm_bits!(LANE, 1);
    let c: uint32x2_t = transmute(c);
    let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
    // SUDOT(signed b, unsigned c) == USDOT(unsigned c, signed b).
    vusdot_s32(a, transmute(c), b)
}
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32)
#[inline]
#[target_feature(enable = "neon,i8mm")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vsudotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t {
    // 64-bit unsigned `c` holds two 32-bit groups (1-bit LANE), broadcast
    // to the four output lanes.
    static_assert_uimm_bits!(LANE, 1);
    let c: uint32x2_t = transmute(c);
    let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    // Swapped-operand USDOT implements SUDOT.
    vusdotq_s32(a, transmute(c), b)
}
|
||||
|
||||
/// Multiply
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8)
|
||||
@@ -37823,6 +37923,94 @@ mod test {
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdot_s32() {
    let a: i32x2 = i32x2::new(1000, -4200);
    let b: u8x8 = u8x8::new(100, 205, 110, 195, 120, 185, 130, 175);
    let c: i8x8 = i8x8::new(0, 1, 2, 3, -1, -2, -3, -4);
    // 1000 + (100,205,110,195).(0,1,2,3) = 2010;
    // -4200 + (120,185,130,175).(-1,-2,-3,-4) = -5780.
    let e: i32x2 = i32x2::new(2010, -5780);
    let r: i32x2 = transmute(vusdot_s32(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdotq_s32() {
    let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
    let b: u8x16 = u8x16::new(100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135);
    let c: i8x16 = i8x16::new(0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8);
    // Each output lane accumulates one 4-byte group of b dotted with the
    // corresponding group of c.
    let e: i32x4 = i32x4::new(2010, -5780, 2370, -1940);
    let r: i32x4 = transmute(vusdotq_s32(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdot_lane_s32() {
    // LANE = 0 selects c bytes (4, 3, 2, 1) for every 4-byte group of b.
    let a: i32x2 = i32x2::new(1000, -4200);
    let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let e: i32x2 = i32x2::new(2100, -2700);
    let r: i32x2 = transmute(vusdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);

    // LANE = 1 selects c bytes (0, -1, -2, -3).
    let a: i32x2 = i32x2::new(1000, -4200);
    let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let e: i32x2 = i32x2::new(260, -5180);
    let r: i32x2 = transmute(vusdot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vusdotq_lane_s32() {
    // LANE = 0 selects c bytes (4, 3, 2, 1) for every 4-byte group of b.
    let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
    let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
    let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let e: i32x4 = i32x4::new(2100, -2700, 900, 4300);
    let r: i32x4 = transmute(vusdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);

    // LANE = 1 selects c bytes (0, -1, -2, -3).
    let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
    let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
    let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let e: i32x4 = i32x4::new(260, -5180, -2220, 540);
    let r: i32x4 = transmute(vusdotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vsudot_lane_s32() {
    // LANE = 0 selects c bytes (100, 110, 120, 130) for every group of b.
    let a: i32x2 = i32x2::new(-2000, 4200);
    let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let e: i32x2 = i32x2::new(-900, 3460);
    let r: i32x2 = transmute(vsudot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);

    // LANE = 1 selects c bytes (140, 150, 160, 170).
    let a: i32x2 = i32x2::new(-2000, 4200);
    let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
    let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let e: i32x2 = i32x2::new(-500, 3220);
    let r: i32x2 = transmute(vsudot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon,i8mm")]
unsafe fn test_vsudotq_lane_s32() {
    // LANE = 0 selects c bytes (100, 110, 120, 130) for every group of b.
    let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
    let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
    let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let e: i32x4 = i32x4::new(-900, 3460, -3580, -2420);
    let r: i32x4 = transmute(vsudotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);

    // LANE = 1 selects c bytes (140, 150, 160, 170).
    let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
    let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
    let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
    let e: i32x4 = i32x4::new(-500, 3220, -4460, -3940);
    let r: i32x4 = transmute(vsudotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
    assert_eq!(r, e);
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vmul_s8() {
|
||||
let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
|
||||
|
||||
@@ -12,16 +12,6 @@ vbfmlaltq_f32
|
||||
vbfmlaltq_lane_f32
|
||||
vbfmlaltq_laneq_f32
|
||||
vbfmmlaq_f32
|
||||
vsudot_laneq_s32
|
||||
vsudot_lane_s32
|
||||
vsudotq_laneq_s32
|
||||
vsudotq_lane_s32
|
||||
vusdot_laneq_s32
|
||||
vusdot_lane_s32
|
||||
vusdotq_laneq_s32
|
||||
vusdotq_lane_s32
|
||||
vusdotq_s32
|
||||
vusdot_s32
|
||||
|
||||
|
||||
# Missing from both Clang and stdarch
|
||||
|
||||
@@ -12,16 +12,6 @@ vbfmlaltq_f32
|
||||
vbfmlaltq_lane_f32
|
||||
vbfmlaltq_laneq_f32
|
||||
vbfmmlaq_f32
|
||||
vsudot_laneq_s32
|
||||
vsudot_lane_s32
|
||||
vsudotq_laneq_s32
|
||||
vsudotq_lane_s32
|
||||
vusdot_laneq_s32
|
||||
vusdot_lane_s32
|
||||
vusdotq_laneq_s32
|
||||
vusdotq_lane_s32
|
||||
vusdotq_s32
|
||||
vusdot_s32
|
||||
|
||||
# Implemented in Clang and stdarch for A64 only even though CSV claims A32 support
|
||||
__crc32d
|
||||
|
||||
@@ -3478,27 +3478,138 @@ link-arm = vst4lane._EXTpi8r_
|
||||
const-arm = LANE
|
||||
generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
|
||||
|
||||
/// Dot product vector form with unsigned and signed integers
|
||||
name = vusdot
|
||||
out-suffix
|
||||
a = 1000, -4200, -1000, 2000
|
||||
b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135
|
||||
c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8
|
||||
aarch64 = usdot
|
||||
arm = vusdot
|
||||
target = i8mm
|
||||
|
||||
// 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3)
|
||||
// -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4)
|
||||
// ...
|
||||
validate 2010, -5780, 2370, -1940
|
||||
|
||||
link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
|
||||
link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t
|
||||
generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
|
||||
|
||||
link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
|
||||
link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t
|
||||
generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
|
||||
name = vusdot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
aarch64 = usdot
|
||||
arm = vusdot
|
||||
target = i8mm
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vusdot-out-noext, a, b, {transmute, c}
|
||||
a = 1000, -4200, -1000, 2000
|
||||
b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
|
||||
c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
|
||||
|
||||
// 1000 + (100, 110, 120, 130) . (4, 3, 2, 1)
|
||||
// -4200 + (140, 150, 160, 170) . (4, 3, 2, 1)
|
||||
// ...
|
||||
n = 0
|
||||
validate 2100, -2700, 900, 4300
|
||||
|
||||
// 1000 + (100, 110, 120, 130) . (0, -1, -2, -3)
|
||||
// -4200 + (140, 150, 160, 170) . (0, -1, -2, -3)
|
||||
// ...
|
||||
n = 1
|
||||
validate 260, -5180, -2220, 540
|
||||
|
||||
generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t
|
||||
generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t
|
||||
|
||||
/// Dot product index form with unsigned and signed integers
|
||||
name = vusdot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
// Only AArch64 has the laneq forms.
|
||||
aarch64 = usdot
|
||||
target = i8mm
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vusdot-out-noext, a, b, {transmute, c}
|
||||
a = 1000, -4200, -1000, 2000
|
||||
b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
|
||||
c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
|
||||
|
||||
// 1000 + (100, 110, 120, 130) . (-4, -5, -6, -7)
|
||||
// -4200 + (140, 150, 160, 170) . (-4, -5, -6, -7)
|
||||
// ...
|
||||
n = 3
|
||||
validate -3420, -10140, -8460, -6980
|
||||
|
||||
generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t
|
||||
generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
|
||||
name = vsudot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE}
|
||||
multi_fn = vsudot-outlane-_, a, b, c
|
||||
a = 1, 2, 1, 2
|
||||
b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
n = 0
|
||||
validate 31, 72, 31, 72
|
||||
target = dotprod
|
||||
|
||||
aarch64 = sudot
|
||||
link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t
|
||||
// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
|
||||
//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t
|
||||
link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t
|
||||
// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot
|
||||
//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t
|
||||
arm = vsudot
|
||||
target = i8mm
|
||||
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vusdot-out-noext, a, {transmute, c}, b
|
||||
a = -2000, 4200, -1000, 2000
|
||||
b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
|
||||
c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
|
||||
|
||||
// -2000 + (4, 3, 2, 1) . (100, 110, 120, 130)
|
||||
// 4200 + (0, -1, -2, -3) . (100, 110, 120, 130)
|
||||
// ...
|
||||
n = 0
|
||||
validate -900, 3460, -3580, -2420
|
||||
|
||||
// -2000 + (4, 3, 2, 1) . (140, 150, 160, 170)
|
||||
// 4200 + (0, -1, -2, -3) . (140, 150, 160, 170)
|
||||
// ...
|
||||
n = 1
|
||||
validate -500, 3220, -4460, -3940
|
||||
|
||||
generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t
|
||||
generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t
|
||||
|
||||
/// Dot product index form with signed and unsigned integers
|
||||
name = vsudot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
// Only AArch64 has the laneq forms.
|
||||
aarch64 = sudot
|
||||
target = i8mm
|
||||
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vusdot-out-noext, a, {transmute, c}, b
|
||||
a = -2000, 4200, -1000, 2000
|
||||
b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11
|
||||
c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250
|
||||
|
||||
// -2000 + (4, 3, 2, 1) . (220, 230, 240, 250)
|
||||
// 4200 + (0, -1, -2, -3) . (220, 230, 240, 250)
|
||||
// ...
|
||||
n = 3
|
||||
validate 300, 2740, -6220, -6980
|
||||
|
||||
generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t
|
||||
generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t
|
||||
|
||||
/// Multiply
|
||||
name = vmul
|
||||
|
||||
@@ -799,6 +799,19 @@ fn type_to_half(t: &str) -> &str {
|
||||
}
|
||||
}
|
||||
|
||||
fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String {
|
||||
assert_eq!(type_len(t) % elements_per_lane, 0);
|
||||
let prefix_len = t
|
||||
.find(|c: char| c.is_ascii_digit())
|
||||
.unwrap_or_else(|| t.len());
|
||||
format!(
|
||||
"{prefix}{bits}x{len}_t",
|
||||
prefix = &t[0..prefix_len],
|
||||
bits = type_bits(t) * elements_per_lane,
|
||||
len = type_len(t) / elements_per_lane
|
||||
)
|
||||
}
|
||||
|
||||
fn asc(start: i32, len: usize) -> String {
|
||||
let mut s = String::from("[");
|
||||
for i in 0..len {
|
||||
@@ -2993,6 +3006,12 @@ fn get_call(
|
||||
re = Some((re_params[0].clone(), in_t[1].to_string()));
|
||||
} else if re_params[1] == "out_t" {
|
||||
re = Some((re_params[0].clone(), out_t.to_string()));
|
||||
} else if re_params[1] == "out_unsigned" {
|
||||
re = Some((re_params[0].clone(), type_to_unsigned(out_t).to_string()));
|
||||
} else if re_params[1] == "out_signed" {
|
||||
re = Some((re_params[0].clone(), type_to_signed(out_t).to_string()));
|
||||
} else if re_params[1] == "merge4_t2" {
|
||||
re = Some((re_params[0].clone(), type_with_merged_lanes(in_t[2], 4)));
|
||||
} else if re_params[1] == "half" {
|
||||
re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string()));
|
||||
} else if re_params[1] == "in_ntt" {
|
||||
|
||||
Reference in New Issue
Block a user