Convert shuffle_ps and shuffle_pd to const generics (#1037)

minybot
2021-03-02 00:36:01 -05:00
committed by GitHub
parent f626d43780
commit 2a9f6349d8
5 changed files with 674 additions and 1137 deletions
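The pattern applied in every file below: the run-time `imm8: i32` argument becomes a const generic parameter that is checked at compile time (`static_assert_imm8!`, `static_assert_rounding!`), while `#[rustc_legacy_const_generics(2)]` keeps the familiar positional-argument call form working for users of `std::arch`; the in-crate tests switch to the turbofish form. A minimal, self-contained sketch of a call site (illustrative only, not part of the diff; the values are borrowed from the updated test_mm256_shuffle_pd, and AVX is detected at run time):

#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("avx") {
        unsafe {
            let a = _mm256_setr_pd(1., 4., 5., 8.);
            let b = _mm256_setr_pd(2., 3., 6., 7.);
            // External callers keep passing the immediate as a trailing argument; it
            // must still be a constant. Inside stdarch the same call is now written
            // as `_mm256_shuffle_pd::<0b1111>(a, b)`, as the updated tests show.
            let r = _mm256_shuffle_pd(a, b, 0b1111);
            let mut out = [0.0f64; 4];
            _mm256_storeu_pd(out.as_mut_ptr(), r);
            assert_eq!(out, [4., 3., 8., 7.]);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}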


@@ -113,44 +113,21 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
-#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))]
-#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        ($a:expr, $b:expr, $c:expr, $d:expr) => {
-            simd_shuffle4(a, b, [$a, $b, $c, $d])
-        };
-    }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr) => {
-            match (imm8 >> 3) & 0x1 {
-                0 => shuffle4!($a, $b, $c, 6),
-                _ => shuffle4!($a, $b, $c, 7),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 2) & 0x1 {
-                0 => shuffle3!($a, $b, 2),
-                _ => shuffle3!($a, $b, 3),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, 4),
-                _ => shuffle2!($a, 5),
-            }
-        };
-    }
-    match imm8 & 0x1 {
-        0 => shuffle1!(0),
-        _ => shuffle1!(1),
-    }
+pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
+    static_assert_imm8!(MASK);
+    simd_shuffle4(
+        a,
+        b,
+        [
+            MASK as u32 & 0b1,
+            ((MASK as u32 >> 1) & 0b1) + 4,
+            ((MASK as u32 >> 2) & 0b1) + 2,
+            ((MASK as u32 >> 3) & 0b1) + 6,
+        ],
+    )
}
/// Shuffles single-precision (32-bit) floating-point elements in `a` within
@@ -159,61 +136,25 @@ pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
-#[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))]
-#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
+#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 {
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
-        };
-    }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, 8, $e, $f, $g, 12),
-                1 => shuffle4!($a, $b, $c, 9, $e, $f, $g, 13),
-                2 => shuffle4!($a, $b, $c, 10, $e, $f, $g, 14),
-                _ => shuffle4!($a, $b, $c, 11, $e, $f, $g, 15),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, 8, $e, $f, 12),
-                1 => shuffle3!($a, $b, 9, $e, $f, 13),
-                2 => shuffle3!($a, $b, 10, $e, $f, 14),
-                _ => shuffle3!($a, $b, 11, $e, $f, 15),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, 0, $e, 4),
-                1 => shuffle2!($a, 1, $e, 5),
-                2 => shuffle2!($a, 2, $e, 6),
-                _ => shuffle2!($a, 3, $e, 7),
-            }
-        };
-    }
-    match imm8 & 0x3 {
-        0 => shuffle1!(0, 4),
-        1 => shuffle1!(1, 5),
-        2 => shuffle1!(2, 6),
-        _ => shuffle1!(3, 7),
-    }
+pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
+    static_assert_imm8!(MASK);
+    simd_shuffle8(
+        a,
+        b,
+        [
+            MASK as u32 & 0b11,
+            (MASK as u32 >> 2) & 0b11,
+            ((MASK as u32 >> 4) & 0b11) + 8,
+            ((MASK as u32 >> 6) & 0b11) + 8,
+            (MASK as u32 & 0b11) + 4,
+            ((MASK as u32 >> 2) & 0b11) + 4,
+            ((MASK as u32 >> 4) & 0b11) + 12,
+            ((MASK as u32 >> 6) & 0b11) + 12,
+        ],
+    )
}
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
@@ -3381,7 +3322,7 @@ mod tests {
    unsafe fn test_mm256_shuffle_pd() {
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
-        let r = _mm256_shuffle_pd(a, b, 0xF);
+        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
        let e = _mm256_setr_pd(4., 3., 8., 7.);
        assert_eq_m256d(r, e);
    }
@@ -3390,7 +3331,7 @@ mod tests {
    unsafe fn test_mm256_shuffle_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
-        let r = _mm256_shuffle_ps(a, b, 0x0F);
+        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
        assert_eq_m256(r, e);
    }
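To follow the new index arithmetic in _mm256_shuffle_pd above: simd_shuffle4 numbers the lanes of `a` as 0..4 and the lanes of `b` as 4..8, so each mask bit selects the even or odd element of the corresponding 128-bit half. A standalone sketch (not part of the commit) that reproduces the computation and checks it against the mask used in test_mm256_shuffle_pd:

// Mirrors the index expressions in the const-generic _mm256_shuffle_pd.
fn mm256_shuffle_pd_indices(mask: i32) -> [u32; 4] {
    let m = mask as u32;
    [
        m & 0b1,              // result[0]: a[0] or a[1]
        ((m >> 1) & 0b1) + 4, // result[1]: b[0] or b[1]
        ((m >> 2) & 0b1) + 2, // result[2]: a[2] or a[3]
        ((m >> 3) & 0b1) + 6, // result[3]: b[2] or b[3]
    ]
}

fn main() {
    // With every mask bit set (the low bits of the 0b11_11_11_11 used in the test),
    // the odd element of each half is picked:
    // a = (1., 4., 5., 8.), b = (2., 3., 6., 7.) gives (4., 3., 8., 7.).
    assert_eq!(mm256_shuffle_pd_indices(0b1111), [1, 5, 3, 7]);
}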

File diff suppressed because it is too large.


@@ -1,4 +1,20 @@
//! Utility macros.
+//!
+// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is
+// not a round number.
+pub(crate) struct ValidateConstRound<const IMM: i32>;
+impl<const IMM: i32> ValidateConstRound<IMM> {
+    pub(crate) const VALID: () = {
+        let _ = 1 / ((IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11) as usize);
+    };
+}
+
+#[allow(unused)]
+macro_rules! static_assert_rounding {
+    ($imm:ident) => {
+        let _ = $crate::core_arch::x86::macros::ValidateConstRound::<$imm>::VALID;
+    };
+}
macro_rules! constify_imm6 {
    ($imm8:expr, $expand:ident) => {

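The ValidateConstRound helper added above rejects a bad rounding immediate by forcing a divide-by-zero during const evaluation whenever the value is not one of 4, 8, 9, 10 or 11. A standalone sketch of the same compile-time-assert trick, using a hypothetical AssertEven predicate instead of the crate's rounding check:

// Referencing VALID forces `1 / 0` in const context whenever the predicate is
// false, which turns an invalid const generic argument into a compile error.
struct AssertEven<const N: i32>;

impl<const N: i32> AssertEven<N> {
    const VALID: () = {
        let _ = 1 / ((N % 2 == 0) as usize);
    };
}

fn takes_even<const N: i32>() {
    let _ = AssertEven::<N>::VALID; // compile-time check, no run-time cost
}

fn main() {
    takes_even::<4>(); // compiles
    // takes_even::<5>(); // would fail to compile with "attempt to divide by zero"
}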

@@ -2653,21 +2653,17 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, any(not(target_os = "windows"), target_arch = "x86")),
-    assert_instr(shufps, imm8 = 1)
+    cfg_attr(test, assert_instr(shufps, MASK = 2)) // FIXME shufpd expected
)]
#[cfg_attr(
    all(test, all(target_os = "windows", target_arch = "x86_64")),
-    assert_instr(shufpd, imm8 = 1)
+    cfg_attr(test, assert_instr(shufpd, MASK = 1))
)]
-#[rustc_args_required_const(2)]
+#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
-    match imm8 & 0b11 {
-        0b00 => simd_shuffle2(a, b, [0, 2]),
-        0b01 => simd_shuffle2(a, b, [1, 2]),
-        0b10 => simd_shuffle2(a, b, [0, 3]),
-        _ => simd_shuffle2(a, b, [1, 3]),
-    }
+pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_imm8!(MASK);
+    simd_shuffle2(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
}
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
@@ -4852,7 +4848,7 @@ mod tests {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
-        let r = _mm_shuffle_pd(a, b, 0);
+        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }


@@ -2920,7 +2920,7 @@ mod tests {
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_roundscale_pd() {
        let a = _mm512_set1_pd(1.1);
-        let r = _mm512_roundscale_pd(a, 0);
+        let r = _mm512_roundscale_pd::<0b00_00_00_00>(a);
        let e = _mm512_set1_pd(1.0);
        assert_eq_m512d(r, e);
    }
@@ -2928,10 +2928,10 @@ mod tests {
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_roundscale_pd() {
        let a = _mm512_set1_pd(1.1);
-        let r = _mm512_mask_roundscale_pd(a, 0, a, 0);
+        let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
        let e = _mm512_set1_pd(1.1);
        assert_eq_m512d(r, e);
-        let r = _mm512_mask_roundscale_pd(a, 0b11111111, a, 0);
+        let r = _mm512_mask_roundscale_pd::<0b00_00_00_00>(a, 0b11111111, a);
        let e = _mm512_set1_pd(1.0);
        assert_eq_m512d(r, e);
    }
@@ -2939,9 +2939,9 @@ mod tests {
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_roundscale_pd() {
        let a = _mm512_set1_pd(1.1);
-        let r = _mm512_maskz_roundscale_pd(0, a, 0);
+        let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
        assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_roundscale_pd(0b11111111, a, 0);
+        let r = _mm512_maskz_roundscale_pd::<0b00_00_00_00>(0b11111111, a);
        let e = _mm512_set1_pd(1.0);
        assert_eq_m512d(r, e);
    }
@@ -2949,7 +2949,7 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_roundscale_pd() {
        let a = _mm256_set1_pd(1.1);
-        let r = _mm256_roundscale_pd(a, 0);
+        let r = _mm256_roundscale_pd::<0b00_00_00_00>(a);
        let e = _mm256_set1_pd(1.0);
        assert_eq_m256d(r, e);
    }
@@ -2957,10 +2957,9 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_roundscale_pd() {
        let a = _mm256_set1_pd(1.1);
-        let r = _mm256_mask_roundscale_pd(a, 0, a, 0);
-        let e = _mm256_set1_pd(1.1);
-        assert_eq_m256d(r, e);
-        let r = _mm256_mask_roundscale_pd(a, 0b00001111, a, 0);
+        let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
+        assert_eq_m256d(r, a);
+        let r = _mm256_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00001111, a);
        let e = _mm256_set1_pd(1.0);
        assert_eq_m256d(r, e);
    }
@@ -2968,9 +2967,9 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_roundscale_pd() {
        let a = _mm256_set1_pd(1.1);
-        let r = _mm256_maskz_roundscale_pd(0, a, 0);
+        let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
        assert_eq_m256d(r, _mm256_setzero_pd());
-        let r = _mm256_maskz_roundscale_pd(0b00001111, a, 0);
+        let r = _mm256_maskz_roundscale_pd::<0b00_00_00_00>(0b00001111, a);
        let e = _mm256_set1_pd(1.0);
        assert_eq_m256d(r, e);
    }
@@ -2978,7 +2977,7 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_roundscale_pd() {
        let a = _mm_set1_pd(1.1);
-        let r = _mm_roundscale_pd(a, 0);
+        let r = _mm_roundscale_pd::<0b00_00_00_00>(a);
        let e = _mm_set1_pd(1.0);
        assert_eq_m128d(r, e);
    }
@@ -2986,10 +2985,10 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_roundscale_pd() {
        let a = _mm_set1_pd(1.1);
-        let r = _mm_mask_roundscale_pd(a, 0, a, 0);
+        let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0, a);
        let e = _mm_set1_pd(1.1);
        assert_eq_m128d(r, e);
-        let r = _mm_mask_roundscale_pd(a, 0b00000011, a, 0);
+        let r = _mm_mask_roundscale_pd::<0b00_00_00_00>(a, 0b00000011, a);
        let e = _mm_set1_pd(1.0);
        assert_eq_m128d(r, e);
    }
@@ -2997,9 +2996,9 @@ mod tests {
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_pd() {
        let a = _mm_set1_pd(1.1);
-        let r = _mm_maskz_roundscale_pd(0, a, 0);
+        let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0, a);
        assert_eq_m128d(r, _mm_setzero_pd());
-        let r = _mm_maskz_roundscale_pd(0b00000011, a, 0);
+        let r = _mm_maskz_roundscale_pd::<0b00_00_00_00>(0b00000011, a);
        let e = _mm_set1_pd(1.0);
        assert_eq_m128d(r, e);
    }
@@ -3102,7 +3101,7 @@ mod tests {
        let a = _mm512_set1_pd(f64::NAN);
        let b = _mm512_set1_pd(f64::MAX);
        let c = _mm512_set1_epi64(i32::MAX as i64);
-        let r = _mm512_fixupimm_pd(a, b, c, 5);
+        let r = _mm512_fixupimm_pd::<5>(a, b, c);
        let e = _mm512_set1_pd(0.0);
        assert_eq_m512d(r, e);
    }
@@ -3112,7 +3111,7 @@ mod tests {
        let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
        let b = _mm512_set1_pd(f64::MAX);
        let c = _mm512_set1_epi64(i32::MAX as i64);
-        let r = _mm512_mask_fixupimm_pd(a, 0b11110000, b, c, 5);
+        let r = _mm512_mask_fixupimm_pd::<5>(a, 0b11110000, b, c);
        let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
        assert_eq_m512d(r, e);
    }
@@ -3122,7 +3121,7 @@ mod tests {
        let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
        let b = _mm512_set1_pd(f64::MAX);
        let c = _mm512_set1_epi64(i32::MAX as i64);
-        let r = _mm512_maskz_fixupimm_pd(0b11110000, a, b, c, 5);
+        let r = _mm512_maskz_fixupimm_pd::<5>(0b11110000, a, b, c);
        let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.);
        assert_eq_m512d(r, e);
    }
@@ -3132,7 +3131,7 @@ mod tests {
        let a = _mm256_set1_pd(f64::NAN);
        let b = _mm256_set1_pd(f64::MAX);
        let c = _mm256_set1_epi64x(i32::MAX as i64);
-        let r = _mm256_fixupimm_pd(a, b, c, 5);
+        let r = _mm256_fixupimm_pd::<5>(a, b, c);
        let e = _mm256_set1_pd(0.0);
        assert_eq_m256d(r, e);
    }
@@ -3142,7 +3141,7 @@ mod tests {
        let a = _mm256_set1_pd(f64::NAN);
        let b = _mm256_set1_pd(f64::MAX);
        let c = _mm256_set1_epi64x(i32::MAX as i64);
-        let r = _mm256_mask_fixupimm_pd(a, 0b00001111, b, c, 5);
+        let r = _mm256_mask_fixupimm_pd::<5>(a, 0b00001111, b, c);
        let e = _mm256_set1_pd(0.0);
        assert_eq_m256d(r, e);
    }
@@ -3152,7 +3151,7 @@ mod tests {
        let a = _mm256_set1_pd(f64::NAN);
        let b = _mm256_set1_pd(f64::MAX);
        let c = _mm256_set1_epi64x(i32::MAX as i64);
-        let r = _mm256_maskz_fixupimm_pd(0b00001111, a, b, c, 5);
+        let r = _mm256_maskz_fixupimm_pd::<5>(0b00001111, a, b, c);
        let e = _mm256_set1_pd(0.0);
        assert_eq_m256d(r, e);
    }
@@ -3162,7 +3161,7 @@ mod tests {
        let a = _mm_set1_pd(f64::NAN);
        let b = _mm_set1_pd(f64::MAX);
        let c = _mm_set1_epi64x(i32::MAX as i64);
-        let r = _mm_fixupimm_pd(a, b, c, 5);
+        let r = _mm_fixupimm_pd::<5>(a, b, c);
        let e = _mm_set1_pd(0.0);
        assert_eq_m128d(r, e);
    }
@@ -3172,7 +3171,7 @@ mod tests {
        let a = _mm_set1_pd(f64::NAN);
        let b = _mm_set1_pd(f64::MAX);
        let c = _mm_set1_epi64x(i32::MAX as i64);
-        let r = _mm_mask_fixupimm_pd(a, 0b00000011, b, c, 5);
+        let r = _mm_mask_fixupimm_pd::<5>(a, 0b00000011, b, c);
        let e = _mm_set1_pd(0.0);
        assert_eq_m128d(r, e);
    }
@@ -3182,7 +3181,7 @@ mod tests {
        let a = _mm_set1_pd(f64::NAN);
        let b = _mm_set1_pd(f64::MAX);
        let c = _mm_set1_epi64x(i32::MAX as i64);
-        let r = _mm_maskz_fixupimm_pd(0b00000011, a, b, c, 5);
+        let r = _mm_maskz_fixupimm_pd::<5>(0b00000011, a, b, c);
        let e = _mm_set1_pd(0.0);
        assert_eq_m128d(r, e);
    }
@@ -3192,7 +3191,7 @@ mod tests {
        let a = _mm512_set1_epi64(1 << 2);
        let b = _mm512_set1_epi64(1 << 1);
        let c = _mm512_set1_epi64(1 << 0);
-        let r = _mm512_ternarylogic_epi64(a, b, c, 8);
+        let r = _mm512_ternarylogic_epi64::<8>(a, b, c);
        let e = _mm512_set1_epi64(0);
        assert_eq_m512i(r, e);
    }
@@ -3202,9 +3201,9 @@ mod tests {
        let src = _mm512_set1_epi64(1 << 2);
        let a = _mm512_set1_epi64(1 << 1);
        let b = _mm512_set1_epi64(1 << 0);
-        let r = _mm512_mask_ternarylogic_epi64(src, 0, a, b, 8);
+        let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0, a, b);
        assert_eq_m512i(r, src);
-        let r = _mm512_mask_ternarylogic_epi64(src, 0b11111111, a, b, 8);
+        let r = _mm512_mask_ternarylogic_epi64::<8>(src, 0b11111111, a, b);
        let e = _mm512_set1_epi64(0);
        assert_eq_m512i(r, e);
    }
@@ -3214,9 +3213,9 @@ mod tests {
        let a = _mm512_set1_epi64(1 << 2);
        let b = _mm512_set1_epi64(1 << 1);
        let c = _mm512_set1_epi64(1 << 0);
-        let r = _mm512_maskz_ternarylogic_epi64(0, a, b, c, 9);
+        let r = _mm512_maskz_ternarylogic_epi64::<8>(0, a, b, c);
        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_ternarylogic_epi64(0b11111111, a, b, c, 8);
+        let r = _mm512_maskz_ternarylogic_epi64::<8>(0b11111111, a, b, c);
        let e = _mm512_set1_epi64(0);
        assert_eq_m512i(r, e);
    }
@@ -3226,7 +3225,7 @@ mod tests {
        let a = _mm256_set1_epi64x(1 << 2);
        let b = _mm256_set1_epi64x(1 << 1);
        let c = _mm256_set1_epi64x(1 << 0);
-        let r = _mm256_ternarylogic_epi64(a, b, c, 8);
+        let r = _mm256_ternarylogic_epi64::<8>(a, b, c);
        let e = _mm256_set1_epi64x(0);
        assert_eq_m256i(r, e);
    }
@@ -3236,9 +3235,9 @@ mod tests {
        let src = _mm256_set1_epi64x(1 << 2);
        let a = _mm256_set1_epi64x(1 << 1);
        let b = _mm256_set1_epi64x(1 << 0);
-        let r = _mm256_mask_ternarylogic_epi64(src, 0, a, b, 8);
+        let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0, a, b);
        assert_eq_m256i(r, src);
-        let r = _mm256_mask_ternarylogic_epi64(src, 0b00001111, a, b, 8);
+        let r = _mm256_mask_ternarylogic_epi64::<8>(src, 0b00001111, a, b);
        let e = _mm256_set1_epi64x(0);
        assert_eq_m256i(r, e);
    }
@@ -3248,9 +3247,9 @@ mod tests {
        let a = _mm256_set1_epi64x(1 << 2);
        let b = _mm256_set1_epi64x(1 << 1);
        let c = _mm256_set1_epi64x(1 << 0);
-        let r = _mm256_maskz_ternarylogic_epi64(0, a, b, c, 9);
+        let r = _mm256_maskz_ternarylogic_epi64::<9>(0, a, b, c);
        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_ternarylogic_epi64(0b00001111, a, b, c, 8);
+        let r = _mm256_maskz_ternarylogic_epi64::<8>(0b00001111, a, b, c);
        let e = _mm256_set1_epi64x(0);
        assert_eq_m256i(r, e);
    }
@@ -3260,7 +3259,7 @@ mod tests {
        let a = _mm_set1_epi64x(1 << 2);
        let b = _mm_set1_epi64x(1 << 1);
        let c = _mm_set1_epi64x(1 << 0);
-        let r = _mm_ternarylogic_epi64(a, b, c, 8);
+        let r = _mm_ternarylogic_epi64::<8>(a, b, c);
        let e = _mm_set1_epi64x(0);
        assert_eq_m128i(r, e);
    }
@@ -3270,9 +3269,9 @@ mod tests {
        let src = _mm_set1_epi64x(1 << 2);
        let a = _mm_set1_epi64x(1 << 1);
        let b = _mm_set1_epi64x(1 << 0);
-        let r = _mm_mask_ternarylogic_epi64(src, 0, a, b, 8);
+        let r = _mm_mask_ternarylogic_epi64::<8>(src, 0, a, b);
        assert_eq_m128i(r, src);
-        let r = _mm_mask_ternarylogic_epi64(src, 0b00000011, a, b, 8);
+        let r = _mm_mask_ternarylogic_epi64::<8>(src, 0b00000011, a, b);
        let e = _mm_set1_epi64x(0);
        assert_eq_m128i(r, e);
    }
@@ -3282,9 +3281,9 @@ mod tests {
        let a = _mm_set1_epi64x(1 << 2);
        let b = _mm_set1_epi64x(1 << 1);
        let c = _mm_set1_epi64x(1 << 0);
-        let r = _mm_maskz_ternarylogic_epi64(0, a, b, c, 9);
+        let r = _mm_maskz_ternarylogic_epi64::<9>(0, a, b, c);
        assert_eq_m128i(r, _mm_setzero_si128());
-        let r = _mm_maskz_ternarylogic_epi64(0b00000011, a, b, c, 8);
+        let r = _mm_maskz_ternarylogic_epi64::<8>(0b00000011, a, b, c);
        let e = _mm_set1_epi64x(0);
        assert_eq_m128i(r, e);
    }
@@ -5308,10 +5307,10 @@ mod tests {
    unsafe fn test_mm512_add_round_pd() {
        let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
        let b = _mm512_set1_pd(-1.);
-        let r = _mm512_add_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -1.0);
        assert_eq_m512d(r, e);
-        let r = _mm512_add_round_pd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let r = _mm512_add_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_setr_pd(7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999999999999);
        assert_eq_m512d(r, e);
    }
@@ -5320,14 +5319,12 @@ mod tests {
    unsafe fn test_mm512_mask_add_round_pd() {
        let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
        let b = _mm512_set1_pd(-1.);
-        let r = _mm512_mask_add_round_pd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            a, 0, a, b,
+        );
        assert_eq_m512d(r, a);
-        let r = _mm512_mask_add_round_pd(
-            a,
-            0b11110000,
-            a,
-            b,
-            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
-        );
+        let r = _mm512_mask_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            a, 0b11110000, a, b,
+        );
        let e = _mm512_setr_pd(8., 9.5, 10., 11.5, 11., 12.5, 13., -1.0);
        assert_eq_m512d(r, e);
@@ -5337,13 +5334,11 @@ mod tests {
    unsafe fn test_mm512_maskz_add_round_pd() {
        let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007);
        let b = _mm512_set1_pd(-1.);
-        let r = _mm512_maskz_add_round_pd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        let r =
+            _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_add_round_pd(
-            0b11110000,
-            a,
-            b,
-            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
-        );
+        let r = _mm512_maskz_add_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            0b11110000, a, b,
+        );
        let e = _mm512_setr_pd(0., 0., 0., 0., 11., 12.5, 13., -1.0);
        assert_eq_m512d(r, e);
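A small syntactic point visible in the add_round hunks above: when the immediate is an expression rather than a literal or a plain path (here an OR of two rounding-control flags), the const generic argument must be wrapped in braces. An illustrative sketch; the flag values below (0x00 for _MM_FROUND_TO_NEAREST_INT, 0x08 for _MM_FROUND_NO_EXC) are assumed to match the constants exported by std::arch:

// Const generic arguments that are expressions need braces.
const TO_NEAREST: i32 = 0x00; // stands in for _MM_FROUND_TO_NEAREST_INT
const NO_EXC: i32 = 0x08; // stands in for _MM_FROUND_NO_EXC

fn with_rounding<const ROUNDING: i32>() -> i32 {
    ROUNDING
}

fn main() {
    // A literal or a bare path needs no braces...
    assert_eq!(with_rounding::<NO_EXC>(), 8);
    // ...but an expression does, exactly as in the updated tests.
    assert_eq!(with_rounding::<{ TO_NEAREST | NO_EXC }>(), 8);
}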
@@ -9715,70 +9710,13 @@ mod tests {
        assert_eq_m128d(r, e);
    }
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_shuffle_pd() {
-        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
-        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
-        let r = _mm512_shuffle_pd(
-            a,
-            b,
-            1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7,
-        );
-        let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.);
-        assert_eq_m512d(r, e);
-    }
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_mask_shuffle_pd() {
-        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
-        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
-        let r = _mm512_mask_shuffle_pd(
-            a,
-            0,
-            a,
-            b,
-            1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7,
-        );
-        assert_eq_m512d(r, a);
-        let r = _mm512_mask_shuffle_pd(
-            a,
-            0b11111111,
-            a,
-            b,
-            1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7,
-        );
-        let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.);
-        assert_eq_m512d(r, e);
-    }
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_maskz_shuffle_pd() {
-        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
-        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
-        let r = _mm512_maskz_shuffle_pd(
-            0,
-            a,
-            b,
-            1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7,
-        );
-        assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_shuffle_pd(
-            0b00001111,
-            a,
-            b,
-            1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7,
-        );
-        let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.);
-        assert_eq_m512d(r, e);
-    }
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_shuffle_pd() {
        let a = _mm256_set_pd(1., 4., 5., 8.);
        let b = _mm256_set_pd(2., 3., 6., 7.);
-        let r = _mm256_mask_shuffle_pd(a, 0, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3);
+        let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
        assert_eq_m256d(r, a);
-        let r = _mm256_mask_shuffle_pd(a, 0b00001111, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3);
+        let r = _mm256_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00001111, a, b);
        let e = _mm256_set_pd(2., 1., 6., 5.);
        assert_eq_m256d(r, e);
    }
@@ -9787,9 +9725,9 @@ mod tests {
    unsafe fn test_mm256_maskz_shuffle_pd() {
        let a = _mm256_set_pd(1., 4., 5., 8.);
        let b = _mm256_set_pd(2., 3., 6., 7.);
-        let r = _mm256_maskz_shuffle_pd(0, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3);
+        let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
        assert_eq_m256d(r, _mm256_setzero_pd());
-        let r = _mm256_maskz_shuffle_pd(0b00001111, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3);
+        let r = _mm256_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b);
        let e = _mm256_set_pd(2., 1., 6., 5.);
        assert_eq_m256d(r, e);
    }
@@ -9798,9 +9736,9 @@ mod tests {
    unsafe fn test_mm_mask_shuffle_pd() {
        let a = _mm_set_pd(1., 4.);
        let b = _mm_set_pd(2., 3.);
-        let r = _mm_mask_shuffle_pd(a, 0, a, b, 1 << 0 | 1 << 1);
+        let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
        assert_eq_m128d(r, a);
-        let r = _mm_mask_shuffle_pd(a, 0b00000011, a, b, 1 << 0 | 1 << 1);
+        let r = _mm_mask_shuffle_pd::<0b11_11_11_11>(a, 0b00000011, a, b);
        let e = _mm_set_pd(2., 1.);
        assert_eq_m128d(r, e);
    }
@@ -9809,9 +9747,9 @@ mod tests {
    unsafe fn test_mm_maskz_shuffle_pd() {
        let a = _mm_set_pd(1., 4.);
        let b = _mm_set_pd(2., 3.);
-        let r = _mm_maskz_shuffle_pd(0, a, b, 1 << 0 | 1 << 1);
+        let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
        assert_eq_m128d(r, _mm_setzero_pd());
-        let r = _mm_maskz_shuffle_pd(0b00000011, a, b, 1 << 0 | 1 << 1);
+        let r = _mm_maskz_shuffle_pd::<0b11_11_11_11>(0b00000011, a, b);
        let e = _mm_set_pd(2., 1.);
        assert_eq_m128d(r, e);
    }