reformat with latest rustfmt
@@ -16,36 +16,36 @@ extern "C" {
|
||||
fn vsha1h_u32_(hash_e: u32) -> u32;
|
||||
#[link_name = "llvm.aarch64.crypto.sha1su0"]
|
||||
fn vsha1su0q_u32_(
|
||||
w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
|
||||
w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha1su1"]
|
||||
fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha1c"]
|
||||
fn vsha1cq_u32_(
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha1p"]
|
||||
fn vsha1pq_u32_(
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha1m"]
|
||||
fn vsha1mq_u32_(
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
|
||||
hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
|
||||
#[link_name = "llvm.aarch64.crypto.sha256h"]
|
||||
fn vsha256hq_u32_(
|
||||
hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
|
||||
hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha256h2"]
|
||||
fn vsha256h2q_u32_(
|
||||
hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
|
||||
hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha256su0"]
|
||||
fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t;
|
||||
#[link_name = "llvm.aarch64.crypto.sha256su1"]
|
||||
fn vsha256su1q_u32_(
|
||||
tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
|
||||
tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
|
||||
) -> uint32x4_t;
|
||||
}
|
||||
|
||||
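As an aside for readers of this hunk: the externs above bind straight to LLVM's AArch64 crypto builtins, and the safe-ish entry points are the vsha* wrappers reformatted below. A minimal sketch of how the two SHA-256 schedule-update intrinsics compose (an illustration assuming an AArch64 target with the `crypto` feature, not part of this commit):

unsafe fn sha256_schedule_update(
    w0_3: uint32x4_t,   // w[0..4]
    w4_7: uint32x4_t,   // w[4..8]
    w8_11: uint32x4_t,  // w[8..12]
    w12_15: uint32x4_t, // w[12..16]
) -> uint32x4_t {
    // su0 folds in the sigma0 terms, su1 the sigma1 terms, producing
    // the next four message-schedule words w[16..20].
    let t = vsha256su0q_u32(w0_3, w4_7);
    vsha256su1q_u32(t, w8_11, w12_15)
}
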
@@ -97,7 +97,7 @@ pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 {
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha1c))]
pub unsafe fn vsha1cq_u32(
- hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+ hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
) -> uint32x4_t {
vsha1cq_u32_(hash_abcd, hash_e, wk)
}
@@ -107,7 +107,7 @@ pub unsafe fn vsha1cq_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha1m))]
pub unsafe fn vsha1mq_u32(
- hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+ hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
) -> uint32x4_t {
vsha1mq_u32_(hash_abcd, hash_e, wk)
}
@@ -117,7 +117,7 @@ pub unsafe fn vsha1mq_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha1p))]
pub unsafe fn vsha1pq_u32(
- hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+ hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
) -> uint32x4_t {
vsha1pq_u32_(hash_abcd, hash_e, wk)
}
@@ -127,7 +127,7 @@ pub unsafe fn vsha1pq_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha1su0))]
pub unsafe fn vsha1su0q_u32(
- w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
+ w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
) -> uint32x4_t {
vsha1su0q_u32_(w0_3, w4_7, w8_11)
}
@@ -137,7 +137,7 @@ pub unsafe fn vsha1su0q_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha1su1))]
pub unsafe fn vsha1su1q_u32(
- tw0_3: uint32x4_t, w12_15: uint32x4_t
+ tw0_3: uint32x4_t, w12_15: uint32x4_t,
) -> uint32x4_t {
vsha1su1q_u32_(tw0_3, w12_15)
}
@@ -147,7 +147,7 @@ pub unsafe fn vsha1su1q_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha256h))]
pub unsafe fn vsha256hq_u32(
- hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
+ hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
) -> uint32x4_t {
vsha256hq_u32_(hash_abcd, hash_efgh, wk)
}
@@ -157,7 +157,7 @@ pub unsafe fn vsha256hq_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha256h2))]
pub unsafe fn vsha256h2q_u32(
- hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
+ hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
) -> uint32x4_t {
vsha256h2q_u32_(hash_efgh, hash_abcd, wk)
}
@@ -167,7 +167,7 @@ pub unsafe fn vsha256h2q_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha256su0))]
pub unsafe fn vsha256su0q_u32(
- w0_3: uint32x4_t, w4_7: uint32x4_t
+ w0_3: uint32x4_t, w4_7: uint32x4_t,
) -> uint32x4_t {
vsha256su0q_u32_(w0_3, w4_7)
}
@@ -177,7 +177,7 @@ pub unsafe fn vsha256su0q_u32(
#[target_feature(enable = "crypto")]
#[cfg_attr(test, assert_instr(sha256su1))]
pub unsafe fn vsha256su1q_u32(
- tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
+ tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
) -> uint32x4_t {
vsha256su1q_u32_(tw0_3, w8_11, w12_15)
}
@@ -199,22 +199,8 @@ mod tests {
assert_eq!(
r,
u8x16::new(
- 124,
- 123,
- 124,
- 118,
- 124,
- 123,
- 124,
- 197,
- 124,
- 123,
- 124,
- 118,
- 124,
- 123,
- 124,
- 197
+ 124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118,
+ 124, 123, 124, 197
)
);
}
@@ -229,22 +215,7 @@ mod tests {
assert_eq!(
r,
u8x16::new(
- 9,
- 213,
- 9,
- 251,
- 9,
- 213,
- 9,
- 56,
- 9,
- 213,
- 9,
- 251,
- 9,
- 213,
- 9,
- 56
+ 9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56
)
);
}
@@ -256,24 +227,7 @@ mod tests {
let r: u8x16 = vaesmcq_u8(data).into_bits();
assert_eq!(
r,
- u8x16::new(
- 3,
- 4,
- 9,
- 10,
- 15,
- 8,
- 21,
- 30,
- 3,
- 4,
- 9,
- 10,
- 15,
- 8,
- 21,
- 30
- )
+ u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30)
);
}

@@ -285,22 +239,8 @@ mod tests {
assert_eq!(
r,
u8x16::new(
- 43,
- 60,
- 33,
- 50,
- 103,
- 80,
- 125,
- 70,
- 43,
- 60,
- 33,
- 50,
- 103,
- 80,
- 125,
- 70
+ 43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80,
+ 125, 70
)
);
}

@@ -546,7 +546,6 @@ pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
vpmaxq_f64_(a, b)
}


#[cfg(test)]
mod tests {
use coresimd::aarch64::*;
@@ -800,20 +799,11 @@ mod tests {
#[simd_test(enable = "neon")]
unsafe fn test_vpminq_s8() {
- #[cfg_attr(rustfmt, skip)]
- let a = i8x16::new(
- 1, -2, 3, -4, 5, 6, 7, 8,
- 1, 2, 3, 4, 5, 6, 7, 8
- );
+ let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
- #[cfg_attr(rustfmt, skip)]
- let b = i8x16::new(
- 0, 3, 2, 5, 4, 7, 6, 9,
- 0, 3, 2, 5, 4, 7, 6, 9
- );
+ let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
- #[cfg_attr(rustfmt, skip)]
- let e = i8x16::new(
- -2, -4, 5, 7, 1, 3, 5, 7,
- 0, 2, 4, 6, 0, 2, 4, 6,
- );
+ let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
let r: i8x16 = vpminq_s8(a.into_bits(), b.into_bits()).into_bits();
assert_eq!(r, e);
}
@@ -839,20 +829,11 @@ mod tests {
#[simd_test(enable = "neon")]
unsafe fn test_vpminq_u8() {
- #[cfg_attr(rustfmt, skip)]
- let a = u8x16::new(
- 1, 2, 3, 4, 5, 6, 7, 8,
- 1, 2, 3, 4, 5, 6, 7, 8
- );
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
- #[cfg_attr(rustfmt, skip)]
- let b = u8x16::new(
- 0, 3, 2, 5, 4, 7, 6, 9,
- 0, 3, 2, 5, 4, 7, 6, 9
- );
+ let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
- #[cfg_attr(rustfmt, skip)]
- let e = u8x16::new(
- 1, 3, 5, 7, 1, 3, 5, 7,
- 0, 2, 4, 6, 0, 2, 4, 6,
- );
+ let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
let r: u8x16 = vpminq_u8(a.into_bits(), b.into_bits()).into_bits();
assert_eq!(r, e);
}
@@ -896,20 +877,11 @@ mod tests {
#[simd_test(enable = "neon")]
unsafe fn test_vpmaxq_s8() {
- #[cfg_attr(rustfmt, skip)]
- let a = i8x16::new(
- 1, -2, 3, -4, 5, 6, 7, 8,
- 1, 2, 3, 4, 5, 6, 7, 8
- );
+ let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
- #[cfg_attr(rustfmt, skip)]
- let b = i8x16::new(
- 0, 3, 2, 5, 4, 7, 6, 9,
- 0, 3, 2, 5, 4, 7, 6, 9
- );
+ let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
- #[cfg_attr(rustfmt, skip)]
- let e = i8x16::new(
- 1, 3, 6, 8, 2, 4, 6, 8,
- 3, 5, 7, 9, 3, 5, 7, 9,
- );
+ let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
let r: i8x16 = vpmaxq_s8(a.into_bits(), b.into_bits()).into_bits();
assert_eq!(r, e);
}
@@ -935,20 +907,11 @@ mod tests {
#[simd_test(enable = "neon")]
unsafe fn test_vpmaxq_u8() {
- #[cfg_attr(rustfmt, skip)]
- let a = u8x16::new(
- 1, 2, 3, 4, 5, 6, 7, 8,
- 1, 2, 3, 4, 5, 6, 7, 8
- );
+ let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
- #[cfg_attr(rustfmt, skip)]
- let b = u8x16::new(
- 0, 3, 2, 5, 4, 7, 6, 9,
- 0, 3, 2, 5, 4, 7, 6, 9
- );
+ let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
- #[cfg_attr(rustfmt, skip)]
- let e = u8x16::new(
- 2, 4, 6, 8, 2, 4, 6, 8,
- 3, 5, 7, 9, 3, 5, 7, 9,
- );
+ let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
let r: u8x16 = vpmaxq_u8(a.into_bits(), b.into_bits()).into_bits();
assert_eq!(r, e);
}

@@ -19,11 +19,19 @@ pub use self::v7::*;

// NEON is supported on AArch64, and on ARM when built with the v7 and neon
// features. Building ARM without neon produces incorrect codegen.
- #[cfg(any(target_arch = "aarch64",
+ #[cfg(
+ any(
+ target_arch = "aarch64",
all(target_feature = "v7", target_feature = "neon"),
- dox))]
+ dox
+ )
+ )]
mod neon;
- #[cfg(any(target_arch = "aarch64",
+ #[cfg(
+ any(
+ target_arch = "aarch64",
all(target_feature = "v7", target_feature = "neon"),
- dox))]
+ dox
+ )
+ )]
pub use self::neon::*;

@@ -366,52 +366,82 @@ impl_from_bits_!(

#[allow(improper_ctypes)]
extern "C" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32"
+ )]
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8"
+ )]
fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16"
+ )]
fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32"
+ )]
fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8"
+ )]
fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16"
+ )]
fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32"
+ )]
fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32"
+ )]
fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;

#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8"
+ )]
fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16"
+ )]
fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32"
+ )]
fn vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8"
+ )]
fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16"
+ )]
fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32"
+ )]
fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")]
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")]
+ #[cfg_attr(
+ target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32"
+ )]
fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
}

@@ -916,7 +946,6 @@ pub unsafe fn vpmax_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t {
vpmaxf_v2f32(a, b)
}


#[cfg(test)]
mod tests {
use coresimd::arm::*;

@@ -75,8 +75,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- i8x16:
- vector_signed_char,
+ i8x16: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -114,8 +113,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- u8x16:
- vector_signed_char,
+ u8x16: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -135,11 +133,7 @@ impl_from_bits_!(
vector_bool_short,
vector_bool_int
);
- impl_from_bits_!(
- m8x16: vector_bool_char,
- vector_bool_short,
- vector_bool_int
- );
+ impl_from_bits_!(m8x16: vector_bool_char, vector_bool_short, vector_bool_int);

impl_from_bits_!(
vector_signed_short: u64x2,
@@ -166,8 +160,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- i16x8:
- vector_signed_char,
+ i16x8: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -204,8 +197,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- u16x8:
- vector_signed_char,
+ u16x8: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -251,8 +243,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- i32x4:
- vector_signed_char,
+ i32x4: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -289,8 +280,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- u32x4:
- vector_signed_char,
+ u32x4: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -345,8 +335,7 @@ impl_from_bits_!(
vector_bool_int
);
impl_from_bits_!(
- f32x4:
- vector_signed_char,
+ f32x4: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -361,9 +350,13 @@ impl_from_bits_!(
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.ppc.altivec.vperm"]
- fn vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int;
+ fn vperm(
+ a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+ ) -> vector_signed_int;
#[link_name = "llvm.ppc.altivec.vmhaddshs"]
- fn vmhaddshs(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short;
+ fn vmhaddshs(
+ a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+ ) -> vector_signed_short;
}

mod sealed {
@@ -373,7 +366,9 @@ mod sealed {
#[inline]
#[target_feature(enable = "altivec")]
#[cfg_attr(test, assert_instr(vperm))]
- unsafe fn vec_vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int {
+ unsafe fn vec_vperm(
+ a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+ ) -> vector_signed_int {
vperm(a, b, c)
}

@@ -703,7 +698,6 @@ where
a.vec_add(b)
}


/// Endian-biased intrinsics
#[cfg(target_endian = "little")]
mod endian {
@@ -718,8 +712,10 @@ mod endian {
// vperm has big-endian bias
//
// Xor the mask and flip the arguments
- let d = u8x16::new(255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255).into_bits();
+ let d = u8x16::new(
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255,
+ ).into_bits();
let c = simd_xor(c, d);

b.vec_vperm(a, c)
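A scalar model of the little-endian trick above (an illustration, not this commit's code, and an assumption about vperm's semantics): vperm indexes the 32 bytes of a ++ b in big-endian order via the low five bits of each mask byte, so complementing the mask (on those bits, i ^ 0b1_1111 == 31 - i) and swapping the operands yields the little-endian selection.

fn perm_be(a: &[u8; 16], b: &[u8; 16], c: &[u8; 16]) -> [u8; 16] {
    let mut r = [0u8; 16];
    for i in 0..16 {
        let idx = (c[i] & 0x1f) as usize; // low 5 bits index a ++ b
        r[i] = if idx < 16 { a[idx] } else { b[idx - 16] };
    }
    r
}

fn perm_le(a: &[u8; 16], b: &[u8; 16], c: &[u8; 16]) -> [u8; 16] {
    let mut m = [0u8; 16];
    for i in 0..16 {
        m[i] = c[i] ^ 0xff; // flip the selection bits
    }
    perm_be(b, a, &m) // and swap the sources
}
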
@@ -730,7 +726,9 @@
#[inline]
#[target_feature(enable = "altivec")]
#[cfg_attr(test, assert_instr(vmhaddshs))]
- pub unsafe fn vec_madds(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short {
+ pub unsafe fn vec_madds(
+ a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+ ) -> vector_signed_short {
vmhaddshs(a, b, c)
}

@@ -850,9 +848,20 @@ mod tests {

#[simd_test(enable = "altivec")]
unsafe fn test_vec_madds() {
- let a: vector_signed_short = i16x8::new(0 * 256, 1 * 256, 2 * 256, 3 * 256, 4 * 256, 5 * 256, 6 * 256, 7 * 256).into_bits();
- let b: vector_signed_short = i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
- let c: vector_signed_short = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();
+ let a: vector_signed_short = i16x8::new(
+ 0 * 256,
+ 1 * 256,
+ 2 * 256,
+ 3 * 256,
+ 4 * 256,
+ 5 * 256,
+ 6 * 256,
+ 7 * 256,
+ ).into_bits();
+ let b: vector_signed_short =
+ i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
+ let c: vector_signed_short =
+ i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();

let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21);

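The expected vector in this test follows directly from vmhaddshs's per-lane saturated high multiply-add; a scalar check, stated as an assumption about the semantics rather than taken from this diff:

fn madds_lane(a: i16, b: i16, c: i16) -> i16 {
    // High part of the 17-bit-shifted product, then a saturating add of c.
    let hi = ((a as i32) * (b as i32)) >> 15;
    let sum = hi + c as i32;
    sum.max(-32768).min(32767) as i16
}

// Lane i here: a = i * 256 and b = 256, so (i * 65536) >> 15 = 2 * i;
// adding c = i gives 3 * i, matching d = (0, 3, 6, 9, 12, 15, 18, 21).
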
@@ -1,6 +1,7 @@
//! PowerPC 64
//!
- //! The reference is the [64-Bit ELF V2 ABI Specification - Power Architecture].
+ //! The reference is the [64-Bit ELF V2 ABI Specification - Power
+ //! Architecture].
//!
//! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf


@@ -63,8 +63,7 @@ impl_from_bits_!(
vector_double
);
impl_from_bits_!(
- i64x2:
- vector_signed_char,
+ i64x2: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -109,8 +108,7 @@ impl_from_bits_!(
vector_double
);
impl_from_bits_!(
- u64x2:
- vector_signed_char,
+ u64x2: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -155,8 +153,7 @@ impl_from_bits_!(
vector_bool_long
);
impl_from_bits_!(
- f64x2:
- vector_signed_char,
+ f64x2: vector_signed_char,
vector_unsigned_char,
vector_bool_char,
vector_signed_short,
@@ -234,8 +231,12 @@ mod sealed {
// xxpermdi has a big-endian bias and extended mnemonics
#[inline]
#[target_feature(enable = "vsx")]
- #[cfg_attr(all(test, target_endian="little"), assert_instr(xxmrgld, dm = 0x0))]
- #[cfg_attr(all(test, target_endian="big"), assert_instr(xxspltd, dm = 0x0))]
+ #[cfg_attr(
+ all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0)
+ )]
+ #[cfg_attr(
+ all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0)
+ )]
unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
match dm & 0b11 {
0 => simd_shuffle2(a, b, [0b00, 0b10]),

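The match is truncated by the hunk, so as a hedged reading only: dm is a two-bit selector, and the arm shown (indices [0b00, 0b10] into the concatenation of a and b) takes lane 0 of each source. A scalar model under that assumption:

fn xxpermdi_model(a: [i64; 2], b: [i64; 2], dm: u8) -> [i64; 2] {
    // dm = 0 reproduces simd_shuffle2(a, b, [0b00, 0b10]): concat index 0
    // is a[0] and concat index 2 is b[0]. The bit-to-lane mapping of the
    // remaining arms is extrapolated, not taken from this diff.
    [a[((dm >> 1) & 1) as usize], b[(dm & 1) as usize]]
}
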
@@ -165,7 +165,6 @@ macro_rules! impl_float_arithmetic_reductions {
};
}


#[cfg(test)]
macro_rules! test_int_arithmetic_reductions {
($id:ident, $elem_ty:ident) => {
@@ -237,10 +236,7 @@ macro_rules! test_float_arithmetic_reductions {
let v = $id::splat(1 as $elem_ty);
assert_eq!(v.sum(), $id::lanes() as $elem_ty);
let v = alternating(2);
- assert_eq!(
- v.sum(),
- ($id::lanes() / 2 + $id::lanes()) as $elem_ty
- );
+ assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty);
}
#[test]
fn product() {

@@ -59,7 +59,6 @@ macro_rules! impl_float_math {

macro_rules! test_float_math {
($id:ident, $elem_ty:ident) => {

fn sqrt2() -> $elem_ty {
match ::mem::size_of::<$elem_ty>() {
4 => 1.4142135 as $elem_ty,

@@ -46,7 +46,7 @@ macro_rules! impl_load_store {
/// undefined.
#[inline]
pub unsafe fn store_aligned_unchecked(
- self, slice: &mut [$elem_ty]
+ self, slice: &mut [$elem_ty],
) {
*(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) =
self;
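A standalone model of the store pattern above, for one concrete lane type (illustration only): the vector is written through the slice's first element reinterpreted as a vector-wide pointer, which is exactly why alignment and length are the caller's obligation.

unsafe fn store_aligned_unchecked_model(v: [f32; 4], slice: &mut [f32]) {
    // Caller guarantees slice.len() >= 4 and vector alignment.
    *(slice.get_unchecked_mut(0) as *mut f32 as *mut [f32; 4]) = v;
}
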
@@ -59,7 +59,7 @@ macro_rules! impl_load_store {
/// If `slice.len() < Self::lanes()` the behavior is undefined.
#[inline]
pub unsafe fn store_unaligned_unchecked(
- self, slice: &mut [$elem_ty]
+ self, slice: &mut [$elem_ty],
) {
let target_ptr =
slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8;
@@ -121,7 +121,7 @@ macro_rules! impl_load_store {
/// If `slice.len() < Self::lanes()` the behavior is undefined.
#[inline]
pub unsafe fn load_unaligned_unchecked(
- slice: &[$elem_ty]
+ slice: &[$elem_ty],
) -> Self {
use mem::size_of;
let target_ptr =
@@ -238,7 +238,8 @@ macro_rules! test_load_store {
data: [0 as $elem_ty; 2 * $id::lanes()],
};
// offset the aligned data by one byte:
- let s: &mut [u8; 2 * $id::lanes()
+ let s: &mut [u8; 2
+ * $id::lanes()
* mem::size_of::<$elem_ty>()] =
mem::transmute(&mut aligned.data);
let s: &mut [$elem_ty] = slice::from_raw_parts_mut(
@@ -296,7 +297,8 @@ macro_rules! test_load_store {
data: [0 as $elem_ty; 2 * $id::lanes()],
};
// offset the aligned data by one byte:
- let s: &[u8; 2 * $id::lanes()
+ let s: &[u8; 2
+ * $id::lanes()
* mem::size_of::<$elem_ty>()] =
mem::transmute(&aligned.data);
let s: &[$elem_ty] = slice::from_raw_parts(

@@ -41,18 +41,7 @@ macro_rules! impl_shifts {
macro_rules! impl_all_scalar_shifts {
($id:ident, $elem_ty:ident) => {
impl_shifts!(
- $id,
- $elem_ty,
- u8,
- u16,
- u32,
- u64,
- usize,
- i8,
- i16,
- i32,
- i64,
- isize
+ $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
);
};
}
@@ -125,18 +114,7 @@ macro_rules! test_shift_ops {
macro_rules! test_all_scalar_shift_ops {
($id:ident, $elem_ty:ident) => {
test_shift_ops!(
- $id,
- $elem_ty,
- u8,
- u16,
- u32,
- u64,
- usize,
- i8,
- i16,
- i32,
- i64,
- isize
+ $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
);
};
}

@@ -31,7 +31,7 @@ macro_rules! impl_fabs {
unsafe { $fn(self) }
}
}
}
};
}

impl_fabs!(f32x2: abs_v2f32);

@@ -31,7 +31,7 @@ macro_rules! impl_fcos {
unsafe { $fn(self) }
}
}
}
};
}

impl_fcos!(f32x2: cos_v2f32);

@@ -31,7 +31,7 @@ macro_rules! impl_fma {
unsafe { $fn(self, y, z) }
}
}
}
};
}

impl_fma!(f32x2: fma_v2f32);

@@ -25,11 +25,13 @@ macro_rules! default_impl {
impl All for $id {
#[inline]
unsafe fn all(self) -> bool {
- #[cfg(not(target_arch = "aarch64"))] {
+ #[cfg(not(target_arch = "aarch64"))]
+ {
use coresimd::simd_llvm::simd_reduce_all;
simd_reduce_all(self)
}
- #[cfg(target_arch = "aarch64")] {
+ #[cfg(target_arch = "aarch64")]
+ {
// FIXME: Broken on AArch64
// https://bugs.llvm.org/show_bug.cgi?id=36796
self.and()
@@ -40,11 +42,13 @@ macro_rules! default_impl {
impl Any for $id {
#[inline]
unsafe fn any(self) -> bool {
- #[cfg(not(target_arch = "aarch64"))] {
+ #[cfg(not(target_arch = "aarch64"))]
+ {
use coresimd::simd_llvm::simd_reduce_any;
simd_reduce_any(self)
}
- #[cfg(target_arch = "aarch64")] {
+ #[cfg(target_arch = "aarch64")]
+ {
// FIXME: Broken on AArch64
// https://bugs.llvm.org/show_bug.cgi?id=36796
self.or()
@@ -63,7 +67,12 @@ macro_rules! default_impl {
// or floating point vectors, we can't currently work around this yet. The
// performance impact for this shouldn't be large, but this is filled as:
// https://bugs.llvm.org/show_bug.cgi?id=37087
- #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+ #[cfg(
+ all(
+ any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "sse2"
+ )
+ )]
macro_rules! x86_128_sse2_movemask_impl {
($id:ident) => {
impl All for $id {
@@ -71,13 +80,15 @@ macro_rules! x86_128_sse2_movemask_impl {
#[target_feature(enable = "sse2")]
unsafe fn all(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm_movemask_epi8;
+ use coresimd::arch::x86::_mm_movemask_epi8;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm_movemask_epi8;
- // _mm_movemask_epi8(a) creates a 16bit mask containing the most
- // significant bit of each byte of `a`. If all bits are set,
- // then all 16 lanes of the mask are true.
- _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32
+ use coresimd::arch::x86_64::_mm_movemask_epi8;
+ // _mm_movemask_epi8(a) creates a 16bit mask containing the
+ // most significant bit of each byte of `a`. If all
+ // bits are set, then all 16 lanes of the mask are
+ // true.
+ _mm_movemask_epi8(::mem::transmute(self))
+ == u16::max_value() as i32
}
}
impl Any for $id {
@@ -85,14 +96,14 @@ macro_rules! x86_128_sse2_movemask_impl {
#[target_feature(enable = "sse2")]
unsafe fn any(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm_movemask_epi8;
+ use coresimd::arch::x86::_mm_movemask_epi8;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+ use coresimd::arch::x86_64::_mm_movemask_epi8;

_mm_movemask_epi8(::mem::transmute(self)) != 0
}
}
}
};
}

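The comment above is the whole trick; restated as a hedged sketch on today's std::arch (the diff itself goes through the in-tree coresimd paths):

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn mask16_all(v: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::_mm_movemask_epi8;
    // One mask bit per byte lane; all 16 set <=> every lane's MSB is 1.
    _mm_movemask_epi8(v) == u16::max_value() as i32
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn mask16_any(v: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::_mm_movemask_epi8;
    _mm_movemask_epi8(v) != 0
}
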
// On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256.
@@ -103,7 +114,12 @@ macro_rules! x86_128_sse2_movemask_impl {
// integer or floating point vectors, we can't currently work around this yet.
//
// TODO: investigate perf impact and fill LLVM bugs as necessary.
- #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))]
+ #[cfg(
+ all(
+ any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "avx"
+ )
+ )]
macro_rules! x86_256_avx_test_impl {
($id:ident) => {
impl All for $id {
@@ -111,11 +127,13 @@ macro_rules! x86_256_avx_test_impl {
#[target_feature(enable = "avx")]
unsafe fn all(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm256_testc_si256;
+ use coresimd::arch::x86::_mm256_testc_si256;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm256_testc_si256;
- _mm256_testc_si256(::mem::transmute(self),
- ::mem::transmute($id::splat(true))) != 0
+ use coresimd::arch::x86_64::_mm256_testc_si256;
+ _mm256_testc_si256(
+ ::mem::transmute(self),
+ ::mem::transmute($id::splat(true)),
+ ) != 0
}
}
impl Any for $id {
@@ -123,20 +141,27 @@ macro_rules! x86_256_avx_test_impl {
#[target_feature(enable = "avx")]
unsafe fn any(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm256_testz_si256;
+ use coresimd::arch::x86::_mm256_testz_si256;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm256_testz_si256;
- _mm256_testz_si256(::mem::transmute(self),
- ::mem::transmute(self)) == 0
- }
+ use coresimd::arch::x86_64::_mm256_testz_si256;
+ _mm256_testz_si256(
+ ::mem::transmute(self),
+ ::mem::transmute(self),
+ ) == 0
+ }
}
};
}

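The same reductions via the AVX test instructions, as a hedged sketch on today's std::arch: testc against all-ones checks that no bit of the complement survives (all bits set), while testz of a value with itself checks whether any bit is set.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn mask256_all_any(v: std::arch::x86_64::__m256i) -> (bool, bool) {
    use std::arch::x86_64::{
        _mm256_set1_epi8, _mm256_testc_si256, _mm256_testz_si256,
    };
    let ones = _mm256_set1_epi8(-1);
    let all = _mm256_testc_si256(v, ones) != 0; // ((!v) & ones) == 0
    let any = _mm256_testz_si256(v, v) == 0;    // (v & v) != 0
    (all, any)
}
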
- // On x86 with SSE2 all/any for 256-bit wide vectors is implemented by executing
- // the algorithm for 128-bit on the higher and lower elements of the vector
- // independently.
- #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+ // On x86 with SSE2 all/any for 256-bit wide vectors is implemented by
+ // executing the algorithm for 128-bit on the higher and lower elements of the
+ // vector independently.
+ #[cfg(
+ all(
+ any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "sse2"
+ )
+ )]
macro_rules! x86_256_sse2_impl {
($id:ident, $v128:ident) => {
impl All for $id {
@@ -146,7 +171,7 @@ macro_rules! x86_256_sse2_impl {
unsafe {
union U {
halves: ($v128, $v128),
- vec: $id
+ vec: $id,
}
let halves = U { vec: self }.halves;
halves.0.all() && halves.1.all()
@@ -160,14 +185,14 @@ macro_rules! x86_256_sse2_impl {
unsafe {
union U {
halves: ($v128, $v128),
- vec: $id
+ vec: $id,
}
let halves = U { vec: self }.halves;
halves.0.any() || halves.1.any()
}
}
}
}
};
}

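A minimal model of the halves technique above (illustration only): reinterpret the 256-bit value as two 128-bit halves through a union and combine the per-half answers.

union Halves {
    whole: [u64; 4],
    halves: ([u64; 2], [u64; 2]),
}

unsafe fn all_halves(v: [u64; 4]) -> bool {
    let (lo, hi) = Halves { whole: v }.halves;
    // && mirrors All; an Any counterpart would combine with || instead.
    lo.iter().all(|&x| x != 0) && hi.iter().all(|&x| x != 0)
}
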
// Implementation for 64-bit wide masks on x86.
@@ -179,13 +204,14 @@ macro_rules! x86_64_mmx_movemask_impl {
#[target_feature(enable = "mmx")]
unsafe fn all(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm_movemask_pi8;
+ use coresimd::arch::x86::_mm_movemask_pi8;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+ use coresimd::arch::x86_64::_mm_movemask_pi8;
// _mm_movemask_pi8(a) creates an 8bit mask containing the most
// significant bit of each byte of `a`. If all bits are set,
// then all 8 lanes of the mask are true.
- _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32
+ _mm_movemask_pi8(::mem::transmute(self))
+ == u8::max_value() as i32
}
}
impl Any for $id {
@@ -193,14 +219,14 @@ macro_rules! x86_64_mmx_movemask_impl {
#[target_feature(enable = "mmx")]
unsafe fn any(self) -> bool {
#[cfg(target_arch = "x86")]
- use ::coresimd::arch::x86::_mm_movemask_pi8;
+ use coresimd::arch::x86::_mm_movemask_pi8;
#[cfg(target_arch = "x86_64")]
- use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+ use coresimd::arch::x86_64::_mm_movemask_pi8;

_mm_movemask_pi8(::mem::transmute(self)) != 0
}
}
}
};
}

// Implementation for 128-bit wide masks on x86
@@ -214,7 +240,7 @@ macro_rules! x86_128_impl {
default_impl!($id);
}
}
}
};
}

// Implementation for 256-bit wide masks on x86
@@ -230,22 +256,25 @@ macro_rules! x86_256_impl {
default_impl!($id);
}
}
}
};
}

// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
// minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors.
- #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+ #[cfg(
+ all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+ )]
macro_rules! arm_64_x2_v7_neon_impl {
($id:ident, $vpmin:ident, $vpmax:ident) => {
impl All for $id {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn all(self) -> bool {
- use ::coresimd::arch::arm::$vpmin;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmin;
+ use mem::transmute;
// pmin((a, b), (-,-)) => (b, -).0 => b
- let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized()));
+ let tmp: $id =
+ transmute($vpmin(transmute(self), ::mem::uninitialized()));
tmp.extract(0)
}
}
@@ -253,27 +282,30 @@ macro_rules! arm_64_x2_v7_neon_impl {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn any(self) -> bool {
- use ::coresimd::arch::arm::$vpmax;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmax;
+ use mem::transmute;
// pmax((a, b), (-,-)) => (b, -).0 => b
- let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized()));
+ let tmp: $id =
+ transmute($vpmax(transmute(self), ::mem::uninitialized()));
tmp.extract(0)
}
}
}
};
}

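A scalar model of the pairwise folding described in the comments above (illustrative): each vpmin step halves the number of candidate lanes, and for a two-lane vector a single step leaves the reduction in lane 0.

fn fold_all2(mask: [u32; 2]) -> bool {
    // pmin((a, b), (-, -)) => (min(a, b), -); extract lane 0.
    mask[0].min(mask[1]) != 0
}

fn fold_any2(mask: [u32; 2]) -> bool {
    // pmax((a, b), (-, -)) => (max(a, b), -); extract lane 0.
    mask[0].max(mask[1]) != 0
}
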
// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
// minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors.
- #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+ #[cfg(
+ all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+ )]
macro_rules! arm_64_x4_v7_neon_impl {
($id:ident, $vpmin:ident, $vpmax:ident) => {
impl All for $id {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn all(self) -> bool {
- use ::coresimd::arch::arm::$vpmin;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmin;
+ use mem::transmute;
// tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
let tmp = $vpmin(transmute(self), ::mem::uninitialized());
// tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
@@ -285,8 +317,8 @@ macro_rules! arm_64_x4_v7_neon_impl {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn any(self) -> bool {
- use ::coresimd::arch::arm::$vpmax;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmax;
+ use mem::transmute;
// tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
let tmp = $vpmax(transmute(self), ::mem::uninitialized());
// tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
@@ -294,20 +326,22 @@ macro_rules! arm_64_x4_v7_neon_impl {
tmp.extract(0)
}
}
}
};
}

// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
// minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors.
- #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+ #[cfg(
+ all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+ )]
macro_rules! arm_64_x8_v7_neon_impl {
($id:ident, $vpmin:ident, $vpmax:ident) => {
impl All for $id {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn all(self) -> bool {
- use ::coresimd::arch::arm::$vpmin;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmin;
+ use mem::transmute;
// tmp = pmin(
// (a, b, c, d, e, f, g, h),
// (-, -, -, -, -, -, -, -)
@@ -330,8 +364,8 @@ macro_rules! arm_64_x8_v7_neon_impl {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn any(self) -> bool {
- use ::coresimd::arch::arm::$vpmax;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmax;
+ use mem::transmute;
// tmp = pmax(
// (a, b, c, d, e, f, g, h),
// (-, -, -, -, -, -, -, -)
@@ -350,28 +384,32 @@ macro_rules! arm_64_x8_v7_neon_impl {
tmp.extract(0)
}
}
};
}
}


// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
// minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with
// more than two elements.
- #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+ #[cfg(
+ all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+ )]
macro_rules! arm_128_v7_neon_impl {
($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
impl All for $id {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn all(self) -> bool {
- use ::coresimd::arch::arm::$vpmin;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmin;
+ use mem::transmute;
union U {
halves: ($half, $half),
- vec: $id
+ vec: $id,
}
let halves = U { vec: self }.halves;
- let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+ let h: $half = transmute($vpmin(
+ transmute(halves.0),
+ transmute(halves.1),
+ ));
h.all()
}
}
@@ -379,18 +417,21 @@ macro_rules! arm_128_v7_neon_impl {
#[inline]
#[target_feature(enable = "v7,neon")]
unsafe fn any(self) -> bool {
- use ::coresimd::arch::arm::$vpmax;
- use ::mem::transmute;
+ use coresimd::arch::arm::$vpmax;
+ use mem::transmute;
union U {
halves: ($half, $half),
- vec: $id
+ vec: $id,
}
let halves = U { vec: self }.halves;
- let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+ let h: $half = transmute($vpmax(
+ transmute(halves.0),
+ transmute(halves.1),
+ ));
h.any()
}
}
}
};
}

// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -402,7 +443,7 @@ macro_rules! aarch64_128_neon_impl {
#[inline]
#[target_feature(enable = "neon")]
unsafe fn all(self) -> bool {
- use ::coresimd::arch::aarch64::$vmin;
+ use coresimd::arch::aarch64::$vmin;
$vmin(::mem::transmute(self)) != 0
}
}
@@ -410,11 +451,11 @@ macro_rules! aarch64_128_neon_impl {
#[inline]
#[target_feature(enable = "neon")]
unsafe fn any(self) -> bool {
- use ::coresimd::arch::aarch64::$vmax;
+ use coresimd::arch::aarch64::$vmax;
$vmax(::mem::transmute(self)) != 0
}
}
}
};
}

// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -431,9 +472,12 @@ macro_rules! aarch64_64_neon_impl {
unsafe fn all(self) -> bool {
union U {
halves: ($id, $id),
- vec: $vec128
+ vec: $vec128,
}
- U { halves: (self, self) }.vec.all()
+ U {
+ halves: (self, self),
+ }.vec
+ .all()
}
}
impl Any for $id {
@@ -442,12 +486,15 @@ macro_rules! aarch64_64_neon_impl {
unsafe fn any(self) -> bool {
union U {
halves: ($id, $id),
- vec: $vec128
- }
- U { halves: (self, self) }.vec.any()
+ vec: $vec128,
+ }
+ U {
+ halves: (self, self),
+ }.vec
+ .any()
}
}
};
}

macro_rules! impl_mask_all_any {

@@ -5,8 +5,8 @@ pub mod wrapping;

pub mod masks_reductions;

- pub mod sqrt;
pub mod abs;
+ pub mod cos;
pub mod fma;
pub mod sin;
- pub mod cos;
+ pub mod sqrt;

@@ -31,7 +31,7 @@ macro_rules! impl_fsin {
unsafe { $fn(self) }
}
}
}
};
}

impl_fsin!(f32x2: sin_v2f32);

@@ -31,7 +31,7 @@ macro_rules! impl_fsqrt {
unsafe { $fn(self) }
}
}
}
};
}

impl_fsqrt!(f32x2: sqrt_v2f32);

@@ -66,8 +66,12 @@ where
U: FromBits<T>,
{
// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
- #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
- #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+ #[cfg_attr(
+ any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+ )]
+ #[cfg_attr(
+ not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+ )]
fn into_bits(self) -> U {
debug_assert!(::mem::size_of::<Self>() == ::mem::size_of::<U>());
U::from_bits(self)
@@ -77,8 +81,12 @@ where
// FromBits (and thus IntoBits) is reflexive.
impl<T> FromBits<T> for T {
// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
- #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
- #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+ #[cfg_attr(
+ any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+ )]
+ #[cfg_attr(
+ not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+ )]
fn from_bits(t: Self) -> Self {
t
}

|
||||
};
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "arm", target_feature = "neon",
|
||||
target_feature = "v7"))]
|
||||
use coresimd::arch::arm::{// FIXME: float16x8_t,
|
||||
#[cfg(
|
||||
all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
|
||||
)]
|
||||
use coresimd::arch::arm::{
|
||||
// FIXME: float16x8_t,
|
||||
float32x4_t,
|
||||
int16x8_t,
|
||||
int32x4_t,
|
||||
@@ -123,10 +125,12 @@ use coresimd::arch::arm::{// FIXME: float16x8_t,
|
||||
uint16x8_t,
|
||||
uint32x4_t,
|
||||
uint64x2_t,
|
||||
uint8x16_t};
|
||||
uint8x16_t,
|
||||
};
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
use coresimd::arch::aarch64::{// FIXME: float16x8_t,
|
||||
use coresimd::arch::aarch64::{
|
||||
// FIXME: float16x8_t,
|
||||
float32x4_t,
|
||||
float64x2_t,
|
||||
int16x8_t,
|
||||
@@ -138,13 +142,21 @@ use coresimd::arch::aarch64::{// FIXME: float16x8_t,
|
||||
uint16x8_t,
|
||||
uint32x4_t,
|
||||
uint64x2_t,
|
||||
uint8x16_t};
|
||||
uint8x16_t,
|
||||
};
|
||||
|
||||
macro_rules! from_bits_arm {
|
||||
($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
|
||||
#[cfg(any(all(target_arch = "arm", target_feature = "neon",
|
||||
target_feature = "v7"),
|
||||
target_arch = "aarch64"))]
|
||||
#[cfg(
|
||||
any(
|
||||
all(
|
||||
target_arch = "arm",
|
||||
target_feature = "neon",
|
||||
target_feature = "v7"
|
||||
),
|
||||
target_arch = "aarch64"
|
||||
)
|
||||
)]
|
||||
impl_from_bits_!(
|
||||
$id: int8x16_t,
|
||||
uint8x16_t,
|
||||
@@ -182,12 +194,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(u64x2, u64, u64x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u64x2,
|
||||
u64,
|
||||
u64x2_from_bits_arm,
|
||||
u64x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i64x2: i64,
|
||||
@@ -207,12 +214,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(i64x2, i64, i64x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i64x2,
|
||||
i64,
|
||||
i64x2_from_bits_arm,
|
||||
i64x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
f64x2: f64,
|
||||
@@ -232,12 +234,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(f64x2, f64, f64x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
f64x2,
|
||||
f64,
|
||||
f64x2_from_bits_arm,
|
||||
f64x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
u32x4: u32,
|
||||
@@ -257,12 +254,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(u32x4, u32, u32x4_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u32x4,
|
||||
u32,
|
||||
u32x4_from_bits_arm,
|
||||
u32x4_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i32x4: i32,
|
||||
@@ -282,12 +274,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(i32x4, i32, i32x4_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i32x4,
|
||||
i32,
|
||||
i32x4_from_bits_arm,
|
||||
i32x4_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
f32x4: f32,
|
||||
@@ -307,12 +294,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(f32x4, f32, f32x4_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
f32x4,
|
||||
f32,
|
||||
f32x4_from_bits_arm,
|
||||
f32x4_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
u16x8: u16,
|
||||
@@ -332,12 +314,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(u16x8, u16, u16x8_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u16x8,
|
||||
u16,
|
||||
u16x8_from_bits_arm,
|
||||
u16x8_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i16x8: i16,
|
||||
@@ -357,12 +334,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(i16x8, i16, i16x8_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i16x8,
|
||||
i16,
|
||||
i16x8_from_bits_arm,
|
||||
i16x8_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
u8x16: u8,
|
||||
@@ -382,12 +354,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(u8x16, u8, u8x16_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u8x16,
|
||||
u8,
|
||||
u8x16_from_bits_arm,
|
||||
u8x16_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i8x16: i8,
|
||||
@@ -407,12 +374,7 @@ impl_from_bits!(
|
||||
m8x16
|
||||
);
|
||||
from_bits_x86!(i8x16, i8, i8x16_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i8x16,
|
||||
i8,
|
||||
i8x16_from_bits_arm,
|
||||
i8x16_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64);
|
||||
|
||||
impl_from!(
|
||||
f64x2: f64,
|
||||
@@ -552,31 +514,37 @@ impl_from!(
|
||||
m8x8
|
||||
);
|
||||
|
||||
impl_from!(u8x16: u8, u8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, i8x16, m8x16);
|
||||
impl_from!(i8x16: i8, i8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, u8x16, m8x16);
|
||||
impl_from!(
|
||||
u8x16: u8,
|
||||
u8x16_from,
|
||||
test_v128 | i32x16,
|
||||
u32x16,
|
||||
f32x16,
|
||||
m1x16,
|
||||
i16x16,
|
||||
u16x16,
|
||||
m16x16,
|
||||
i8x16,
|
||||
m8x16
|
||||
);
|
||||
impl_from!(
|
||||
i8x16: i8,
|
||||
i8x16_from,
|
||||
test_v128 | i32x16,
|
||||
u32x16,
|
||||
f32x16,
|
||||
m1x16,
|
||||
i16x16,
|
||||
u16x16,
|
||||
m16x16,
|
||||
u8x16,
|
||||
m8x16
|
||||
);
|
||||
|
||||
impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16);
|
||||
|
||||
impl_from!(
|
||||
m16x8: i16,
|
||||
m16x8_from,
|
||||
test_v128 | m1x8,
|
||||
m32x8,
|
||||
m8x8
|
||||
);
|
||||
impl_from!(m16x8: i16, m16x8_from, test_v128 | m1x8, m32x8, m8x8);
|
||||
|
||||
impl_from!(
|
||||
m32x4: i32,
|
||||
m32x4_from,
|
||||
test_v128 | m64x4,
|
||||
m16x4,
|
||||
m8x4
|
||||
);
|
||||
impl_from!(m32x4: i32, m32x4_from, test_v128 | m64x4, m16x4, m8x4);
|
||||
|
||||
impl_from!(
|
||||
m64x2: i64,
|
||||
m64x2_from,
|
||||
test_v128 | m32x2,
|
||||
m16x2,
|
||||
m8x2
|
||||
);
|
||||
impl_from!(m64x2: i64, m64x2_from, test_v128 | m32x2, m16x2, m8x2);
|
||||
|
||||
@@ -57,10 +57,4 @@ impl_from!(
|
||||
m8x2
|
||||
);
|
||||
|
||||
impl_from!(
|
||||
m8x2: i8,
|
||||
m8x2_from,
|
||||
test_v16 | m64x2,
|
||||
m32x2,
|
||||
m16x2
|
||||
);
|
||||
impl_from!(m8x2: i8, m8x2_from, test_v16 | m64x2, m32x2, m16x2);
|
||||
|
||||
@@ -465,25 +465,8 @@ impl_from!(
|
||||
|
||||
impl_from!(m8x32: i8, m8x32_from, test_v256 | m1x32);
|
||||
|
||||
impl_from!(
|
||||
m16x16: i16,
|
||||
m16x16_from,
|
||||
test_v256 | m1x16,
|
||||
m8x16
|
||||
);
|
||||
impl_from!(m16x16: i16, m16x16_from, test_v256 | m1x16, m8x16);
|
||||
|
||||
impl_from!(
|
||||
m32x8: i32,
|
||||
m32x8_from,
|
||||
test_v256 | m1x8,
|
||||
m16x8,
|
||||
m8x8
|
||||
);
|
||||
impl_from!(m32x8: i32, m32x8_from, test_v256 | m1x8, m16x8, m8x8);
|
||||
|
||||
impl_from!(
|
||||
m64x4: i64,
|
||||
m64x4_from,
|
||||
test_v256 | m32x4,
|
||||
m16x4,
|
||||
m8x4
|
||||
);
|
||||
impl_from!(m64x4: i64, m64x4_from, test_v256 | m32x4, m16x4, m8x4);
|
||||
|
||||
@@ -151,18 +151,6 @@ impl_from!(
|
||||
m8x4
|
||||
);
|
||||
|
||||
impl_from!(
|
||||
m8x4: i8,
|
||||
m8x4_from,
|
||||
test_v32 | m64x4,
|
||||
m32x4,
|
||||
m16x4
|
||||
);
|
||||
impl_from!(m8x4: i8, m8x4_from, test_v32 | m64x4, m32x4, m16x4);
|
||||
|
||||
impl_from!(
|
||||
m16x2: i16,
|
||||
m16x2_from,
|
||||
test_v32 | m64x2,
|
||||
m32x2,
|
||||
m8x2
|
||||
);
|
||||
impl_from!(m16x2: i16, m16x2_from, test_v32 | m64x2, m32x2, m8x2);
|
||||
|
||||
@@ -446,17 +446,6 @@ impl_from!(u8x64: u8, u8x64_from, test_v512 | i8x64, m1x64);
|
||||
|
||||
impl_from!(m1x32: i16, m1x32_from, test_v512 | m8x32);
|
||||
|
||||
impl_from!(
|
||||
m1x16: i32,
|
||||
m1x16_from,
|
||||
test_v512 | m16x16,
|
||||
m8x16
|
||||
);
|
||||
impl_from!(m1x16: i32, m1x16_from, test_v512 | m16x16, m8x16);
|
||||
|
||||
impl_from!(
|
||||
m1x8: i64,
|
||||
m1x8_from,
|
||||
test_v512 | m32x8,
|
||||
m16x8,
|
||||
m8x8
|
||||
);
|
||||
impl_from!(m1x8: i64, m1x8_from, test_v512 | m32x8, m16x8, m8x8);
|
||||
|
||||
@@ -83,9 +83,11 @@ macro_rules! from_bits_x86 {
|
||||
};
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "arm", target_feature = "neon",
|
||||
target_feature = "v7"))]
|
||||
use coresimd::arch::arm::{// FIXME: float16x4_t,
|
||||
#[cfg(
|
||||
all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
|
||||
)]
|
||||
use coresimd::arch::arm::{
|
||||
// FIXME: float16x4_t,
|
||||
float32x2_t,
|
||||
int16x4_t,
|
||||
int32x2_t,
|
||||
@@ -96,10 +98,12 @@ use coresimd::arch::arm::{// FIXME: float16x4_t,
|
||||
uint16x4_t,
|
||||
uint32x2_t,
|
||||
uint64x1_t,
|
||||
uint8x8_t};
|
||||
uint8x8_t,
|
||||
};
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
use coresimd::arch::aarch64::{// FIXME: float16x4_t,
|
||||
use coresimd::arch::aarch64::{
|
||||
// FIXME: float16x4_t,
|
||||
float32x2_t,
|
||||
float64x1_t,
|
||||
int16x4_t,
|
||||
@@ -111,13 +115,21 @@ use coresimd::arch::aarch64::{// FIXME: float16x4_t,
|
||||
uint16x4_t,
|
||||
uint32x2_t,
|
||||
uint64x1_t,
|
||||
uint8x8_t};
|
||||
uint8x8_t,
|
||||
};
|
||||
|
||||
macro_rules! from_bits_arm {
|
||||
($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
|
||||
#[cfg(any(all(target_arch = "arm", target_feature = "neon",
|
||||
target_feature = "v7"),
|
||||
target_arch = "aarch64"))]
|
||||
#[cfg(
|
||||
any(
|
||||
all(
|
||||
target_arch = "arm",
|
||||
target_feature = "neon",
|
||||
target_feature = "v7"
|
||||
),
|
||||
target_arch = "aarch64"
|
||||
)
|
||||
)]
|
||||
impl_from_bits_!(
|
||||
$id: int64x1_t,
|
||||
uint64x1_t,
|
||||
@@ -151,12 +163,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(u32x2, u32, u32x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u32x2,
|
||||
u32,
|
||||
u32x2_from_bits_arm,
|
||||
u32x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i32x2: i32,
|
||||
@@ -172,12 +179,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(i32x2, i32, i32x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i32x2,
|
||||
i32,
|
||||
i32x2_from_bits_arm,
|
||||
i32x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
f32x2: f32,
|
||||
@@ -193,12 +195,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(f32x2, f32, f32x2_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
f32x2,
|
||||
f32,
|
||||
f32x2_from_bits_arm,
|
||||
f32x2_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
u16x4: u16,
|
||||
@@ -213,12 +210,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(u16x4, u16, u16x4_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u16x4,
|
||||
u16,
|
||||
u16x4_from_bits_arm,
|
||||
u16x4_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i16x4: i16,
|
||||
@@ -233,12 +225,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(i16x4, i16, i16x4_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i16x4,
|
||||
i16,
|
||||
i16x4_from_bits_arm,
|
||||
i16x4_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
u8x8: u8,
|
||||
@@ -253,12 +240,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(u8x8, u8, u8x8_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
u8x8,
|
||||
u8,
|
||||
u8x8_from_bits_arm,
|
||||
u8x8_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64);
|
||||
|
||||
impl_from_bits!(
|
||||
i8x8: i8,
|
||||
@@ -273,12 +255,7 @@ impl_from_bits!(
|
||||
m8x8
|
||||
);
|
||||
from_bits_x86!(i8x8, i8, i8x8_from_bits_x86);
|
||||
from_bits_arm!(
|
||||
i8x8,
|
||||
i8,
|
||||
i8x8_from_bits_arm,
|
||||
i8x8_from_bits_aarch64
|
||||
);
|
||||
from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64);
|
||||
|
||||
impl_from!(
|
||||
f32x2: f32,
|
||||
@@ -404,26 +381,8 @@ impl_from!(
|
||||
m8x8
|
||||
);
|
||||
|
||||
impl_from!(
|
||||
m8x8: i8,
|
||||
m8x8_from,
|
||||
test_v64 | m1x8,
|
||||
m32x8,
|
||||
m16x8
|
||||
);
|
||||
impl_from!(m8x8: i8, m8x8_from, test_v64 | m1x8, m32x8, m16x8);
|
||||
|
||||
impl_from!(
|
||||
m16x4: i16,
|
||||
m16x4_from,
|
||||
test_v64 | m64x4,
|
||||
m32x4,
|
||||
m8x4
|
||||
);
|
||||
impl_from!(m16x4: i16, m16x4_from, test_v64 | m64x4, m32x4, m8x4);
|
||||
|
||||
impl_from!(
|
||||
m32x2: i32,
|
||||
m32x2_from,
|
||||
test_v64 | m64x2,
|
||||
m16x2,
|
||||
m8x2
|
||||
);
|
||||
impl_from!(m32x2: i32, m32x2_from, test_v64 | m64x2, m16x2, m8x2);
|
||||
|
||||
@@ -1387,7 +1387,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
|
||||
#[rustc_args_required_const(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_permute2f128_ps(
|
||||
a: __m256, b: __m256, imm8: i32
|
||||
a: __m256, b: __m256, imm8: i32,
|
||||
) -> __m256 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
@@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_permute2f128_ps(
|
||||
#[rustc_args_required_const(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_permute2f128_pd(
|
||||
a: __m256d, b: __m256d, imm8: i32
|
||||
a: __m256d, b: __m256d, imm8: i32,
|
||||
) -> __m256d {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
@@ -1427,7 +1427,7 @@ pub unsafe fn _mm256_permute2f128_pd(
|
||||
#[rustc_args_required_const(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_permute2f128_si256(
|
||||
a: __m256i, b: __m256i, imm8: i32
|
||||
a: __m256i, b: __m256i, imm8: i32,
|
||||
) -> __m256i {
|
||||
let a = a.as_i32x8();
|
||||
let b = b.as_i32x8();
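// Editor's note: a hedged illustration (not part of this commit) of the
// `imm8` convention shared by the `_mm256_permute2f128_*` family reformatted
// above: bits [1:0] pick the 128-bit lane written to the low half of the
// result and bits [5:4] the lane written to the high half (0/1 select the
// low/high lane of `a`, 2/3 the low/high lane of `b`).
#[cfg(target_arch = "x86_64")]
unsafe fn swap_128bit_halves(v: __m256) -> __m256 {
    // imm8 = 0b0000_0001: low half gets the high lane of `a`,
    // high half gets the low lane of `a`.
    _mm256_permute2f128_ps(v, v, 0b0000_0001)
}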
@@ -1529,7 +1529,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_pd(
    a: __m256d, b: __m128d, imm8: i32
    a: __m256d, b: __m128d, imm8: i32,
) -> __m256d {
    match imm8 & 1 {
        0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
@@ -1547,7 +1547,7 @@ pub unsafe fn _mm256_insertf128_pd(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_si256(
    a: __m256i, b: __m128i, imm8: i32
    a: __m256i, b: __m128i, imm8: i32,
) -> __m256i {
    let b = _mm256_castsi128_si256(b).as_i64x4();
    let dst: i64x4 = match imm8 & 1 {
@@ -1567,11 +1567,7 @@ pub unsafe fn _mm256_insertf128_si256(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
    mem::transmute(simd_insert(
        a.as_i8x32(),
        (index as u32) & 31,
        i,
    ))
    mem::transmute(simd_insert(a.as_i8x32(), (index as u32) & 31, i))
}

/// Copy `a` to result, and insert the 16-bit integer `i` into result
@@ -1584,11 +1580,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i {
    mem::transmute(simd_insert(
        a.as_i16x16(),
        (index as u32) & 15,
        i,
    ))
    mem::transmute(simd_insert(a.as_i16x16(), (index as u32) & 15, i))
}

/// Copy `a` to result, and insert the 32-bit integer `i` into result
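// Editor's note: an illustrative sketch (assumption, not from the commit) of
// why the index is masked in the bodies above: `simd_insert` needs an
// in-range lane, so the epi8 variant wraps `index` with `& 31` (32 lanes)
// and the epi16 variant with `& 15` (16 lanes).
#[cfg(target_arch = "x86_64")]
unsafe fn insert_wraps(a: __m256i) -> __m256i {
    // 17 & 15 == 1, so this writes lane 1, exactly as `index = 1` would.
    _mm256_insert_epi16(a, 42, 17)
}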
@@ -1790,7 +1782,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_pd(
    mem_addr: *const f64, mask: __m256i
    mem_addr: *const f64, mask: __m256i,
) -> __m256d {
    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
}
@@ -1804,7 +1796,7 @@ pub unsafe fn _mm256_maskload_pd(
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_pd(
    mem_addr: *mut f64, mask: __m256i, a: __m256d
    mem_addr: *mut f64, mask: __m256i, a: __m256d,
) {
    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
}
@@ -1844,7 +1836,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_ps(
    mem_addr: *const f32, mask: __m256i
    mem_addr: *const f32, mask: __m256i,
) -> __m256 {
    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
}
@@ -1858,7 +1850,7 @@ pub unsafe fn _mm256_maskload_ps(
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_ps(
    mem_addr: *mut f32, mask: __m256i, a: __m256
    mem_addr: *mut f32, mask: __m256i, a: __m256,
) {
    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
}
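// Editor's note: a hedged usage sketch (not from the commit) for the masked
// load/store pair above. A lane participates only when its mask element has
// the highest bit set; skipped lanes load as zero and are left untouched in
// memory on store.
#[cfg(target_arch = "x86_64")]
unsafe fn load_first_three(p: *const f32) -> __m256 {
    // -1 has the sign bit set (load this lane), 0 does not (skip it).
    let mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
    _mm256_maskload_ps(p, mask)
}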
@@ -2383,7 +2375,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_set_ps(
    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
) -> __m256 {
    _mm256_setr_ps(h, g, f, e, d, c, b, a)
}
@@ -2440,7 +2432,7 @@ pub unsafe fn _mm256_set_epi16(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_set_epi32(
    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
) -> __m256i {
    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
}
@@ -2477,7 +2469,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_setr_ps(
    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
) -> __m256 {
    __m256(a, b, c, d, e, f, g, h)
}
@@ -2536,7 +2528,7 @@ pub unsafe fn _mm256_setr_epi16(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_setr_epi32(
    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
) -> __m256i {
    mem::transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}
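// Editor's note: an illustrative reminder (not part of this commit) of the
// ordering convention visible in the bodies above: `_mm256_set_*` lists
// elements from the highest index down and simply forwards to the `setr`
// ("reversed", i.e. memory-order) variant with its arguments flipped.
#[cfg(target_arch = "x86_64")]
unsafe fn same_vector_two_ways() -> (__m256i, __m256i) {
    // Both calls build the vector whose lane i holds the value i.
    let a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    let b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    (a, b)
}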
@@ -2950,7 +2942,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128(
    hiaddr: *const f32, loaddr: *const f32
    hiaddr: *const f32, loaddr: *const f32,
) -> __m256 {
    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
@@ -2967,7 +2959,7 @@ pub unsafe fn _mm256_loadu2_m128(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128d(
    hiaddr: *const f64, loaddr: *const f64
    hiaddr: *const f64, loaddr: *const f64,
) -> __m256d {
    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
@@ -2983,7 +2975,7 @@ pub unsafe fn _mm256_loadu2_m128d(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128i(
    hiaddr: *const __m128i, loaddr: *const __m128i
    hiaddr: *const __m128i, loaddr: *const __m128i,
) -> __m256i {
    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
@@ -3000,7 +2992,7 @@ pub unsafe fn _mm256_loadu2_m128i(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu2_m128(
    hiaddr: *mut f32, loaddr: *mut f32, a: __m256
    hiaddr: *mut f32, loaddr: *mut f32, a: __m256,
) {
    let lo = _mm256_castps256_ps128(a);
    _mm_storeu_ps(loaddr, lo);
@@ -3019,7 +3011,7 @@ pub unsafe fn _mm256_storeu2_m128(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu2_m128d(
    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d
    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d,
) {
    let lo = _mm256_castpd256_pd128(a);
    _mm_storeu_pd(loaddr, lo);
@@ -3037,7 +3029,7 @@ pub unsafe fn _mm256_storeu2_m128d(
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu2_m128i(
    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i
    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i,
) {
    let lo = _mm256_castsi256_si128(a);
    _mm_storeu_si128(loaddr, lo);
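// Editor's note: a hedged round-trip sketch (not from the commit) for the
// `loadu2`/`storeu2` helpers above: each is composed from two unaligned
// 128-bit operations, so the two halves may live at unrelated addresses.
#[cfg(target_arch = "x86_64")]
unsafe fn copy_split_halves(hi: *mut f32, lo: *mut f32) {
    let v = _mm256_loadu2_m128(hi, lo);
    _mm256_storeu2_m128(hi, lo, v);
}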
@@ -3500,20 +3492,11 @@ mod tests {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_blend_ps(a, b, 0x0);
        assert_eq_m256(
            r,
            _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.),
        );
        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps(a, b, 0x3);
        assert_eq_m256(
            r,
            _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.),
        );
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps(a, b, 0xF);
        assert_eq_m256(
            r,
            _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.),
        );
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
    }

    #[simd_test(enable = "avx")]
@@ -3544,16 +3527,8 @@ mod tests {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_dp_ps(a, b, 0xFF);
        let e = _mm256_setr_ps(
            200.,
            200.,
            200.,
            200.,
            2387.,
            2387.,
            2387.,
            2387.,
        );
        let e =
            _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
        assert_eq_m256(r, e);
    }

@@ -4234,9 +4209,7 @@ mod tests {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        let mut mem = Memory {
            data: [-1.0; 4],
        };
        let mut mem = Memory { data: [-1.0; 4] };

        _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
        for i in 0..4 {
@@ -4251,9 +4224,7 @@ mod tests {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        let mut mem = Memory {
            data: [-1.0; 8],
        };
        let mut mem = Memory { data: [-1.0; 8] };

        _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
        for i in 0..8 {
@@ -4534,10 +4505,7 @@ mod tests {
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(
            r,
            _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.),
        );
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
@@ -4595,10 +4563,7 @@ mod tests {
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(
            r,
            _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.),
        );
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx")]

@@ -413,7 +413,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_epi32(
    a: __m256i, b: __m256i, imm8: i32
    a: __m256i, b: __m256i, imm8: i32,
) -> __m256i {
    let imm8 = (imm8 & 0xFF) as u8;
    let a = a.as_i32x8();
@@ -480,7 +480,7 @@ pub unsafe fn _mm256_blend_epi32(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_epi16(
    a: __m256i, b: __m256i, imm8: i32
    a: __m256i, b: __m256i, imm8: i32,
) -> __m256i {
    let imm8 = (imm8 & 0xFF) as u8;
    let a = a.as_i16x16();
@@ -531,76 +531,20 @@ pub unsafe fn _mm256_blend_epi16(
        ) => {
            match (imm8 >> 6) & 0b11 {
                0b00 => blend4!(
                    $a,
                    $b,
                    $c,
                    $d,
                    $e,
                    $f,
                    6,
                    7,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    $e2,
                    $f2,
                    14,
                    15
                    $a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2,
                    $f2, 14, 15
                ),
                0b01 => blend4!(
                    $a,
                    $b,
                    $c,
                    $d,
                    $e,
                    $f,
                    22,
                    7,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    $e2,
                    $f2,
                    30,
                    15
                    $a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2,
                    $e2, $f2, 30, 15
                ),
                0b10 => blend4!(
                    $a,
                    $b,
                    $c,
                    $d,
                    $e,
                    $f,
                    6,
                    23,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    $e2,
                    $f2,
                    14,
                    31
                    $a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2,
                    $e2, $f2, 14, 31
                ),
                _ => blend4!(
                    $a,
                    $b,
                    $c,
                    $d,
                    $e,
                    $f,
                    22,
                    23,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    $e2,
                    $f2,
                    30,
                    31
                    $a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2,
                    $e2, $f2, 30, 31
                ),
            }
        };
@@ -618,60 +562,16 @@ pub unsafe fn _mm256_blend_epi16(
        ) => {
            match (imm8 >> 4) & 0b11 {
                0b00 => blend3!(
                    $a,
                    $b,
                    $c,
                    $d,
                    4,
                    5,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    12,
                    13
                    $a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13
                ),
                0b01 => blend3!(
                    $a,
                    $b,
                    $c,
                    $d,
                    20,
                    5,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    28,
                    13
                    $a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13
                ),
                0b10 => blend3!(
                    $a,
                    $b,
                    $c,
                    $d,
                    4,
                    21,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    12,
                    29
                    $a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29
                ),
                _ => blend3!(
                    $a,
                    $b,
                    $c,
                    $d,
                    20,
                    21,
                    $a2,
                    $b2,
                    $c2,
                    $d2,
                    28,
                    29
                    $a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29
                ),
            }
        };
@@ -703,13 +603,9 @@ pub unsafe fn _mm256_blend_epi16(
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_epi8(
    a: __m256i, b: __m256i, mask: __m256i
    a: __m256i, b: __m256i, mask: __m256i,
) -> __m256i {
    mem::transmute(pblendvb(
        a.as_i8x32(),
        b.as_i8x32(),
        mask.as_i8x32(),
    ))
    mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
}

/// Broadcast the low packed 8-bit integer from `a` to all elements of
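// Editor's note: an illustrative sketch (assumption, not from the commit) of
// the `_mm256_blendv_epi8` selection rule used above: each result byte comes
// from `b` where the corresponding mask byte has its high bit set, otherwise
// from `a`.
#[cfg(target_arch = "x86_64")]
unsafe fn take_all_from_b(a: __m256i, b: __m256i) -> __m256i {
    // Every mask byte is -1 (high bit set), so the result is just `b`.
    _mm256_blendv_epi8(a, b, _mm256_set1_epi8(-1))
}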
@@ -1226,7 +1122,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32(
    slice: *const i32, offsets: __m128i, scale: i32
    slice: *const i32, offsets: __m128i, scale: i32,
) -> __m128i {
    let zero = _mm_setzero_si128().as_i32x4();
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
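// Editor's note: a hedged usage sketch (not from the commit) for the gather
// family above: each lane of `offsets` indexes into `slice`, scaled by
// `scale` bytes (a constant 1, 2, 4, or 8), and the all-ones mask built from
// `neg_one` means "gather every lane". The caller must keep every indexed
// element in bounds.
#[cfg(target_arch = "x86_64")]
unsafe fn gather_even_elements(data: &[i32; 8]) -> __m128i {
    let offsets = _mm_setr_epi32(0, 2, 4, 6);
    // scale = 4 because the gathered elements are 4-byte `i32`s.
    _mm_i32gather_epi32(data.as_ptr(), offsets, 4)
}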
@@ -1280,7 +1176,7 @@ pub unsafe fn _mm_mask_i32gather_epi32(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32(
    slice: *const i32, offsets: __m256i, scale: i32
    slice: *const i32, offsets: __m256i, scale: i32,
) -> __m256i {
    let zero = _mm256_setzero_si256().as_i32x8();
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
@@ -1334,7 +1230,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps(
    slice: *const f32, offsets: __m128i, scale: i32
    slice: *const f32, offsets: __m128i, scale: i32,
) -> __m128 {
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
@@ -1360,7 +1256,7 @@ pub unsafe fn _mm_i32gather_ps(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps(
    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
) -> __m128 {
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
@@ -1383,7 +1279,7 @@ pub unsafe fn _mm_mask_i32gather_ps(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps(
    slice: *const f32, offsets: __m256i, scale: i32
    slice: *const f32, offsets: __m256i, scale: i32,
) -> __m256 {
    let zero = _mm256_setzero_ps();
    let neg_one = _mm256_set1_ps(-1.0);
@@ -1409,7 +1305,7 @@ pub unsafe fn _mm256_i32gather_ps(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps(
    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32
    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32,
) -> __m256 {
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
@@ -1432,7 +1328,7 @@ pub unsafe fn _mm256_mask_i32gather_ps(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64(
    slice: *const i64, offsets: __m128i, scale: i32
    slice: *const i64, offsets: __m128i, scale: i32,
) -> __m128i {
    let zero = _mm_setzero_si128().as_i64x2();
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1486,7 +1382,7 @@ pub unsafe fn _mm_mask_i32gather_epi64(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64(
    slice: *const i64, offsets: __m128i, scale: i32
    slice: *const i64, offsets: __m128i, scale: i32,
) -> __m256i {
    let zero = _mm256_setzero_si256().as_i64x4();
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1540,7 +1436,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd(
    slice: *const f64, offsets: __m128i, scale: i32
    slice: *const f64, offsets: __m128i, scale: i32,
) -> __m128d {
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
@@ -1590,7 +1486,7 @@ pub unsafe fn _mm_mask_i32gather_pd(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd(
    slice: *const f64, offsets: __m128i, scale: i32
    slice: *const f64, offsets: __m128i, scale: i32,
) -> __m256d {
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
@@ -1640,7 +1536,7 @@ pub unsafe fn _mm256_mask_i32gather_pd(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi32(
    slice: *const i32, offsets: __m128i, scale: i32
    slice: *const i32, offsets: __m128i, scale: i32,
) -> __m128i {
    let zero = _mm_setzero_si128().as_i32x4();
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1694,7 +1590,7 @@ pub unsafe fn _mm_mask_i64gather_epi32(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi32(
    slice: *const i32, offsets: __m256i, scale: i32
    slice: *const i32, offsets: __m256i, scale: i32,
) -> __m128i {
    let zero = _mm_setzero_si128().as_i32x4();
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1748,7 +1644,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_ps(
    slice: *const f32, offsets: __m128i, scale: i32
    slice: *const f32, offsets: __m128i, scale: i32,
) -> __m128 {
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
@@ -1774,7 +1670,7 @@ pub unsafe fn _mm_i64gather_ps(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_ps(
    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
) -> __m128 {
    let offsets = offsets.as_i64x2();
    let slice = slice as *const i8;
@@ -1797,7 +1693,7 @@ pub unsafe fn _mm_mask_i64gather_ps(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_ps(
    slice: *const f32, offsets: __m256i, scale: i32
    slice: *const f32, offsets: __m256i, scale: i32,
) -> __m128 {
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
@@ -1823,7 +1719,7 @@ pub unsafe fn _mm256_i64gather_ps(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_ps(
    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32
    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32,
) -> __m128 {
    let offsets = offsets.as_i64x4();
    let slice = slice as *const i8;
@@ -1846,7 +1742,7 @@ pub unsafe fn _mm256_mask_i64gather_ps(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi64(
    slice: *const i64, offsets: __m128i, scale: i32
    slice: *const i64, offsets: __m128i, scale: i32,
) -> __m128i {
    let zero = _mm_setzero_si128().as_i64x2();
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1900,7 +1796,7 @@ pub unsafe fn _mm_mask_i64gather_epi64(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi64(
    slice: *const i64, offsets: __m256i, scale: i32
    slice: *const i64, offsets: __m256i, scale: i32,
) -> __m256i {
    let zero = _mm256_setzero_si256().as_i64x4();
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1954,7 +1850,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_pd(
    slice: *const f64, offsets: __m128i, scale: i32
    slice: *const f64, offsets: __m128i, scale: i32,
) -> __m128d {
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
@@ -2004,7 +1900,7 @@ pub unsafe fn _mm_mask_i64gather_pd(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_pd(
    slice: *const f64, offsets: __m256i, scale: i32
    slice: *const f64, offsets: __m256i, scale: i32,
) -> __m256d {
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
@@ -2053,7 +1949,7 @@ pub unsafe fn _mm256_mask_i64gather_pd(
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_inserti128_si256(
    a: __m256i, b: __m128i, imm8: i32
    a: __m256i, b: __m128i, imm8: i32,
) -> __m256i {
    let a = a.as_i64x4();
    let b = _mm256_castsi128_si256(b).as_i64x4();
@@ -2101,12 +1997,9 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_epi32(
    mem_addr: *const i32, mask: __m128i
    mem_addr: *const i32, mask: __m128i,
) -> __m128i {
    mem::transmute(maskloadd(
        mem_addr as *const i8,
        mask.as_i32x4(),
    ))
    mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
}

/// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2119,12 +2012,9 @@ pub unsafe fn _mm_maskload_epi32(
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_epi32(
    mem_addr: *const i32, mask: __m256i
    mem_addr: *const i32, mask: __m256i,
) -> __m256i {
    mem::transmute(maskloadd256(
        mem_addr as *const i8,
        mask.as_i32x8(),
    ))
    mem::transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
}

/// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2137,12 +2027,9 @@ pub unsafe fn _mm256_maskload_epi32(
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_epi64(
    mem_addr: *const i64, mask: __m128i
    mem_addr: *const i64, mask: __m128i,
) -> __m128i {
    mem::transmute(maskloadq(
        mem_addr as *const i8,
        mask.as_i64x2(),
    ))
    mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
}

/// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2155,12 +2042,9 @@ pub unsafe fn _mm_maskload_epi64(
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_epi64(
    mem_addr: *const i64, mask: __m256i
    mem_addr: *const i64, mask: __m256i,
) -> __m256i {
    mem::transmute(maskloadq256(
        mem_addr as *const i8,
        mask.as_i64x4(),
    ))
    mem::transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
}

/// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2173,13 +2057,9 @@ pub unsafe fn _mm256_maskload_epi64(
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_epi32(
    mem_addr: *mut i32, mask: __m128i, a: __m128i
    mem_addr: *mut i32, mask: __m128i, a: __m128i,
) {
    maskstored(
        mem_addr as *mut i8,
        mask.as_i32x4(),
        a.as_i32x4(),
    )
    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
}

/// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2192,13 +2072,9 @@ pub unsafe fn _mm_maskstore_epi32(
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_epi32(
    mem_addr: *mut i32, mask: __m256i, a: __m256i
    mem_addr: *mut i32, mask: __m256i, a: __m256i,
) {
    maskstored256(
        mem_addr as *mut i8,
        mask.as_i32x8(),
        a.as_i32x8(),
    )
    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
}

/// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2211,13 +2087,9 @@ pub unsafe fn _mm256_maskstore_epi32(
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_epi64(
    mem_addr: *mut i64, mask: __m128i, a: __m128i
    mem_addr: *mut i64, mask: __m128i, a: __m128i,
) {
    maskstoreq(
        mem_addr as *mut i8,
        mask.as_i64x2(),
        a.as_i64x2(),
    )
    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
}

/// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2230,13 +2102,9 @@ pub unsafe fn _mm_maskstore_epi64(
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_epi64(
    mem_addr: *mut i64, mask: __m256i, a: __m256i
    mem_addr: *mut i64, mask: __m256i, a: __m256i,
) {
    maskstoreq256(
        mem_addr as *mut i8,
        mask.as_i64x4(),
        a.as_i64x4(),
    )
    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
}

/// Compare packed 16-bit integers in `a` and `b`, and return the packed
@@ -2410,7 +2278,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mpsadbw_epu8(
    a: __m256i, b: __m256i, imm8: i32
    a: __m256i, b: __m256i, imm8: i32,
) -> __m256i {
    let a = a.as_u8x32();
    let b = b.as_u8x32();
@@ -2656,7 +2524,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2x128_si256(
    a: __m256i, b: __m256i, imm8: i32
    a: __m256i, b: __m256i, imm8: i32,
) -> __m256i {
    let a = a.as_i64x4();
    let b = b.as_i64x4();
@@ -3559,16 +3427,23 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
///     -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setr_epi8(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
///     -30, -31,
/// );
///
/// let c = _mm256_unpackhi_epi8(a, b);
///
/// let expected = _mm256_setr_epi8(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13,
///     14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30,
///     31,-31);
/// let expected = _mm256_setr_epi8(
///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
///     -31,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// # }
@@ -3612,15 +3487,22 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
///     -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setr_epi8(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
///     -30, -31,
/// );
///
/// let c = _mm256_unpacklo_epi8(a, b);
///
/// let expected = _mm256_setr_epi8(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7,
///     16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
/// let expected = _mm256_setr_epi8(
///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// # }
@@ -3664,13 +3546,18 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
/// let a = _mm256_setr_epi16(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// );
/// let b = _mm256_setr_epi16(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
/// );
///
/// let c = _mm256_unpackhi_epi16(a, b);
///
/// let expected = _mm256_setr_epi16(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14,
///     15,-15);
/// let expected = _mm256_setr_epi16(
///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// # }
@@ -3688,9 +3575,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    let r: i16x16 = simd_shuffle16(
        a.as_i16x16(),
        b.as_i16x16(),
        [
            4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31
        ],
        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
    );
    mem::transmute(r)
}
@@ -3715,13 +3600,18 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
///
/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
/// let a = _mm256_setr_epi16(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// );
/// let b = _mm256_setr_epi16(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
/// );
///
/// let c = _mm256_unpacklo_epi16(a, b);
///
/// let expected = _mm256_setr_epi16(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10,
///     11,-11);
/// let expected = _mm256_setr_epi16(
///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// # }
@@ -3739,9 +3629,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
    let r: i16x16 = simd_shuffle16(
        a.as_i16x16(),
        b.as_i16x16(),
        [
            0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27
        ],
        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
    );
    mem::transmute(r)
}
@@ -3832,11 +3720,8 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
    let r: i32x8 = simd_shuffle8(
        a.as_i32x8(),
        b.as_i32x8(),
        [0, 8, 1, 9, 4, 12, 5, 13],
    );
    let r: i32x8 =
        simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
    mem::transmute(r)
}

@@ -4183,35 +4068,35 @@ extern "C" {
    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.d"]
    fn pgatherdd(
        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8,
    ) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
    fn vpgatherdd(
        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8
        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8,
    ) -> i32x8;
    #[link_name = "llvm.x86.avx2.gather.d.q"]
    fn pgatherdq(
        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8
        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8,
    ) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
    fn vpgatherdq(
        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8
        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8,
    ) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.q.d"]
    fn pgatherqd(
        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8
        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8,
    ) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
    fn vpgatherqd(
        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8
        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8,
    ) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.q"]
    fn pgatherqq(
        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8
        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8,
    ) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
    fn vpgatherqq(
        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8
        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8,
    ) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.pd"]
    fn pgatherdpd(
@@ -4235,19 +4120,19 @@ extern "C" {
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.d.ps"]
    fn pgatherdps(
        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8
        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8,
    ) -> __m128;
    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
    fn vpgatherdps(
        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8
        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8,
    ) -> __m256;
    #[link_name = "llvm.x86.avx2.gather.q.ps"]
    fn pgatherqps(
        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8
        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8,
    ) -> __m128;
    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
    fn vpgatherqps(
        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8
        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8,
    ) -> __m128;
    #[link_name = "llvm.x86.avx2.psll.dq"]
    fn vpslldq(a: i64x4, b: i32) -> i64x4;
@@ -4718,10 +4603,7 @@ mod tests {
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi8(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2),
        );
        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
    }

    #[simd_test(enable = "avx2")]
@@ -4737,10 +4619,7 @@ mod tests {
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi16(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2),
        );
        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
    }

    #[simd_test(enable = "avx2")]
@@ -4758,10 +4637,7 @@ mod tests {
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        let b = _mm256_setr_epi64x(3, 2, 2, 0);
        let r = _mm256_cmpeq_epi64(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2),
        );
        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
    }

    #[simd_test(enable = "avx2")]
@@ -4769,10 +4645,7 @@ mod tests {
        let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_cmpgt_epi8(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0),
        );
        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
    }

    #[simd_test(enable = "avx2")]
@@ -4780,10 +4653,7 @@ mod tests {
        let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
        let b = _mm256_set1_epi16(0);
        let r = _mm256_cmpgt_epi16(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0),
        );
        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
    }

    #[simd_test(enable = "avx2")]
@@ -4791,10 +4661,7 @@ mod tests {
        let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
        let b = _mm256_set1_epi32(0);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0),
        );
        assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
    }

    #[simd_test(enable = "avx2")]
@@ -4802,10 +4669,7 @@ mod tests {
        let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_cmpgt_epi64(a, b);
        assert_eq_m256i(
            r,
            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0),
        );
        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
    }

    #[simd_test(enable = "avx2")]
@@ -5997,16 +5861,7 @@ mod tests {
        );
        assert_eq_m256(
            r,
            _mm256_setr_ps(
                0.0,
                16.0,
                64.0,
                256.0,
                256.0,
                256.0,
                256.0,
                256.0,
            ),
            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
        );
    }


@@ -21,17 +21,14 @@ use stdsimd_test::assert_instr;
#[cfg_attr(test, assert_instr(bextr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
    _bextr2_u32(
        a,
        (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32),
    )
    _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32))
}

/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
/// be extracted, and bits `[15,8]` specify the length of the range.
/// Bits `[7,0]` of `control` specify the index to the first bit in the range
/// to be extracted, and bits `[15,8]` specify the length of the range.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
#[inline]
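// Editor's note: a worked example (illustrative, not part of this commit) of
// the control-word encoding documented above: the start index goes in bits
// [7:0] and the length in bits [15:8].
#[cfg(target_arch = "x86_64")]
unsafe fn extract_middle_byte(a: u32) -> u32 {
    // start = 8, len = 8 -> control = 8 | (8 << 8) = 0x0808,
    // equivalent to `_bextr_u32(a, 8, 8)`.
    _bextr2_u32(a, 8 | (8 << 8))
}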

@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor(mem_addr: *const u8) {
#[cfg(test)]
mod tests {
    use coresimd::x86::*;
    use std::{fmt, cmp::PartialEq};
    use std::{cmp::PartialEq, fmt};
    use stdsimd_test::simd_test;

    #[repr(align(16))]

@@ -380,7 +380,7 @@ pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 {
#[inline]
#[target_feature(enable = "mmx")]
pub unsafe fn _mm_set_pi8(
    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8
    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> __m64 {
    _mm_setr_pi8(e0, e1, e2, e3, e4, e5, e6, e7)
}
@@ -426,7 +426,7 @@ pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 {
#[inline]
#[target_feature(enable = "mmx")]
pub unsafe fn _mm_setr_pi8(
    e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8
    e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8,
) -> __m64 {
    mem::transmute(i8x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}
@@ -514,12 +514,8 @@ mod tests {
            -30001,
            i16::max_value() - 1,
        );
        let e = _mm_setr_pi16(
            i16::min_value(),
            30000,
            -30000,
            i16::max_value(),
        );
        let e =
            _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value());
        assert_eq_m64(e, _mm_add_pi16(a, b));
        assert_eq_m64(e, _m_paddw(a, b));
    }
@@ -537,16 +533,8 @@ mod tests {
    unsafe fn test_mm_adds_pi8() {
        let a = _mm_setr_pi8(-100, -1, 1, 100, -1, 0, 1, 0);
        let b = _mm_setr_pi8(-100, 1, -1, 100, 0, -1, 0, 1);
        let e = _mm_setr_pi8(
            i8::min_value(),
            0,
            0,
            i8::max_value(),
            -1,
            -1,
            1,
            1,
        );
        let e =
            _mm_setr_pi8(i8::min_value(), 0, 0, i8::max_value(), -1, -1, 1, 1);
        assert_eq_m64(e, _mm_adds_pi8(a, b));
        assert_eq_m64(e, _m_paddsb(a, b));
    }

@@ -444,11 +444,12 @@ impl m256iExt for __m256i {
    }
}

use coresimd::simd::{f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8,
                     i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, i8x32, i8x8,
                     m16x16, m16x4, m16x8, m32x2, m32x4, m32x8, m64x2, m64x4,
                     m8x16, m8x32, m8x8, u16x16, u16x4, u16x8, u32x2, u32x4,
                     u32x8, u64x2, u64x4, u8x16, u8x32, u8x8};
use coresimd::simd::{
    f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8, i32x2, i32x4,
    i32x8, i64x2, i64x4, i8x16, i8x32, i8x8, m16x16, m16x4, m16x8, m32x2,
    m32x4, m32x8, m64x2, m64x4, m8x16, m8x32, m8x8, u16x16, u16x4, u16x8,
    u32x2, u32x4, u32x8, u64x2, u64x4, u8x16, u8x32, u8x8,
};

impl_from_bits_!(
    __m64: u32x2,

@@ -25,20 +25,25 @@ extern "C" {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
#[inline]
#[target_feature(enable = "pclmulqdq")]
#[cfg_attr(all(test, not(target_os = "linux")),
           assert_instr(pclmulqdq, imm8 = 0))]
#[cfg_attr(all(test, target_os = "linux"),
           assert_instr(pclmullqlqdq, imm8 = 0))]
#[cfg_attr(all(test, target_os = "linux"),
           assert_instr(pclmulhqlqdq, imm8 = 1))]
#[cfg_attr(all(test, target_os = "linux"),
           assert_instr(pclmullqhqdq, imm8 = 16))]
#[cfg_attr(all(test, target_os = "linux"),
           assert_instr(pclmulhqhqdq, imm8 = 17))]
#[cfg_attr(
    all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17)
)]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clmulepi64_si128(
    a: __m128i, b: __m128i, imm8: i32
    a: __m128i, b: __m128i, imm8: i32,
) -> __m128i {
    macro_rules! call {
        ($imm8:expr) => {
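// Editor's note: a hedged sketch (not from the commit) of the `imm8`
// convention encoded in the `assert_instr` attributes above: bit 0 selects
// the low/high 64-bit half of `a` and bit 4 the half of `b` to carry-less
// multiply.
#[cfg(target_arch = "x86_64")]
unsafe fn clmul_low_by_high(a: __m128i, b: __m128i) -> __m128i {
    // imm8 = 0x10: low half of `a` times high half of `b` (pclmullqhqdq).
    _mm_clmulepi64_si128(a, b, 0x10)
}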

@@ -1,4 +1,3 @@

//! RDRAND and RDSEED instructions for returning random numbers from an Intel
//! on-chip hardware random number generator which has been seeded by an
//! on-chip entropy source.

@@ -75,7 +75,7 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sha1rnds4_epu32(
    a: __m128i, b: __m128i, func: i32
    a: __m128i, b: __m128i, func: i32,
) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
@@ -126,13 +126,9 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(sha256rnds2))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sha256rnds2_epu32(
    a: __m128i, b: __m128i, k: __m128i
    a: __m128i, b: __m128i, k: __m128i,
) -> __m128i {
    mem::transmute(sha256rnds2(
        a.as_i32x4(),
        b.as_i32x4(),
        k.as_i32x4(),
    ))
    mem::transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4()))
}

#[cfg(test)]

@@ -230,8 +230,10 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(andps))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -249,8 +251,10 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(andnps))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -265,8 +269,10 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(orps))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -281,8 +287,10 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(xorps))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -1132,10 +1140,14 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))]
// 32-bit codegen does not generate `movhps` or `movhpd`, but instead
// `movsd` followed by `unpcklpd` (or `movss`/`unpcklps` if there's no SSE2).
#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
           assert_instr(movlhps))]
#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
           assert_instr(unpcklps))]
#[cfg_attr(
    all(test, target_arch = "x86", target_feature = "sse2"),
    assert_instr(movlhps)
)]
#[cfg_attr(
    all(test, target_arch = "x86", not(target_feature = "sse2")),
    assert_instr(unpcklps)
)]
// TODO: This function is actually not limited to floats, but that's what
// matches the C type most closely: (__m128, *const __m64) -> __m128
pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
@@ -1185,11 +1197,15 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
// #[cfg_attr(test, assert_instr(movlps))]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
// On 32-bit targets with SSE2, it just generates two `movsd`.
#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
           assert_instr(movsd))]
#[cfg_attr(
    all(test, target_arch = "x86", target_feature = "sse2"),
    assert_instr(movsd)
)]
// It should really generate "movlps", but oh well...
#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
           assert_instr(movss))]
#[cfg_attr(
    all(test, target_arch = "x86", not(target_feature = "sse2")),
    assert_instr(movss)
)]
// TODO: Like _mm_loadh_pi, this also isn't limited to floats.
pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 {
    let q = p as *const f32x2;
@@ -1321,8 +1337,10 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's
// fine.
// On i586 (no SSE2) it just generates plain MOV instructions.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(movhpd))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(movhpd)
)]
pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
    #[cfg(target_arch = "x86")]
    {
@@ -1349,8 +1367,10 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
#[inline]
#[target_feature(enable = "sse")]
// On i586 the codegen just generates plain MOVs. No need to test for that.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(movlps))]
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(movlps)
)]
pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) {
    #[cfg(target_arch = "x86")]
    {
@@ -1929,7 +1949,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 {
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_TRANSPOSE4_PS(
    row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128
    row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128,
) {
    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
|
||||
@@ -2734,12 +2754,8 @@ mod tests {
|
||||
|
||||
let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
|
||||
let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
|
||||
let e2: u32x4 = transmute(_mm_setr_ps(
|
||||
transmute(0xffffffffu32),
|
||||
2.0,
|
||||
3.0,
|
||||
4.0,
|
||||
));
|
||||
let e2: u32x4 =
|
||||
transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
|
||||
assert_eq!(r2, e2);
|
||||
}
|
||||
|
||||
@@ -3453,22 +3469,9 @@ mod tests {
|
||||
|
||||
#[simd_test(enable = "sse")]
|
||||
unsafe fn test_mm_cvtss_si32() {
|
||||
let inputs = &[
|
||||
42.0f32,
|
||||
-3.1,
|
||||
4.0e10,
|
||||
4.0e-20,
|
||||
NAN,
|
||||
2147483500.1,
|
||||
];
|
||||
let result = &[
|
||||
42i32,
|
||||
-3,
|
||||
i32::min_value(),
|
||||
0,
|
||||
i32::min_value(),
|
||||
2147483520,
|
||||
];
|
||||
let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
|
||||
let result =
|
||||
&[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520];
|
||||
for i in 0..inputs.len() {
|
||||
let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
|
||||
let e = result[i];
|
||||
@@ -3672,10 +3675,8 @@ mod tests {
|
||||
}
|
||||
|
||||
let r = _mm_load_ps(p);
|
||||
let e = _mm_add_ps(
|
||||
_mm_setr_ps(1.0, 2.0, 3.0, 4.0),
|
||||
_mm_set1_ps(fixup),
|
||||
);
|
||||
let e =
|
||||
_mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
|
||||
assert_eq_m128(r, e);
|
||||
}
|
||||
|
||||
@@ -3705,10 +3706,8 @@ mod tests {
|
||||
}
|
||||
|
||||
let r = _mm_loadr_ps(p);
|
||||
let e = _mm_add_ps(
|
||||
_mm_setr_ps(4.0, 3.0, 2.0, 1.0),
|
||||
_mm_set1_ps(fixup),
|
||||
);
|
||||
let e =
|
||||
_mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
|
||||
assert_eq_m128(r, e);
|
||||
}
|
||||
|
||||
@@ -3947,9 +3946,7 @@ mod tests {
|
||||
#[simd_test(enable = "sse")]
|
||||
unsafe fn test_mm_stream_ps() {
|
||||
let a = _mm_set1_ps(7.0);
|
||||
let mut mem = Memory {
|
||||
data: [-1.0; 4],
|
||||
};
|
||||
let mut mem = Memory { data: [-1.0; 4] };
|
||||
|
||||
_mm_stream_ps(&mut mem.data[0] as *mut f32, a);
|
||||
for i in 0..4 {
|
||||
@@ -4157,12 +4154,8 @@ mod tests {
|
||||
|
||||
#[simd_test(enable = "sse,mmx")]
|
||||
unsafe fn test_mm_movemask_pi8() {
|
||||
let a = _mm_setr_pi16(
|
||||
0b1000_0000,
|
||||
0b0100_0000,
|
||||
0b1000_0000,
|
||||
0b0100_0000,
|
||||
);
|
||||
let a =
|
||||
_mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
|
||||
let r = _mm_movemask_pi8(a);
|
||||
assert_eq!(r, 0b10001);
|
||||
|
||||
|
||||
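// A scalar model (hypothetical helper, not from the crate) of what the
// `_mm_movemask_pi8` assertion above checks: each of the eight bytes
// contributes its sign bit to one result bit. `0b1000_0000u16` carries 0x80
// in its low byte, so elements 0 and 2 set result bits 0 and 4 -> 0b10001.
fn movemask_pi8_model(bytes: [u8; 8]) -> i32 {
    bytes
        .iter()
        .enumerate()
        .fold(0, |m, (i, b)| m | ((((b >> 7) & 1) as i32) << i))
}
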
@@ -1010,7 +1010,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> __m128i {
    mem::transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}
@@ -1095,7 +1095,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}
@@ -1134,10 +1134,15 @@ pub unsafe fn _mm_setzero_si128() -> __m128i {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movsd on windows
#[cfg_attr(all(test, not(windows),
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
               target_arch = "x86_64"),
           assert_instr(movq))]
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, simd_extract((*mem_addr).as_i64x2(), 0))
@@ -1190,7 +1195,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(
    a: __m128i, mask: __m128i, mem_addr: *mut i8
    a: __m128i, mask: __m128i, mem_addr: *mut i8,
) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}
@@ -1229,10 +1234,15 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME mov on windows, movlps on i686
#[cfg_attr(all(test, not(windows),
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
               target_arch = "x86_64"),
           assert_instr(movq))]
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(
@@ -1275,8 +1285,9 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on windows, movd on i686
#[cfg_attr(all(test, not(windows), target_arch = "x86_64"),
           assert_instr(movq))]
#[cfg_attr(
    all(test, not(windows), target_arch = "x86_64"), assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
@@ -1341,11 +1352,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(
        a.as_i16x8(),
        (imm8 & 7) as u32,
        i as i16,
    ))
    mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16))
}

/// Return a mask of the most significant bit of each element in `a`.
@@ -1443,16 +1450,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i {
        simd_shuffle8(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                $x01 + 4,
                $x23 + 4,
                $x45 + 4,
                $x67 + 4,
            ],
            [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4],
        )
    };
}
@@ -1567,9 +1565,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_shuffle16(
        a.as_i8x16(),
        b.as_i8x16(),
        [
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
        ],
        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
    ))
}

@@ -1630,9 +1626,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_shuffle16(
        a.as_i8x16(),
        b.as_i8x16(),
        [
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
        ],
        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
    ))
}

@@ -1644,11 +1638,8 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    let x = simd_shuffle8(
        a.as_i16x8(),
        b.as_i16x8(),
        [0, 8, 1, 9, 2, 10, 3, 11],
    );
    let x =
        simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
    mem::transmute::<i16x8, _>(x)
}

@@ -1947,11 +1938,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(
        _mm_cmplt_sd(b, a),
        1,
        simd_extract::<_, f64>(a, 1),
    )
    simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the
@@ -1963,11 +1950,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(
        _mm_cmple_sd(b, a),
        1,
        simd_extract::<_, f64>(a, 1),
    )
    simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the result
@@ -2042,11 +2025,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(
        _mm_cmpnlt_sd(b, a),
        1,
        simd_extract::<_, f64>(a, 1),
    )
    simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the
@@ -2058,11 +2037,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(
        _mm_cmpnle_sd(b, a),
        1,
        simd_extract::<_, f64>(a, 1),
    )
    simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Compare corresponding elements in `a` and `b` for equality.
@@ -2881,8 +2856,9 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
/// The resulting `__m128d` element is composed by the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
/// input * The `[63:0]` bits are copied from the `[127:64]` bits of the first
/// input
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
#[inline]
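// A scalar sketch (hypothetical helper) of the `_mm_unpackhi_pd` doc text
// above: the result packs the high (`[127:64]`) lanes of both inputs.
fn unpackhi_pd_model(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
    // result bits [63:0]   <- bits [127:64] of the first input
    // result bits [127:64] <- bits [127:64] of the second input
    [a[1], b[1]]
}
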
@@ -3223,22 +3199,7 @@ mod tests {
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi8() {
        let a = _mm_setr_epi8(
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
@@ -3290,22 +3251,7 @@ mod tests {
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8() {
        let a = _mm_setr_epi8(
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
@@ -3363,22 +3309,7 @@ mod tests {
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu8() {
        let a = _mm_setr_epi8(
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
@@ -3629,22 +3560,7 @@ mod tests {
        );
        let r = _mm_slli_si128(a, 1);
        let e = _mm_setr_epi8(
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, e);

@@ -3888,41 +3804,10 @@ mod tests {
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi8() {
        let a = _mm_setr_epi8(
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
        );
        let b = _mm_setr_epi8(
            15,
            14,
            2,
            12,
            11,
            10,
            9,
            8,
            7,
            6,
            5,
            4,
            3,
            2,
            1,
            0,
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        let b =
            _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[cfg_attr(rustfmt, rustfmt_skip)]
        assert_eq_m128i(
@@ -4869,9 +4754,7 @@ mod tests {
        pub data: [f64; 2],
    }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory {
            data: [-1.0; 2],
        };
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(&mut mem.data[0] as *mut f64, a);
        for i in 0..2 {
@@ -4889,9 +4772,7 @@ mod tests {

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd() {
        let mut mem = Memory {
            data: [0.0f64; 4],
        };
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();
@@ -4903,9 +4784,7 @@ mod tests {

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_pd() {
        let mut mem = Memory {
            data: [0.0f64; 4],
        };
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

@@ -4929,9 +4808,7 @@ mod tests {

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd() {
        let mut mem = Memory {
            data: [0.0f64; 4],
        };
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();
@@ -4943,9 +4820,7 @@ mod tests {

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd1() {
        let mut mem = Memory {
            data: [0.0f64; 4],
        };
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();
@@ -4957,9 +4832,7 @@ mod tests {

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storer_pd() {
        let mut mem = Memory {
            data: [0.0f64; 4],
        };
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();
@@ -5013,10 +4886,7 @@ mod tests {
        }

        let r = _mm_loadu_pd(d);
        let e = _mm_add_pd(
            _mm_setr_pd(1.0, 2.0),
            _mm_set1_pd(offset as f64),
        );
        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
        assert_eq_m128d(r, e);
    }

@@ -5091,12 +4961,8 @@ mod tests {

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(
            -1.1,
            f32::NEG_INFINITY,
            f32::MAX,
            f32::NEG_INFINITY,
        );
        let a =
            _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);
@@ -5161,12 +5027,8 @@ mod tests {
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(
            f32::NEG_INFINITY,
            f32::INFINITY,
            f32::MIN,
            f32::MAX,
        );
        let a =
            _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(
            r,

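// A scalar model of the truncating conversion exercised by the tests above
// (inferred from their expected values, not the crate's implementation):
// NaN and out-of-range inputs yield the x86 "integer indefinite" value.
fn cvtt_f32_to_i32_model(x: f32) -> i32 {
    if x.is_nan() || x < -2147483648.0 || x >= 2147483648.0 {
        i32::min_value()
    } else {
        x as i32 // truncates toward zero
    }
}
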
@@ -66,13 +66,9 @@ pub const _MM_FROUND_NEARBYINT: i32 =
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(
    a: __m128i, b: __m128i, mask: __m128i
    a: __m128i, b: __m128i, mask: __m128i,
) -> __m128i {
    mem::transmute(pblendvb(
        a.as_i8x16(),
        b.as_i8x16(),
        mask.as_i8x16(),
    ))
    mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
@@ -250,11 +246,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(
        a.as_i8x16(),
        (imm8 & 0b1111) as u32,
        i as i8,
    ))
    mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
}

/// Return a copy of `a` with the 32-bit integer from `i` inserted at a
@@ -267,11 +259,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(
        a.as_i32x4(),
        (imm8 & 0b11) as u32,
        i,
    ))
    mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
}

/// Compare packed 8-bit integers in `a` and `b` and return packed maximum
@@ -1778,16 +1766,12 @@ mod tests {
        }
        {
            let a = _mm_setr_epi32(
                15,
                2, /* ignored */
                1234567,
                4, /* ignored */
                15, 2, /* ignored */
                1234567, 4, /* ignored */
            );
            let b = _mm_setr_epi32(
                -20,
                -256, /* ignored */
                666666,
                666666, /* ignored */
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);

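// Sanity-checking the expected values above (a sketch, not part of the
// test): `_mm_mul_epi32` multiplies only the even-indexed 32-bit lanes,
// widening to 64 bits, so the `/* ignored */` lanes never participate.
fn mul_epi32_expected_values() {
    assert_eq!(15i64 * -20, -300);
    assert_eq!(1_234_567i64 * 666_666, 823_043_843_622);
}
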
@@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrm(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -544,7 +544,7 @@ pub unsafe fn _mm_cmpestrm(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestri(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -567,7 +567,7 @@ pub unsafe fn _mm_cmpestri(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrz(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -590,7 +590,7 @@ pub unsafe fn _mm_cmpestrz(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrc(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -613,7 +613,7 @@ pub unsafe fn _mm_cmpestrc(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrs(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -636,7 +636,7 @@ pub unsafe fn _mm_cmpestrs(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestro(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -660,7 +660,7 @@ pub unsafe fn _mm_cmpestro(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestra(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -917,13 +917,8 @@ mod tests {
    unsafe fn test_mm_cmpestra() {
        let a = str_to_m128i(b"Cannot match a");
        let b = str_to_m128i(b"Null after 14");
        let i = _mm_cmpestra(
            a,
            14,
            b,
            16,
            _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK,
        );
        let i =
            _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK);
        assert_eq!(1, i);
    }

@@ -25,8 +25,8 @@ extern "C" {
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
///
/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All other
/// bits are ignored.
/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
/// other bits are ignored.
///
/// If the length is zero, it is interpreted as `64`. If the length and index
/// are zero, the lower 64 bits of `x` are extracted.

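// Packing the `y` descriptor described above (hypothetical helper; the
// field layout comes from the doc comment: length in bits [5:0], index in
// bits [13:8], all other bits ignored).
fn pack_bit_range_descriptor(index: u64, length: u64) -> u64 {
    ((index & 0x3f) << 8) | (length & 0x3f)
}
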
@@ -596,24 +596,8 @@ mod tests {
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let expected = _mm_setr_epi8(
            5,
            0,
            5,
            4,
            9,
            13,
            7,
            4,
            13,
            6,
            6,
            11,
            5,
            2,
            9,
            1,
        );
        let expected =
            _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
        let r = _mm_shuffle_epi8(a, b);
        assert_eq_m128i(r, expected);
    }

@@ -121,7 +121,7 @@ mod x86_polyfill {

    #[target_feature(enable = "avx2")]
    pub unsafe fn _mm256_insert_epi64(
        a: __m256i, val: i64, idx: i32
        a: __m256i, val: i64, idx: i32,
    ) -> __m256i {
        union A {
            a: __m256i,

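// The polyfill above pokes one 64-bit lane of a 256-bit value through a
// union; a safe scalar model (hypothetical helper) of the same operation:
fn insert_epi64_model(v: [i64; 4], val: i64, idx: usize) -> [i64; 4] {
    let mut out = v;
    out[idx % 4] = val; // write the selected lane
    out
}
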
@@ -38,11 +38,7 @@ extern "C" {
#[cfg_attr(test, assert_instr(xsave))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
    xsave(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial restore of the enabled processor states using
@@ -110,11 +106,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
#[cfg_attr(test, assert_instr(xsaveopt))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial save of the enabled processor states to memory
@@ -130,11 +122,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsavec))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
    xsavec(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial save of the enabled processor states to memory at
@@ -151,11 +139,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsaves))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
    xsaves(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial restore of the enabled processor states using the
@@ -196,9 +180,7 @@ mod tests {

    impl XsaveArea {
        fn new() -> XsaveArea {
            XsaveArea {
                data: [0; 2560],
            }
            XsaveArea { data: [0; 2560] }
        }
        fn ptr(&mut self) -> *mut u8 {
            &mut self.data[0] as *mut _ as *mut u8

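// The pattern repeated by the `_xsave*` wrappers above: the 64-bit state
// mask is split into the high and low 32-bit halves the underlying
// instruction expects (a minimal sketch, assumed helper):
fn split_save_mask(save_mask: u64) -> (u32, u32) {
    ((save_mask >> 32) as u32, save_mask as u32)
}
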
@@ -28,8 +28,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
/// be extracted, and bits `[15,8]` specify the length of the range.
/// Bits `[7,0]` of `control` specify the index to the first bit in the range
/// to be extracted, and bits `[15,8]` specify the length of the range.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
#[inline]

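// Building the `control` operand described above (hypothetical helper;
// start index in bits [7,0], length in bits [15,8]):
fn bextr2_control(start: u64, len: u64) -> u64 {
    (start & 0xff) | ((len & 0xff) << 8)
}
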
@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor64(mem_addr: *const u8) {
#[cfg(test)]
mod tests {
    use coresimd::x86_64::*;
    use std::{fmt, cmp::PartialEq};
    use std::{cmp::PartialEq, fmt};
    use stdsimd_test::simd_test;

    #[repr(align(16))]

@@ -36,11 +36,7 @@ extern "C" {
#[cfg_attr(test, assert_instr(xsave64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
    xsave64(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial restore of the enabled processor states using
@@ -73,11 +69,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
#[cfg_attr(test, assert_instr(xsaveopt64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt64(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial save of the enabled processor states to memory
@@ -93,11 +85,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsavec64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
    xsavec64(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial save of the enabled processor states to memory at
@@ -114,11 +102,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsaves64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
    xsaves64(
        mem_addr,
        (save_mask >> 32) as u32,
        save_mask as u32,
    );
    xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}

/// Perform a full or partial restore of the enabled processor states using the

@@ -21,7 +21,7 @@ use proc_macro2::TokenStream;

#[proc_macro_attribute]
pub fn assert_instr(
    attr: proc_macro::TokenStream, item: proc_macro::TokenStream
    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
) -> proc_macro::TokenStream {
    let invoc = syn::parse::<Invoc>(attr)
        .expect("expected #[assert_instr(instr, a = b, ...)]");
@@ -36,9 +36,10 @@ pub fn assert_instr(
    let name = &func.ident;

    // Disable assert_instr for x86 targets compiled with avx enabled, which
    // causes LLVM to generate different intrinsics than the ones we are testing
    // for.
    let disable_assert_instr = std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
    // causes LLVM to generate different intrinsics than the ones we are
    // testing for.
    let disable_assert_instr =
        std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
    let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
        TokenStream::new()
    } else {
@@ -72,11 +73,7 @@ pub fn assert_instr(
        syn::Pat::Ident(ref i) => &i.ident,
        _ => panic!("must have bare arguments"),
    };
    match invoc
        .args
        .iter()
        .find(|a| *ident == a.0)
    {
    match invoc.args.iter().find(|a| *ident == a.0) {
        Some(&(_, ref tts)) => {
            input_vals.push(quote! { #tts });
        }
@@ -87,7 +84,8 @@ pub fn assert_instr(
        };
    }

    let attrs = func.attrs
    let attrs = func
        .attrs
        .iter()
        .filter(|attr| {
            attr.path
@@ -142,9 +140,8 @@ pub fn assert_instr(
        }
    }.into();
    // why? necessary now to get tests to work?
    let tts: TokenStream = tts.to_string()
        .parse()
        .expect("cannot parse tokenstream");
    let tts: TokenStream =
        tts.to_string().parse().expect("cannot parse tokenstream");

    let tts: TokenStream = quote! {
        #item

@@ -1,8 +1,5 @@
use std::env;

fn main() {
    println!(
        "cargo:rustc-env=TARGET={}",
        env::var("TARGET").unwrap()
    );
    println!("cargo:rustc-env=TARGET={}", env::var("TARGET").unwrap());
}

@@ -9,29 +9,35 @@
#![cfg_attr(stdsimd_strict, deny(warnings))]
#![allow(dead_code)]
#![allow(unused_features)]
#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd,
           simd_ffi, asm, proc_macro_gen,
           integer_atomics, stmt_expr_attributes, core_intrinsics,
           crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd,
           staged_api, core_float, core_slice_ext, align_offset,
           doc_cfg, mmx_target_feature, tbm_target_feature,
           sse4a_target_feature, arm_target_feature, aarch64_target_feature,
           mips_target_feature, powerpc_target_feature)]
#![cfg_attr(test,
            feature(proc_macro, test, attr_literals, abi_vectorcall,
                    untagged_unions))]
#![cfg_attr(feature = "cargo-clippy",
            allow(inline_always, too_many_arguments, cast_sign_loss,
                  cast_lossless, cast_possible_wrap,
                  cast_possible_truncation, cast_precision_loss,
#![feature(
    const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
    asm, proc_macro_gen, integer_atomics, stmt_expr_attributes,
    core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs,
    stdsimd, staged_api, core_float, core_slice_ext, align_offset, doc_cfg,
    mmx_target_feature, tbm_target_feature, sse4a_target_feature,
    arm_target_feature, aarch64_target_feature, mips_target_feature,
    powerpc_target_feature
)]
#![cfg_attr(
    test,
    feature(proc_macro, test, attr_literals, abi_vectorcall, untagged_unions)
)]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(
        inline_always, too_many_arguments, cast_sign_loss, cast_lossless,
        cast_possible_wrap, cast_possible_truncation, cast_precision_loss,
        shadow_reuse, cyclomatic_complexity, similar_names,
                  many_single_char_names))]
        many_single_char_names
    )
)]
#![cfg_attr(test, allow(unused_imports))]
#![no_core]
#![unstable(feature = "stdsimd", issue = "27731")]
#![doc(test(attr(deny(warnings))),
       test(attr(allow(dead_code, deprecated, unused_variables,
                       unused_mut))))]
#![doc(
    test(attr(deny(warnings))),
    test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
)]

#[cfg_attr(not(test), macro_use)]
extern crate core as _core;

@@ -1,7 +1,9 @@
#![feature(stdsimd)]
#![cfg_attr(stdsimd_strict, deny(warnings))]
#![cfg_attr(feature = "cargo-clippy",
            allow(option_unwrap_used, print_stdout, use_debug))]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(option_unwrap_used, print_stdout, use_debug)
)]

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[macro_use]
@@ -14,53 +16,20 @@ fn x86_all() {
    println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
    println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
    println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
    println!(
        "sse4.1: {:?}",
        is_x86_feature_detected!("sse4.1")
    );
    println!(
        "sse4.2: {:?}",
        is_x86_feature_detected!("sse4.2")
    );
    println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
    println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
    println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
    println!("avx: {:?}", is_x86_feature_detected!("avx"));
    println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
    println!(
        "avx512f {:?}",
        is_x86_feature_detected!("avx512f")
    );
    println!(
        "avx512cd {:?}",
        is_x86_feature_detected!("avx512cd")
    );
    println!(
        "avx512er {:?}",
        is_x86_feature_detected!("avx512er")
    );
    println!(
        "avx512pf {:?}",
        is_x86_feature_detected!("avx512pf")
    );
    println!(
        "avx512bw {:?}",
        is_x86_feature_detected!("avx512bw")
    );
    println!(
        "avx512dq {:?}",
        is_x86_feature_detected!("avx512dq")
    );
    println!(
        "avx512vl {:?}",
        is_x86_feature_detected!("avx512vl")
    );
    println!(
        "avx512_ifma {:?}",
        is_x86_feature_detected!("avx512ifma")
    );
    println!(
        "avx512_vbmi {:?}",
        is_x86_feature_detected!("avx512vbmi")
    );
    println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
    println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
    println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
    println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
    println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
    println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
    println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
    println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
    println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
    println!(
        "avx512_vpopcntdq {:?}",
        is_x86_feature_detected!("avx512vpopcntdq")
@@ -70,23 +39,11 @@ fn x86_all() {
    println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
    println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
    println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
    println!(
        "popcnt: {:?}",
        is_x86_feature_detected!("popcnt")
    );
    println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
    println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
    println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
    println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
    println!(
        "xsaveopt: {:?}",
        is_x86_feature_detected!("xsaveopt")
    );
    println!(
        "xsaves: {:?}",
        is_x86_feature_detected!("xsaves")
    );
    println!(
        "xsavec: {:?}",
        is_x86_feature_detected!("xsavec")
    );
    println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
    println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
    println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
}

@@ -253,11 +253,7 @@ macro_rules! product_nan_test {
            }
        }
        let v = $id::splat(n0);
        assert!(
            v.product().is_nan(),
            "all nans | {:?}",
            v
        );
        assert!(v.product().is_nan(), "all nans | {:?}", v);
    }
    unsafe { test_fn() };
}
@@ -355,8 +351,7 @@ mod offset {
            // tolerate 1 ULP difference:
            if vsum.as_int() > tsum.as_int() {
                assert!(
                    vsum.as_int() - tsum.as_int()
                        < 2,
                    vsum.as_int() - tsum.as_int() < 2,
                    "v: {:?} | vsum: {} | tsum: {}",
                    v,
                    vsum,
@@ -364,8 +359,7 @@ mod offset {
                );
            } else {
                assert!(
                    tsum.as_int() - vsum.as_int()
                        < 2,
                    tsum.as_int() - vsum.as_int() < 2,
                    "v: {:?} | vsum: {} | tsum: {}",
                    v,
                    vsum,

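// The "tolerate 1 ULP" assertions above compare integer representations;
// for same-sign finite floats, adjacent representable values differ by
// exactly one there. A minimal `f32` sketch of the same idea (assumed
// helper):
fn within_one_ulp(a: f32, b: f32) -> bool {
    let (ai, bi) = (a.to_bits() as i64, b.to_bits() as i64);
    (ai - bi).abs() < 2
}
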
@@ -12,7 +12,7 @@ extern crate quote;

use std::env;

use proc_macro2::{Literal, Span, Ident, TokenStream, TokenTree};
use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};

fn string(s: &str) -> TokenTree {
    Literal::string(s).into()
@@ -20,11 +20,9 @@ fn string(s: &str) -> TokenTree {

#[proc_macro_attribute]
pub fn simd_test(
    attr: proc_macro::TokenStream, item: proc_macro::TokenStream
    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
) -> proc_macro::TokenStream {
    let tokens = TokenStream::from(attr)
        .into_iter()
        .collect::<Vec<_>>();
    let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
    if tokens.len() != 3 {
        panic!("expected #[simd_test(enable = \"feature\")]");
    }
@@ -53,18 +51,19 @@ pub fn simd_test(
    let item = TokenStream::from(item);
    let name = find_name(item.clone());

    let name: TokenStream = name.to_string().parse().expect(&format!(
        "failed to parse name: {}",
        name.to_string()
    ));
    let name: TokenStream = name
        .to_string()
        .parse()
        .expect(&format!("failed to parse name: {}", name.to_string()));

    let target = env::var("TARGET")
        .expect("TARGET environment variable should be set for rustc");
    let mut force_test = false;
    let macro_test = match target.split('-').next().expect(&format!(
        "target triple contained no \"-\": {}",
        target
    )) {
    let macro_test = match target
        .split('-')
        .next()
        .expect(&format!("target triple contained no \"-\": {}", target))
    {
        "i686" | "x86_64" | "i586" => "is_x86_feature_detected",
        "arm" | "armv7" => "is_arm_feature_detected",
        "aarch64" => "is_aarch64_feature_detected",

|
||||
//! assertions about the disassembly of a function.
|
||||
|
||||
#![feature(proc_macro)]
|
||||
#![cfg_attr(feature = "cargo-clippy",
|
||||
allow(missing_docs_in_private_items, print_stdout))]
|
||||
#![cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(missing_docs_in_private_items, print_stdout)
|
||||
)]
|
||||
|
||||
extern crate assert_instr_macro;
|
||||
extern crate backtrace;
|
||||
@@ -25,7 +27,8 @@ pub use assert_instr_macro::*;
|
||||
pub use simd_test_macro::*;
|
||||
|
||||
lazy_static! {
|
||||
static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
|
||||
static ref DISASSEMBLY: HashMap<String, Vec<Function>> =
|
||||
disassemble_myself();
|
||||
}
|
||||
|
||||
struct Function {
|
||||
@@ -39,14 +42,16 @@ struct Instruction {
|
||||
fn disassemble_myself() -> HashMap<String, Vec<Function>> {
|
||||
let me = env::current_exe().expect("failed to get current exe");
|
||||
|
||||
if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows")
|
||||
if cfg!(target_arch = "x86_64")
|
||||
&& cfg!(target_os = "windows")
|
||||
&& cfg!(target_env = "msvc")
|
||||
{
|
||||
let mut cmd = cc::windows_registry::find(
|
||||
"x86_64-pc-windows-msvc",
|
||||
"dumpbin.exe",
|
||||
).expect("failed to find `dumpbin` tool");
|
||||
let output = cmd.arg("/DISASM")
|
||||
let output = cmd
|
||||
.arg("/DISASM")
|
||||
.arg(&me)
|
||||
.output()
|
||||
.expect("failed to execute dumpbin");
|
||||
@@ -257,9 +262,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
||||
// in the disassembly.
|
||||
let mut sym = None;
|
||||
backtrace::resolve(fnptr as *mut _, |name| {
|
||||
sym = name.name()
|
||||
.and_then(|s| s.as_str())
|
||||
.map(normalize);
|
||||
sym = name.name().and_then(|s| s.as_str()).map(normalize);
|
||||
});
|
||||
|
||||
let functions =
|
||||
@@ -270,26 +273,17 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
||||
println!("assumed symbol name: `{}`", sym);
|
||||
}
|
||||
println!("maybe related functions");
|
||||
for f in DISASSEMBLY
|
||||
.keys()
|
||||
.filter(|k| k.contains(fnname))
|
||||
{
|
||||
for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) {
|
||||
println!("\t- {}", f);
|
||||
}
|
||||
panic!(
|
||||
"failed to find disassembly of {:#x} ({})",
|
||||
fnptr, fnname
|
||||
);
|
||||
panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
|
||||
};
|
||||
|
||||
assert_eq!(functions.len(), 1);
|
||||
let function = &functions[0];
|
||||
|
||||
let mut instrs = &function.instrs[..];
|
||||
while instrs
|
||||
.last()
|
||||
.map_or(false, |s| s.parts == ["nop"])
|
||||
{
|
||||
while instrs.last().map_or(false, |s| s.parts == ["nop"]) {
|
||||
instrs = &instrs[..instrs.len() - 1];
|
||||
}
|
||||
|
||||
@@ -400,10 +394,7 @@ pub fn assert_skip_test_ok(name: &str) {
|
||||
if env::var("STDSIMD_TEST_EVERYTHING").is_err() {
|
||||
return;
|
||||
}
|
||||
panic!(
|
||||
"skipped test `{}` when it shouldn't be skipped",
|
||||
name
|
||||
);
|
||||
panic!("skipped test `{}` when it shouldn't be skipped", name);
|
||||
}
|
||||
|
||||
// See comment in `assert-instr-macro` crate for why this exists
|
||||
|
||||
@@ -1,9 +1,12 @@
#![feature(proc_macro)]
#![allow(bad_style)]
#![cfg_attr(feature = "cargo-clippy",
            allow(shadow_reuse, cast_lossless, match_same_arms,
                  nonminimal_bool, print_stdout, use_debug, eq_op,
                  useless_format))]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(
        shadow_reuse, cast_lossless, match_same_arms, nonminimal_bool,
        print_stdout, use_debug, eq_op, useless_format
    )
)]

#[macro_use]
extern crate serde_derive;
@@ -249,10 +252,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
        .flat_map(|c| c.to_lowercase())
        .collect::<String>();

    let rust_feature = rust.target_feature.expect(&format!(
        "no target feature listed for {}",
        rust.name
    ));
    let rust_feature = rust
        .target_feature
        .expect(&format!("no target feature listed for {}", rust.name));
    if rust_feature.contains(&cpuid) {
        continue;
    }
@@ -314,24 +316,19 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
    if rust.arguments.len() != intel.parameters.len() {
        bail!("wrong number of arguments on {}", rust.name)
    }
    for (i, (a, b)) in intel
        .parameters
        .iter()
        .zip(rust.arguments)
        .enumerate()
    for (i, (a, b)) in
        intel.parameters.iter().zip(rust.arguments).enumerate()
    {
        let is_const = rust.required_const.contains(&i);
        equate(b, &a.type_, &intel.name, is_const)?;
    }
    }

    let any_i64 = rust.arguments
        .iter()
        .cloned()
        .chain(rust.ret)
        .any(|arg| match *arg {
    let any_i64 = rust.arguments.iter().cloned().chain(rust.ret).any(|arg| {
        match *arg {
            Type::PrimSigned(64) | Type::PrimUnsigned(64) => true,
            _ => false,
        }
    });
    let any_i64_exempt = match rust.name {
        // These intrinsics have all been manually verified against Clang's
@@ -363,7 +360,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
}

fn equate(
    t: &Type, intel: &str, intrinsic: &str, is_const: bool
    t: &Type, intel: &str, intrinsic: &str, is_const: bool,
) -> Result<(), String> {
    let intel = intel.replace(" *", "*");
    let intel = intel.replace(" const*", "*");
@@ -371,9 +368,7 @@ fn equate(
        if is_const {
            return Ok(());
        }
        Err(format!(
            "argument required to be const but isn't"
        ))
        Err(format!("argument required to be const but isn't"))
    };
    match (t, &intel[..]) {
        (&Type::PrimFloat(32), "float") => {}

@@ -1,11 +1,20 @@
#![feature(stdsimd)]
#![cfg_attr(stdsimd_strict, deny(warnings))]
#![cfg_attr(feature = "cargo-clippy",
            allow(option_unwrap_used, use_debug, print_stdout))]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(option_unwrap_used, use_debug, print_stdout)
)]

#[cfg(any(target_arch = "arm", target_arch = "aarch64",
          target_arch = "x86", target_arch = "x86_64",
          target_arch = "powerpc", target_arch = "powerpc64"))]
#[cfg(
    any(
        target_arch = "arm",
        target_arch = "aarch64",
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "powerpc",
        target_arch = "powerpc64"
    )
)]
#[macro_use]
extern crate stdsimd;

@@ -22,51 +31,30 @@ fn aarch64_linux() {
    println!("fp: {}", is_aarch64_feature_detected!("fp"));
    println!("fp16: {}", is_aarch64_feature_detected!("fp16"));
    println!("neon: {}", is_aarch64_feature_detected!("neon"));
    println!(
        "asimd: {}",
        is_aarch64_feature_detected!("asimd")
    );
    println!("asimd: {}", is_aarch64_feature_detected!("asimd"));
    println!("sve: {}", is_aarch64_feature_detected!("sve"));
    println!("crc: {}", is_aarch64_feature_detected!("crc"));
    println!(
        "crypto: {}",
        is_aarch64_feature_detected!("crypto")
    );
    println!("crypto: {}", is_aarch64_feature_detected!("crypto"));
    println!("lse: {}", is_aarch64_feature_detected!("lse"));
    println!("rdm: {}", is_aarch64_feature_detected!("rdm"));
    println!("rcpc: {}", is_aarch64_feature_detected!("rcpc"));
    println!(
        "dotprod: {}",
        is_aarch64_feature_detected!("dotprod")
    );
    println!("dotprod: {}", is_aarch64_feature_detected!("dotprod"));
}

#[test]
#[cfg(all(target_arch = "powerpc", target_os = "linux"))]
fn powerpc_linux() {
    println!(
        "altivec: {}",
        is_powerpc_feature_detected!("altivec")
    );
    println!("altivec: {}", is_powerpc_feature_detected!("altivec"));
    println!("vsx: {}", is_powerpc_feature_detected!("vsx"));
    println!(
        "power8: {}",
        is_powerpc_feature_detected!("power8")
    );
    println!("power8: {}", is_powerpc_feature_detected!("power8"));
}

#[test]
#[cfg(all(target_arch = "powerpc64", target_os = "linux"))]
fn powerpc64_linux() {
    println!(
        "altivec: {}",
        is_powerpc64_feature_detected!("altivec")
    );
    println!("altivec: {}", is_powerpc64_feature_detected!("altivec"));
    println!("vsx: {}", is_powerpc64_feature_detected!("vsx"));
    println!(
        "power8: {}",
        is_powerpc64_feature_detected!("power8")
    );
    println!("power8: {}", is_powerpc64_feature_detected!("power8"));
}

#[test]
@@ -76,54 +64,21 @@ fn x86_all() {
    println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
    println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
    println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
    println!(
        "sse4.1: {:?}",
        is_x86_feature_detected!("sse4.1")
    );
    println!(
        "sse4.2: {:?}",
        is_x86_feature_detected!("sse4.2")
    );
    println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
    println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
    println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
    println!("sha: {:?}", is_x86_feature_detected!("sha"));
    println!("avx: {:?}", is_x86_feature_detected!("avx"));
    println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
    println!(
        "avx512f {:?}",
        is_x86_feature_detected!("avx512f")
    );
    println!(
        "avx512cd {:?}",
        is_x86_feature_detected!("avx512cd")
    );
    println!(
        "avx512er {:?}",
        is_x86_feature_detected!("avx512er")
    );
    println!(
        "avx512pf {:?}",
        is_x86_feature_detected!("avx512pf")
    );
    println!(
        "avx512bw {:?}",
        is_x86_feature_detected!("avx512bw")
    );
    println!(
        "avx512dq {:?}",
        is_x86_feature_detected!("avx512dq")
    );
    println!(
        "avx512vl {:?}",
        is_x86_feature_detected!("avx512vl")
    );
    println!(
        "avx512_ifma {:?}",
        is_x86_feature_detected!("avx512ifma")
    );
    println!(
        "avx512_vbmi {:?}",
        is_x86_feature_detected!("avx512vbmi")
    );
    println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
    println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
    println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
    println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
    println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
    println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
    println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
    println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
    println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
    println!(
        "avx512_vpopcntdq {:?}",
        is_x86_feature_detected!("avx512vpopcntdq")
@@ -133,23 +88,11 @@ fn x86_all() {
    println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
    println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
    println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
    println!(
        "popcnt: {:?}",
        is_x86_feature_detected!("popcnt")
    );
    println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
    println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
    println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
    println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
    println!(
        "xsaveopt: {:?}",
        is_x86_feature_detected!("xsaveopt")
    );
    println!(
        "xsaves: {:?}",
        is_x86_feature_detected!("xsaves")
    );
    println!(
        "xsavec: {:?}",
        is_x86_feature_detected!("xsavec")
    );
    println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
    println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
    println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
}

@@ -14,10 +14,13 @@
|
||||
|
||||
#![feature(stdsimd)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(feature = "cargo-clippy",
|
||||
allow(result_unwrap_used, print_stdout, option_unwrap_used,
|
||||
shadow_reuse, cast_possible_wrap, cast_sign_loss,
|
||||
missing_docs_in_private_items))]
|
||||
#![cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(
|
||||
result_unwrap_used, print_stdout, option_unwrap_used, shadow_reuse,
|
||||
cast_possible_wrap, cast_sign_loss, missing_docs_in_private_items
|
||||
)
|
||||
)]
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
#[macro_use]
|
||||
@@ -68,7 +71,7 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
|
||||
#[target_feature(enable = "avx2")]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
unsafe fn hex_encode_avx2<'a>(
|
||||
mut src: &[u8], dst: &'a mut [u8]
|
||||
mut src: &[u8], dst: &'a mut [u8],
|
||||
) -> Result<&'a str, usize> {
|
||||
let ascii_zero = _mm256_set1_epi8(b'0' as i8);
|
||||
let nines = _mm256_set1_epi8(9);
|
||||
@@ -115,16 +118,14 @@ unsafe fn hex_encode_avx2<'a>(
|
||||
let i = i as usize;
|
||||
let _ = hex_encode_sse41(src, &mut dst[i * 2..]);
|
||||
|
||||
Ok(str::from_utf8_unchecked(
|
||||
&dst[..src.len() * 2 + i * 2],
|
||||
))
|
||||
Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
|
||||
}
|
||||
|
||||
// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41<'a>(
    mut src: &[u8], dst: &'a mut [u8]
    mut src: &[u8], dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    let ascii_zero = _mm_set1_epi8(b'0' as i8);
    let nines = _mm_set1_epi8(9);
@@ -157,10 +158,7 @@ unsafe fn hex_encode_sse41<'a>(
        let res2 = _mm_unpackhi_epi8(masked2, masked1);

        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
        _mm_storeu_si128(
            dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
            res2,
        );
        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
        src = &src[16..];
        i += 16;
    }
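
An aside on the stores above (not from this commit): one source vector holds
the hex digit of each byte's high nibble and the other the low nibble, and
the _mm_unpacklo_epi8/_mm_unpackhi_epi8 pair zips them so each input byte
contributes two adjacent output bytes. A scalar sketch of one 16-byte
iteration:

// Scalar model of the unpack step: interleave the per-nibble digit arrays
// so 16 input bytes expand to 32 output bytes per loop iteration.
fn interleave(hi_digits: [u8; 16], lo_digits: [u8; 16]) -> [u8; 32] {
    let mut out = [0u8; 32];
    for j in 0..16 {
        out[2 * j] = hi_digits[j];
        out[2 * j + 1] = lo_digits[j];
    }
    out
}
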
@@ -168,13 +166,11 @@ unsafe fn hex_encode_sse41<'a>(
    let i = i as usize;
    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(
        &dst[..src.len() * 2 + i * 2],
    ))
    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}

fn hex_encode_fallback<'a>(
    src: &[u8], dst: &'a mut [u8]
    src: &[u8], dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
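
The fallback is a plain table lookup per nibble; a self-contained sketch of
the same idea (not from this commit):

fn hex_byte(byte: u8) -> [u8; 2] {
    static TABLE: &[u8; 16] = b"0123456789abcdef";
    // High nibble first, then low nibble, matching normal hex output order.
    [TABLE[(byte >> 4) as usize], TABLE[(byte & 0xf) as usize]]
}

#[test]
fn hex_byte_works() {
    assert_eq!(hex_byte(0xab), *b"ab");
}
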
@@ -199,10 +195,7 @@ mod tests {
    fn test(input: &[u8], output: &str) {
        let tmp = || vec![0; input.len() * 2];

        assert_eq!(
            hex_encode_fallback(input, &mut tmp()).unwrap(),
            output
        );
        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
        assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -239,9 +232,7 @@ mod tests {
    fn odd() {
        test(
            &[0; 313],
            &iter::repeat('0')
                .take(313 * 2)
                .collect::<String>(),
            &iter::repeat('0').take(313 * 2).collect::<String>(),
        );
    }


@@ -5,9 +5,13 @@

#![cfg_attr(stdsimd_strict, deny(warnings))]
#![feature(stdsimd)]
#![cfg_attr(feature = "cargo-clippy",
            allow(similar_names, missing_docs_in_private_items,
                  shadow_reuse, print_stdout))]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(
        similar_names, missing_docs_in_private_items, shadow_reuse,
        print_stdout
    )
)]

extern crate stdsimd;
#[macro_use]
@@ -15,8 +19,6 @@ extern crate cfg_if;

use stdsimd::simd::*;



const PI: f64 = std::f64::consts::PI;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;
@@ -81,7 +83,7 @@ struct Body {

impl Body {
    fn new(
        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64,
    ) -> Self {
        Self {
            x: [x0, x1, x2],

@@ -64,7 +64,8 @@ macro_rules! is_aarch64_feature_detected {
#[unstable(feature = "stdsimd", issue = "27731")]
macro_rules! is_powerpc_feature_detected {
    ($t:tt) => {
        compile_error!(r#"
        compile_error!(
            r#"
is_powerpc_feature_detected can only be used on PowerPC targets.
You can prevent it from being used in other architectures by
guarding it behind a cfg(target_arch) as follows:
@@ -72,7 +73,8 @@ guarding it behind a cfg(target_arch) as follows:
    #[cfg(target_arch = "powerpc")] {
        if is_powerpc_feature_detected(...) { ... }
    }
"#)
"#
        )
    };
}

@@ -81,7 +83,8 @@ guarding it behind a cfg(target_arch) as follows:
#[unstable(feature = "stdsimd", issue = "27731")]
macro_rules! is_powerpc64_feature_detected {
    ($t:tt) => {
        compile_error!(r#"
        compile_error!(
            r#"
is_powerpc64_feature_detected can only be used on PowerPC64 targets.
You can prevent it from being used in other architectures by
guarding it behind a cfg(target_arch) as follows:
@@ -89,7 +92,8 @@ guarding it behind a cfg(target_arch) as follows:
    #[cfg(target_arch = "powerpc64")] {
        if is_powerpc64_feature_detected(...) { ... }
    }
"#)
"#
        )
    };
}


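The cfg guard these error messages recommend looks like this in practice; a
sketch (not from this commit), with the feature name "altivec" assumed only
for illustration:

fn print_powerpc_features() {
    // On non-PowerPC targets the whole block is compiled out, so the
    // compile_error! in the macro above never fires.
    #[cfg(target_arch = "powerpc")]
    {
        if is_powerpc_feature_detected!("altivec") {
            println!("altivec: available");
        }
    }
}
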
@@ -60,8 +60,8 @@ cfg_if! {
}
pub use self::arch::Feature;

mod cache;
mod bit;
mod cache;

cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {