reformat with latest rustfmt

gnzlbg
2018-06-06 00:17:14 +02:00
committed by gnzlbg
parent c2d60b18e4
commit c3d273c980
58 changed files with 1038 additions and 1710 deletions
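The diff below is mechanical: the newer rustfmt adds a trailing comma to the last argument of multi-line parameter lists, terminates macro arms with `};`, breaks long `#[cfg(...)]`/`#[cfg_attr(...)]` attributes across lines, and packs short literal lists onto as few lines as possible. A minimal before/after sketch of the dominant change (illustrative only, not a line from this commit):

    // Old rustfmt: no trailing comma after the last multi-line parameter.
    pub unsafe fn example(
        a: uint32x4_t, b: u32, c: uint32x4_t
    ) -> uint32x4_t {
        unimplemented!()
    }

    // New rustfmt: trailing comma added.
    pub unsafe fn example(
        a: uint32x4_t, b: u32, c: uint32x4_t,
    ) -> uint32x4_t {
        unimplemented!()
    }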

View File

@@ -16,36 +16,36 @@ extern "C" {
     fn vsha1h_u32_(hash_e: u32) -> u32;
     #[link_name = "llvm.aarch64.crypto.sha1su0"]
     fn vsha1su0q_u32_(
-        w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
+        w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1su1"]
     fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1c"]
     fn vsha1cq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1p"]
     fn vsha1pq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1m"]
     fn vsha1mq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256h"]
     fn vsha256hq_u32_(
-        hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256h2"]
     fn vsha256h2q_u32_(
-        hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
+        hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256su0"]
     fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256su1"]
     fn vsha256su1q_u32_(
-        tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
+        tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
     ) -> uint32x4_t;
 }
@@ -97,7 +97,7 @@ pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 {
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1c))]
 pub unsafe fn vsha1cq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1cq_u32_(hash_abcd, hash_e, wk)
 }
@@ -107,7 +107,7 @@ pub unsafe fn vsha1cq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1m))]
 pub unsafe fn vsha1mq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1mq_u32_(hash_abcd, hash_e, wk)
 }
@@ -117,7 +117,7 @@ pub unsafe fn vsha1mq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1p))]
 pub unsafe fn vsha1pq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1pq_u32_(hash_abcd, hash_e, wk)
 }
@@ -127,7 +127,7 @@ pub unsafe fn vsha1pq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1su0))]
 pub unsafe fn vsha1su0q_u32(
-    w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
+    w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
 ) -> uint32x4_t {
     vsha1su0q_u32_(w0_3, w4_7, w8_11)
 }
@@ -137,7 +137,7 @@ pub unsafe fn vsha1su0q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1su1))]
 pub unsafe fn vsha1su1q_u32(
-    tw0_3: uint32x4_t, w12_15: uint32x4_t
+    tw0_3: uint32x4_t, w12_15: uint32x4_t,
 ) -> uint32x4_t {
     vsha1su1q_u32_(tw0_3, w12_15)
 }
@@ -147,7 +147,7 @@ pub unsafe fn vsha1su1q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256h))]
 pub unsafe fn vsha256hq_u32(
-    hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha256hq_u32_(hash_abcd, hash_efgh, wk)
 }
@@ -157,7 +157,7 @@ pub unsafe fn vsha256hq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256h2))]
 pub unsafe fn vsha256h2q_u32(
-    hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
+    hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha256h2q_u32_(hash_efgh, hash_abcd, wk)
 }
@@ -167,7 +167,7 @@ pub unsafe fn vsha256h2q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256su0))]
 pub unsafe fn vsha256su0q_u32(
-    w0_3: uint32x4_t, w4_7: uint32x4_t
+    w0_3: uint32x4_t, w4_7: uint32x4_t,
 ) -> uint32x4_t {
     vsha256su0q_u32_(w0_3, w4_7)
 }
@@ -177,7 +177,7 @@ pub unsafe fn vsha256su0q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256su1))]
 pub unsafe fn vsha256su1q_u32(
-    tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
+    tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
 ) -> uint32x4_t {
     vsha256su1q_u32_(tw0_3, w8_11, w12_15)
 }
@@ -199,22 +199,8 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                124,
-                123,
-                124,
-                118,
-                124,
-                123,
-                124,
-                197,
-                124,
-                123,
-                124,
-                118,
-                124,
-                123,
-                124,
-                197
+                124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118,
+                124, 123, 124, 197
             )
         );
     }
@@ -229,22 +215,7 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                9,
-                213,
-                9,
-                251,
-                9,
-                213,
-                9,
-                56,
-                9,
-                213,
-                9,
-                251,
-                9,
-                213,
-                9,
-                56
+                9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56
             )
         );
     }
@@ -256,24 +227,7 @@ mod tests {
         let r: u8x16 = vaesmcq_u8(data).into_bits();
         assert_eq!(
             r,
-            u8x16::new(
-                3,
-                4,
-                9,
-                10,
-                15,
-                8,
-                21,
-                30,
-                3,
-                4,
-                9,
-                10,
-                15,
-                8,
-                21,
-                30
-            )
+            u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30)
         );
     }
@@ -285,22 +239,8 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                43,
-                60,
-                33,
-                50,
-                103,
-                80,
-                125,
-                70,
-                43,
-                60,
-                33,
-                50,
-                103,
-                80,
-                125,
-                70
+                43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80,
+                125, 70
             )
         );
     }
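For orientation, a hedged usage sketch of the SHA-1 intrinsics whose signatures are reformatted above; the wrapper name `sha1_choose_rounds` is hypothetical and not part of this commit:

    // Hypothetical helper (not in this commit): wraps the SHA1C instruction,
    // which applies four SHA-1 rounds with the 'choose' function. Assumes an
    // AArch64 target with the `crypto` feature and the coresimd types above
    // in scope.
    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "crypto")]
    unsafe fn sha1_choose_rounds(
        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
    ) -> uint32x4_t {
        vsha1cq_u32(hash_abcd, hash_e, wk)
    }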

View File

@@ -546,7 +546,6 @@ pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     vpmaxq_f64_(a, b)
 }
-
 #[cfg(test)]
 mod tests {
     use coresimd::aarch64::*;
@@ -800,20 +799,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpminq_s8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = i8x16::new(
-            1, -2, 3, -4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = i8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = i8x16::new(
-            -2, -4, 5, 7, 1, 3, 5, 7,
-            0, 2, 4, 6, 0, 2, 4, 6,
-        );
+        let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
         let r: i8x16 = vpminq_s8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -839,20 +829,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpminq_u8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = u8x16::new(
-            1, 2, 3, 4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = u8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = u8x16::new(
-            1, 3, 5, 7, 1, 3, 5, 7,
-            0, 2, 4, 6, 0, 2, 4, 6,
-        );
+        let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
         let r: u8x16 = vpminq_u8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -896,20 +877,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpmaxq_s8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = i8x16::new(
-            1, -2, 3, -4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = i8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = i8x16::new(
-            1, 3, 6, 8, 2, 4, 6, 8,
-            3, 5, 7, 9, 3, 5, 7, 9,
-        );
+        let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
         let r: i8x16 = vpmaxq_s8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -935,20 +907,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpmaxq_u8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = u8x16::new(
-            1, 2, 3, 4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = u8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = u8x16::new(
-            2, 4, 6, 8, 2, 4, 6, 8,
-            3, 5, 7, 9, 3, 5, 7, 9,
-        );
+        let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
         let r: u8x16 = vpmaxq_u8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
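A note on the expected vectors in these tests: vpminq/vpmaxq fold adjacent pairs, so the low half of the result holds the min/max of neighbouring lanes of `a` and the high half the same for `b`. A scalar model of the minimum case (sketch only, not part of the commit):

    // Scalar model of the pairwise minimum computed by vpminq_s8.
    fn pairwise_min_i8x16(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
        let mut r = [0i8; 16];
        for i in 0..8 {
            r[i] = core::cmp::min(a[2 * i], a[2 * i + 1]); // low half from `a`
            r[i + 8] = core::cmp::min(b[2 * i], b[2 * i + 1]); // high half from `b`
        }
        r
    }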

View File

@@ -19,11 +19,19 @@ pub use self::v7::*;
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon
 // features. Building ARM without neon produces incorrect codegen.
-#[cfg(any(target_arch = "aarch64",
+#[cfg(
+    any(
+        target_arch = "aarch64",
         all(target_feature = "v7", target_feature = "neon"),
-          dox))]
+        dox
+    )
+)]
 mod neon;
-#[cfg(any(target_arch = "aarch64",
+#[cfg(
+    any(
+        target_arch = "aarch64",
         all(target_feature = "v7", target_feature = "neon"),
-          dox))]
+        dox
+    )
+)]
 pub use self::neon::*;

View File

@@ -366,52 +366,82 @@ impl_from_bits_!(
 #[allow(improper_ctypes)]
 extern "C" {
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32"
+    )]
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
     fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8"
+    )]
     fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16"
+    )]
     fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32"
+    )]
     fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8"
+    )]
     fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16"
+    )]
     fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32"
+    )]
     fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32"
+    )]
     fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8"
+    )]
     fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16"
+    )]
     fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32"
+    )]
     fn vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8"
+    )]
     fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16"
+    )]
     fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32"
+    )]
     fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32"
+    )]
     fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
 }
@@ -916,7 +946,6 @@ pub unsafe fn vpmax_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t {
     vpmaxf_v2f32(a, b)
 }
-
 #[cfg(test)]
 mod tests {
     use coresimd::arm::*;

View File

@@ -75,8 +75,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i8x16:
-    vector_signed_char,
+    i8x16: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -114,8 +113,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u8x16:
-    vector_signed_char,
+    u8x16: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -135,11 +133,7 @@ impl_from_bits_!(
     vector_bool_short,
     vector_bool_int
 );
-impl_from_bits_!(
-    m8x16: vector_bool_char,
-    vector_bool_short,
-    vector_bool_int
-);
+impl_from_bits_!(m8x16: vector_bool_char, vector_bool_short, vector_bool_int);

 impl_from_bits_!(
     vector_signed_short: u64x2,
@@ -166,8 +160,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i16x8:
-    vector_signed_char,
+    i16x8: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -204,8 +197,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u16x8:
-    vector_signed_char,
+    u16x8: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -251,8 +243,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i32x4:
-    vector_signed_char,
+    i32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -289,8 +280,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u32x4:
-    vector_signed_char,
+    u32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -345,8 +335,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    f32x4:
-    vector_signed_char,
+    f32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -361,9 +350,13 @@ impl_from_bits_!(
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.ppc.altivec.vperm"]
-    fn vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int;
+    fn vperm(
+        a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+    ) -> vector_signed_int;
     #[link_name = "llvm.ppc.altivec.vmhaddshs"]
-    fn vmhaddshs(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short;
+    fn vmhaddshs(
+        a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+    ) -> vector_signed_short;
 }

 mod sealed {
@@ -373,7 +366,9 @@ mod sealed {
     #[inline]
     #[target_feature(enable = "altivec")]
     #[cfg_attr(test, assert_instr(vperm))]
-    unsafe fn vec_vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int {
+    unsafe fn vec_vperm(
+        a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+    ) -> vector_signed_int {
         vperm(a, b, c)
     }
@@ -703,7 +698,6 @@ where
     a.vec_add(b)
 }
-
 /// Endian-biased intrinsics
 #[cfg(target_endian = "little")]
 mod endian {
@@ -718,8 +712,10 @@ mod endian {
         // vperm has big-endian bias
         //
         // Xor the mask and flip the arguments
-        let d = u8x16::new(255, 255, 255, 255, 255, 255, 255, 255,
-                           255, 255, 255, 255, 255, 255, 255, 255).into_bits();
+        let d = u8x16::new(
+            255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+            255, 255, 255,
+        ).into_bits();
         let c = simd_xor(c, d);

         b.vec_vperm(a, c)
@@ -730,7 +726,9 @@ mod endian {
 #[inline]
 #[target_feature(enable = "altivec")]
 #[cfg_attr(test, assert_instr(vmhaddshs))]
-pub unsafe fn vec_madds(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short {
+pub unsafe fn vec_madds(
+    a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+) -> vector_signed_short {
     vmhaddshs(a, b, c)
 }
@@ -850,9 +848,20 @@ mod tests {
     #[simd_test(enable = "altivec")]
     unsafe fn test_vec_madds() {
-        let a: vector_signed_short = i16x8::new(0 * 256, 1 * 256, 2 * 256, 3 * 256, 4 * 256, 5 * 256, 6 * 256, 7 * 256).into_bits();
-        let b: vector_signed_short = i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
-        let c: vector_signed_short = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();
+        let a: vector_signed_short = i16x8::new(
+            0 * 256,
+            1 * 256,
+            2 * 256,
+            3 * 256,
+            4 * 256,
+            5 * 256,
+            6 * 256,
+            7 * 256,
+        ).into_bits();
+        let b: vector_signed_short =
+            i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
+        let c: vector_signed_short =
+            i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();
         let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21);
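The expected vector `d` follows from the vmhaddshs semantics, roughly `((a * b) >> 15) + c` with signed saturation (a rounding term, if present, does not change these inputs): with a[i] = i * 256 and b[i] = 256 the high product is (i * 65536) >> 15 = 2 * i, and adding c[i] = i gives 3 * i, i.e. 0, 3, 6, ..., 21. A scalar model of one lane (sketch; saturation omitted because these inputs cannot overflow):

    fn madds_lane(a: i16, b: i16, c: i16) -> i16 {
        ((((a as i32) * (b as i32)) >> 15) as i16).wrapping_add(c)
    }
    // madds_lane(3 * 256, 256, 3) == 9, matching the fourth lane of `d`.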

View File

@@ -1,6 +1,7 @@
 //! PowerPC 64
 //!
-//! The reference is the [64-Bit ELF V2 ABI Specification - Power Architecture].
+//! The reference is the [64-Bit ELF V2 ABI Specification - Power
+//! Architecture].
 //!
 //! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf

View File

@@ -63,8 +63,7 @@ impl_from_bits_!(
     vector_double
 );
 impl_from_bits_!(
-    i64x2:
-    vector_signed_char,
+    i64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -109,8 +108,7 @@ impl_from_bits_!(
     vector_double
 );
 impl_from_bits_!(
-    u64x2:
-    vector_signed_char,
+    u64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -155,8 +153,7 @@ impl_from_bits_!(
     vector_bool_long
 );
 impl_from_bits_!(
-    f64x2:
-    vector_signed_char,
+    f64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -234,8 +231,12 @@ mod sealed {
     // xxpermdi has an big-endian bias and extended mnemonics
     #[inline]
     #[target_feature(enable = "vsx")]
-    #[cfg_attr(all(test, target_endian="little"), assert_instr(xxmrgld, dm = 0x0))]
-    #[cfg_attr(all(test, target_endian="big"), assert_instr(xxspltd, dm = 0x0))]
+    #[cfg_attr(
+        all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0)
+    )]
+    #[cfg_attr(
+        all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0)
+    )]
     unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
         match dm & 0b11 {
             0 => simd_shuffle2(a, b, [0b00, 0b10]),

View File

@@ -165,7 +165,6 @@ macro_rules! impl_float_arithmetic_reductions {
     };
 }
-
 #[cfg(test)]
 macro_rules! test_int_arithmetic_reductions {
     ($id:ident, $elem_ty:ident) => {
@@ -237,10 +236,7 @@ macro_rules! test_float_arithmetic_reductions {
             let v = $id::splat(1 as $elem_ty);
             assert_eq!(v.sum(), $id::lanes() as $elem_ty);
             let v = alternating(2);
-            assert_eq!(
-                v.sum(),
-                ($id::lanes() / 2 + $id::lanes()) as $elem_ty
-            );
+            assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty);
         }
         #[test]
         fn product() {

View File

@@ -59,7 +59,6 @@ macro_rules! impl_float_math {
 macro_rules! test_float_math {
     ($id:ident, $elem_ty:ident) => {
-
         fn sqrt2() -> $elem_ty {
             match ::mem::size_of::<$elem_ty>() {
                 4 => 1.4142135 as $elem_ty,

View File

@@ -46,7 +46,7 @@ macro_rules! impl_load_store {
         /// undefined.
         #[inline]
         pub unsafe fn store_aligned_unchecked(
-            self, slice: &mut [$elem_ty]
+            self, slice: &mut [$elem_ty],
         ) {
             *(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) =
                 self;
@@ -59,7 +59,7 @@ macro_rules! impl_load_store {
         /// If `slice.len() < Self::lanes()` the behavior is undefined.
         #[inline]
         pub unsafe fn store_unaligned_unchecked(
-            self, slice: &mut [$elem_ty]
+            self, slice: &mut [$elem_ty],
         ) {
             let target_ptr =
                 slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8;
@@ -121,7 +121,7 @@ macro_rules! impl_load_store {
         /// If `slice.len() < Self::lanes()` the behavior is undefined.
         #[inline]
         pub unsafe fn load_unaligned_unchecked(
-            slice: &[$elem_ty]
+            slice: &[$elem_ty],
         ) -> Self {
             use mem::size_of;
             let target_ptr =
@@ -238,7 +238,8 @@ macro_rules! test_load_store {
                 data: [0 as $elem_ty; 2 * $id::lanes()],
             };
             // offset the aligned data by one byte:
-            let s: &mut [u8; 2 * $id::lanes()
+            let s: &mut [u8; 2
+                * $id::lanes()
                 * mem::size_of::<$elem_ty>()] =
                 mem::transmute(&mut aligned.data);
             let s: &mut [$elem_ty] = slice::from_raw_parts_mut(
@@ -296,7 +297,8 @@ macro_rules! test_load_store {
                 data: [0 as $elem_ty; 2 * $id::lanes()],
             };
             // offset the aligned data by one byte:
-            let s: &[u8; 2 * $id::lanes()
+            let s: &[u8; 2
+                * $id::lanes()
                 * mem::size_of::<$elem_ty>()] =
                 mem::transmute(&aligned.data);
             let s: &[$elem_ty] = slice::from_raw_parts(
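For reference, a hedged usage sketch of the unchecked load/store API reformatted above (vector type and values illustrative; the caller must uphold the `slice.len() >= Self::lanes()` contract or behavior is undefined):

    let mut buf = [0.0f32; 4];
    let v = f32x4::splat(1.0);
    unsafe {
        // Store the four lanes into the buffer, then read them back.
        v.store_unaligned_unchecked(&mut buf);
        let w = f32x4::load_unaligned_unchecked(&buf);
        assert_eq!(v, w);
    }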

View File

@@ -41,18 +41,7 @@ macro_rules! impl_shifts {
 macro_rules! impl_all_scalar_shifts {
     ($id:ident, $elem_ty:ident) => {
         impl_shifts!(
-            $id,
-            $elem_ty,
-            u8,
-            u16,
-            u32,
-            u64,
-            usize,
-            i8,
-            i16,
-            i32,
-            i64,
-            isize
+            $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
         );
     };
 }
@@ -125,18 +114,7 @@ macro_rules! test_shift_ops {
 macro_rules! test_all_scalar_shift_ops {
     ($id:ident, $elem_ty:ident) => {
         test_shift_ops!(
-            $id,
-            $elem_ty,
-            u8,
-            u16,
-            u32,
-            u64,
-            usize,
-            i8,
-            i16,
-            i32,
-            i64,
-            isize
+            $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
         );
     };
 }

View File

@@ -31,7 +31,7 @@ macro_rules! impl_fabs {
                 unsafe { $fn(self) }
             }
         }
-    }
+    };
 }
 impl_fabs!(f32x2: abs_v2f32);

View File

@@ -31,7 +31,7 @@ macro_rules! impl_fcos {
                 unsafe { $fn(self) }
             }
         }
-    }
+    };
 }
 impl_fcos!(f32x2: cos_v2f32);

View File

@@ -31,7 +31,7 @@ macro_rules! impl_fma {
                 unsafe { $fn(self, y, z) }
             }
         }
-    }
+    };
 }
 impl_fma!(f32x2: fma_v2f32);

View File

@@ -25,11 +25,13 @@ macro_rules! default_impl {
         impl All for $id {
             #[inline]
             unsafe fn all(self) -> bool {
-                #[cfg(not(target_arch = "aarch64"))] {
+                #[cfg(not(target_arch = "aarch64"))]
+                {
                     use coresimd::simd_llvm::simd_reduce_all;
                     simd_reduce_all(self)
                 }
-                #[cfg(target_arch = "aarch64")] {
+                #[cfg(target_arch = "aarch64")]
+                {
                     // FIXME: Broken on AArch64
                     // https://bugs.llvm.org/show_bug.cgi?id=36796
                     self.and()
@@ -40,11 +42,13 @@ macro_rules! default_impl {
         impl Any for $id {
             #[inline]
             unsafe fn any(self) -> bool {
-                #[cfg(not(target_arch = "aarch64"))] {
+                #[cfg(not(target_arch = "aarch64"))]
+                {
                     use coresimd::simd_llvm::simd_reduce_any;
                     simd_reduce_any(self)
                 }
-                #[cfg(target_arch = "aarch64")] {
+                #[cfg(target_arch = "aarch64")]
+                {
                     // FIXME: Broken on AArch64
                     // https://bugs.llvm.org/show_bug.cgi?id=36796
                     self.or()
@@ -63,7 +67,12 @@ macro_rules! default_impl {
 // or floating point vectors, we can't currently work around this yet. The
 // performance impact for this shouldn't be large, but this is filled as:
 // https://bugs.llvm.org/show_bug.cgi?id=37087
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse2"
+    )
+)]
 macro_rules! x86_128_sse2_movemask_impl {
     ($id:ident) => {
         impl All for $id {
@@ -71,13 +80,15 @@ macro_rules! x86_128_sse2_movemask_impl {
             #[target_feature(enable = "sse2")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                use coresimd::arch::x86::_mm_movemask_epi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
-                // _mm_movemask_epi8(a) creates a 16bit mask containing the most
-                // significant bit of each byte of `a`. If all bits are set,
-                // then all 16 lanes of the mask are true.
-                _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32
+                use coresimd::arch::x86_64::_mm_movemask_epi8;
+                // _mm_movemask_epi8(a) creates a 16bit mask containing the
+                // most significant bit of each byte of `a`. If all
+                // bits are set, then all 16 lanes of the mask are
+                // true.
+                _mm_movemask_epi8(::mem::transmute(self))
+                    == u16::max_value() as i32
             }
         }
         impl Any for $id {
@@ -85,14 +96,14 @@ macro_rules! x86_128_sse2_movemask_impl {
             #[target_feature(enable = "sse2")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                use coresimd::arch::x86::_mm_movemask_epi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+                use coresimd::arch::x86_64::_mm_movemask_epi8;
                 _mm_movemask_epi8(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }

 // On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256.
@@ -103,7 +114,12 @@ macro_rules! x86_128_sse2_movemask_impl {
 // integer or floating point vectors, we can't currently work around this yet.
 //
 // TODO: investigate perf impact and fill LLVM bugs as necessary.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))]
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "avx"
+    )
+)]
 macro_rules! x86_256_avx_test_impl {
     ($id:ident) => {
         impl All for $id {
@@ -111,11 +127,13 @@ macro_rules! x86_256_avx_test_impl {
             #[target_feature(enable = "avx")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm256_testc_si256;
+                use coresimd::arch::x86::_mm256_testc_si256;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm256_testc_si256;
-                _mm256_testc_si256(::mem::transmute(self),
-                                   ::mem::transmute($id::splat(true))) != 0
+                use coresimd::arch::x86_64::_mm256_testc_si256;
+                _mm256_testc_si256(
+                    ::mem::transmute(self),
+                    ::mem::transmute($id::splat(true)),
+                ) != 0
             }
         }
         impl Any for $id {
@@ -123,20 +141,27 @@ macro_rules! x86_256_avx_test_impl {
             #[target_feature(enable = "avx")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm256_testz_si256;
+                use coresimd::arch::x86::_mm256_testz_si256;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm256_testz_si256;
-                _mm256_testz_si256(::mem::transmute(self),
-                                   ::mem::transmute(self)) == 0
+                use coresimd::arch::x86_64::_mm256_testz_si256;
+                _mm256_testz_si256(
+                    ::mem::transmute(self),
+                    ::mem::transmute(self),
+                ) == 0
             }
         }
-    }
+    };
 }

-// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by executing
-// the algorithm for 128-bit on the higher and lower elements of the vector
-// independently.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by
+// executing the algorithm for 128-bit on the higher and lower elements of the
+// vector independently.
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse2"
+    )
+)]
 macro_rules! x86_256_sse2_impl {
     ($id:ident, $v128:ident) => {
         impl All for $id {
@@ -146,7 +171,7 @@ macro_rules! x86_256_sse2_impl {
             unsafe {
                 union U {
                     halves: ($v128, $v128),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
                 halves.0.all() && halves.1.all()
@@ -160,14 +185,14 @@ macro_rules! x86_256_sse2_impl {
             unsafe {
                 union U {
                     halves: ($v128, $v128),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
                 halves.0.any() || halves.1.any()
             }
         }
     }
-    }
+    };
 }

 // Implementation for 64-bit wide masks on x86.
@@ -179,13 +204,14 @@ macro_rules! x86_64_mmx_movemask_impl {
             #[target_feature(enable = "mmx")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                use coresimd::arch::x86::_mm_movemask_pi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                use coresimd::arch::x86_64::_mm_movemask_pi8;
                 // _mm_movemask_pi8(a) creates an 8bit mask containing the most
                 // significant bit of each byte of `a`. If all bits are set,
                 // then all 8 lanes of the mask are true.
-                _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32
+                _mm_movemask_pi8(::mem::transmute(self))
+                    == u8::max_value() as i32
             }
         }
         impl Any for $id {
@@ -193,14 +219,14 @@ macro_rules! x86_64_mmx_movemask_impl {
             #[target_feature(enable = "mmx")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                use coresimd::arch::x86::_mm_movemask_pi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                use coresimd::arch::x86_64::_mm_movemask_pi8;
                 _mm_movemask_pi8(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }

 // Implementation for 128-bit wide masks on x86
@@ -214,7 +240,7 @@ macro_rules! x86_128_impl {
             default_impl!($id);
         }
     }
-    }
+    };
 }

 // Implementation for 256-bit wide masks on x86
@@ -230,22 +256,25 @@ macro_rules! x86_256_impl {
             default_impl!($id);
         }
     }
-    }
+    };
 }

 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x2_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // pmin((a, b), (-,-)) => (b, -).0 => b
-                let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized()));
+                let tmp: $id =
+                    transmute($vpmin(transmute(self), ::mem::uninitialized()));
                 tmp.extract(0)
             }
         }
@@ -253,27 +282,30 @@ macro_rules! arm_64_x2_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // pmax((a, b), (-,-)) => (b, -).0 => b
-                let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized()));
+                let tmp: $id =
+                    transmute($vpmax(transmute(self), ::mem::uninitialized()));
                 tmp.extract(0)
             }
         }
-    }
+    };
 }

 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x4_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
                 let tmp = $vpmin(transmute(self), ::mem::uninitialized());
                 // tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
@@ -285,8 +317,8 @@ macro_rules! arm_64_x4_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
                 let tmp = $vpmax(transmute(self), ::mem::uninitialized());
                 // tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
@@ -294,20 +326,22 @@ macro_rules! arm_64_x4_v7_neon_impl {
                 tmp.extract(0)
             }
         }
-    }
+    };
 }

 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x8_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // tmp = pmin(
                 //     (a, b, c, d, e, f, g, h),
                 //     (-, -, -, -, -, -, -, -)
@@ -330,8 +364,8 @@ macro_rules! arm_64_x8_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // tmp = pmax(
                 //     (a, b, c, d, e, f, g, h),
                 //     (-, -, -, -, -, -, -, -)
@@ -350,28 +384,32 @@ macro_rules! arm_64_x8_v7_neon_impl {
                 tmp.extract(0)
             }
         }
-    }
+    };
 }

 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with
 // more than two elements.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_128_v7_neon_impl {
     ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 union U {
                     halves: ($half, $half),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
-                let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+                let h: $half = transmute($vpmin(
+                    transmute(halves.0),
+                    transmute(halves.1),
+                ));
                 h.all()
             }
         }
@@ -379,18 +417,21 @@ macro_rules! arm_128_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 union U {
                     halves: ($half, $half),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
-                let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+                let h: $half = transmute($vpmax(
+                    transmute(halves.0),
+                    transmute(halves.1),
+                ));
                 h.any()
             }
         }
-    }
+    };
 }

 // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -402,7 +443,7 @@ macro_rules! aarch64_128_neon_impl {
             #[inline]
             #[target_feature(enable = "neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::aarch64::$vmin;
+                use coresimd::arch::aarch64::$vmin;
                 $vmin(::mem::transmute(self)) != 0
             }
         }
@@ -410,11 +451,11 @@ macro_rules! aarch64_128_neon_impl {
             #[inline]
             #[target_feature(enable = "neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::aarch64::$vmax;
+                use coresimd::arch::aarch64::$vmax;
                 $vmax(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }

 // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -431,9 +472,12 @@ macro_rules! aarch64_64_neon_impl {
             unsafe fn all(self) -> bool {
                 union U {
                     halves: ($id, $id),
-                    vec: $vec128
+                    vec: $vec128,
                 }
-                U { halves: (self, self) }.vec.all()
+                U {
+                    halves: (self, self),
+                }.vec
+                    .all()
             }
         }
         impl Any for $id {
@@ -442,12 +486,15 @@ macro_rules! aarch64_64_neon_impl {
             unsafe fn any(self) -> bool {
                 union U {
                     halves: ($id, $id),
-                    vec: $vec128
+                    vec: $vec128,
                 }
-                U { halves: (self, self) }.vec.any()
+                U {
+                    halves: (self, self),
+                }.vec
+                    .any()
             }
         }
-    }
+    };
 }

 macro_rules! impl_mask_all_any {
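The comments inside the macros above describe the strategy; as a standalone illustration of the SSE2 movemask trick (hypothetical free function, not from this commit):

    // Sketch: `all` for a 16-lane 8-bit mask via SSE2, assuming x86_64 and
    // that every mask lane is either all-ones (true) or all-zeros (false).
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse2")]
    unsafe fn mask8x16_all(m: m8x16) -> bool {
        use coresimd::arch::x86_64::_mm_movemask_epi8;
        // movemask gathers the MSB of each byte; 16 true lanes => 0xFFFF.
        _mm_movemask_epi8(::mem::transmute(m)) == u16::max_value() as i32
    }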

View File

@@ -5,8 +5,8 @@ pub mod wrapping;
 pub mod masks_reductions;

-pub mod sqrt;
 pub mod abs;
+pub mod cos;
 pub mod fma;
 pub mod sin;
-pub mod cos;
+pub mod sqrt;

View File

@@ -31,7 +31,7 @@ macro_rules! impl_fsin {
                 unsafe { $fn(self) }
             }
         }
-    }
+    };
 }
 impl_fsin!(f32x2: sin_v2f32);

View File

@@ -31,7 +31,7 @@ macro_rules! impl_fsqrt {
                 unsafe { $fn(self) }
             }
         }
-    }
+    };
 }
 impl_fsqrt!(f32x2: sqrt_v2f32);

View File

@@ -66,8 +66,12 @@ where
     U: FromBits<T>,
 {
     // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
-    #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
-    #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+    #[cfg_attr(
+        any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+    )]
+    #[cfg_attr(
+        not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+    )]
     fn into_bits(self) -> U {
         debug_assert!(::mem::size_of::<Self>() == ::mem::size_of::<U>());
         U::from_bits(self)
@@ -77,8 +81,12 @@ where
 // FromBits (and thus IntoBits) is reflexive.
 impl<T> FromBits<T> for T {
     // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
-    #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
-    #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+    #[cfg_attr(
+        any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+    )]
+    #[cfg_attr(
+        not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+    )]
     fn from_bits(t: Self) -> Self {
         t
     }

View File

@@ -110,9 +110,11 @@ macro_rules! from_bits_x86 {
     };
 }

-#[cfg(all(target_arch = "arm", target_feature = "neon",
-          target_feature = "v7"))]
-use coresimd::arch::arm::{// FIXME: float16x8_t,
+#[cfg(
+    all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
+)]
+use coresimd::arch::arm::{
+    // FIXME: float16x8_t,
     float32x4_t,
     int16x8_t,
     int32x4_t,
@@ -123,10 +125,12 @@ use coresimd::arch::arm::{// FIXME: float16x8_t,
     uint16x8_t,
     uint32x4_t,
     uint64x2_t,
-    uint8x16_t};
+    uint8x16_t,
+};
 #[cfg(target_arch = "aarch64")]
-use coresimd::arch::aarch64::{// FIXME: float16x8_t,
+use coresimd::arch::aarch64::{
+    // FIXME: float16x8_t,
     float32x4_t,
     float64x2_t,
     int16x8_t,
@@ -138,13 +142,21 @@ use coresimd::arch::aarch64::{// FIXME: float16x8_t,
     uint16x8_t,
     uint32x4_t,
     uint64x2_t,
-    uint8x16_t};
+    uint8x16_t,
+};

 macro_rules! from_bits_arm {
     ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
-        #[cfg(any(all(target_arch = "arm", target_feature = "neon",
-                      target_feature = "v7"),
-                  target_arch = "aarch64"))]
+        #[cfg(
+            any(
+                all(
+                    target_arch = "arm",
+                    target_feature = "neon",
+                    target_feature = "v7"
+                ),
+                target_arch = "aarch64"
+            )
+        )]
         impl_from_bits_!(
             $id: int8x16_t,
             uint8x16_t,
@@ -182,12 +194,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u64x2, u64, u64x2_from_bits_x86);
-from_bits_arm!(
-    u64x2,
-    u64,
-    u64x2_from_bits_arm,
-    u64x2_from_bits_aarch64
-);
+from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64);

 impl_from_bits!(
     i64x2: i64,
@@ -207,12 +214,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i64x2, i64, i64x2_from_bits_x86);
-from_bits_arm!(
-    i64x2,
-    i64,
-    i64x2_from_bits_arm,
-    i64x2_from_bits_aarch64
-);
+from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64);

 impl_from_bits!(
     f64x2: f64,
@@ -232,12 +234,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(f64x2, f64, f64x2_from_bits_x86);
-from_bits_arm!(
-    f64x2,
-    f64,
-    f64x2_from_bits_arm,
-    f64x2_from_bits_aarch64
-);
+from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64);

 impl_from_bits!(
     u32x4: u32,
@@ -257,12 +254,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u32x4, u32, u32x4_from_bits_x86);
-from_bits_arm!(
-    u32x4,
-    u32,
-    u32x4_from_bits_arm,
-    u32x4_from_bits_aarch64
-);
+from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64);

 impl_from_bits!(
     i32x4: i32,
@@ -282,12 +274,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i32x4, i32, i32x4_from_bits_x86);
-from_bits_arm!(
-    i32x4,
-    i32,
-    i32x4_from_bits_arm,
-    i32x4_from_bits_aarch64
-);
+from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64);

 impl_from_bits!(
     f32x4: f32,
@@ -307,12 +294,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(f32x4, f32, f32x4_from_bits_x86);
-from_bits_arm!(
-    f32x4,
-    f32,
-    f32x4_from_bits_arm,
-    f32x4_from_bits_aarch64
-);
+from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64);

 impl_from_bits!(
     u16x8: u16,
@@ -332,12 +314,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u16x8, u16, u16x8_from_bits_x86);
-from_bits_arm!(
-    u16x8,
-    u16,
-    u16x8_from_bits_arm,
-    u16x8_from_bits_aarch64
-);
+from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64);

 impl_from_bits!(
     i16x8: i16,
@@ -357,12 +334,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i16x8, i16, i16x8_from_bits_x86);
-from_bits_arm!(
-    i16x8,
-    i16,
-    i16x8_from_bits_arm,
-    i16x8_from_bits_aarch64
-);
+from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64);

 impl_from_bits!(
     u8x16: u8,
@@ -382,12 +354,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u8x16, u8, u8x16_from_bits_x86);
-from_bits_arm!(
-    u8x16,
-    u8,
-    u8x16_from_bits_arm,
-    u8x16_from_bits_aarch64
-);
+from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64);

 impl_from_bits!(
     i8x16: i8,
@@ -407,12 +374,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i8x16, i8, i8x16_from_bits_x86);
-from_bits_arm!(
-    i8x16,
-    i8,
-    i8x16_from_bits_arm,
-    i8x16_from_bits_aarch64
-);
+from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64);

 impl_from!(
     f64x2: f64,
@@ -552,31 +514,37 @@ impl_from!(
m8x8 m8x8
); );
impl_from!(u8x16: u8, u8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, i8x16, m8x16); impl_from!(
impl_from!(i8x16: i8, i8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, u8x16, m8x16); u8x16: u8,
u8x16_from,
test_v128 | i32x16,
u32x16,
f32x16,
m1x16,
i16x16,
u16x16,
m16x16,
i8x16,
m8x16
);
impl_from!(
i8x16: i8,
i8x16_from,
test_v128 | i32x16,
u32x16,
f32x16,
m1x16,
i16x16,
u16x16,
m16x16,
u8x16,
m8x16
);
impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16); impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16);
impl_from!( impl_from!(m16x8: i16, m16x8_from, test_v128 | m1x8, m32x8, m8x8);
m16x8: i16,
m16x8_from,
test_v128 | m1x8,
m32x8,
m8x8
);
impl_from!( impl_from!(m32x4: i32, m32x4_from, test_v128 | m64x4, m16x4, m8x4);
m32x4: i32,
m32x4_from,
test_v128 | m64x4,
m16x4,
m8x4
);
impl_from!( impl_from!(m64x2: i64, m64x2_from, test_v128 | m32x2, m16x2, m8x2);
m64x2: i64,
m64x2_from,
test_v128 | m32x2,
m16x2,
m8x2
);
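For readers skimming the macro soup above: every `from_bits_arm!`/`from_bits_x86!` invocation wires up a bitwise `From` conversion between same-width vector types, i.e. the bits are reinterpreted, not converted lane by lane. A minimal sketch of the underlying idea, using a plain `core::mem::transmute` on arrays rather than the actual generated impls (the helper name is hypothetical, for illustration only):

use core::mem;

// Hypothetical helper; the real code generates `impl From<uint32x4_t>
// for u64x2` and friends via `impl_from_bits_!`.
fn u32x4_bits_as_u64x2(x: [u32; 4]) -> [u64; 2] {
    // Both types are 16 bytes wide, so the bit pattern carries over
    // verbatim; only the lane boundaries move.
    unsafe { mem::transmute(x) }
}

fn main() {
    // On a little-endian target, adjacent u32 lanes pair up into u64 lanes.
    assert_eq!(u32x4_bits_as_u64x2([1, 0, 2, 0]), [1_u64, 2_u64]);
}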
@@ -57,10 +57,4 @@ impl_from!(
     m8x2
 );
-impl_from!(
-    m8x2: i8,
-    m8x2_from,
-    test_v16 | m64x2,
-    m32x2,
-    m16x2
-);
+impl_from!(m8x2: i8, m8x2_from, test_v16 | m64x2, m32x2, m16x2);
@@ -465,25 +465,8 @@ impl_from!(
 impl_from!(m8x32: i8, m8x32_from, test_v256 | m1x32);
-impl_from!(
-    m16x16: i16,
-    m16x16_from,
-    test_v256 | m1x16,
-    m8x16
-);
+impl_from!(m16x16: i16, m16x16_from, test_v256 | m1x16, m8x16);
-impl_from!(
-    m32x8: i32,
-    m32x8_from,
-    test_v256 | m1x8,
-    m16x8,
-    m8x8
-);
+impl_from!(m32x8: i32, m32x8_from, test_v256 | m1x8, m16x8, m8x8);
-impl_from!(
-    m64x4: i64,
-    m64x4_from,
-    test_v256 | m32x4,
-    m16x4,
-    m8x4
-);
+impl_from!(m64x4: i64, m64x4_from, test_v256 | m32x4, m16x4, m8x4);
@@ -151,18 +151,6 @@ impl_from!(
     m8x4
 );
-impl_from!(
-    m8x4: i8,
-    m8x4_from,
-    test_v32 | m64x4,
-    m32x4,
-    m16x4
-);
+impl_from!(m8x4: i8, m8x4_from, test_v32 | m64x4, m32x4, m16x4);
-impl_from!(
-    m16x2: i16,
-    m16x2_from,
-    test_v32 | m64x2,
-    m32x2,
-    m8x2
-);
+impl_from!(m16x2: i16, m16x2_from, test_v32 | m64x2, m32x2, m8x2);
@@ -446,17 +446,6 @@ impl_from!(u8x64: u8, u8x64_from, test_v512 | i8x64, m1x64);
 impl_from!(m1x32: i16, m1x32_from, test_v512 | m8x32);
-impl_from!(
-    m1x16: i32,
-    m1x16_from,
-    test_v512 | m16x16,
-    m8x16
-);
+impl_from!(m1x16: i32, m1x16_from, test_v512 | m16x16, m8x16);
-impl_from!(
-    m1x8: i64,
-    m1x8_from,
-    test_v512 | m32x8,
-    m16x8,
-    m8x8
-);
+impl_from!(m1x8: i64, m1x8_from, test_v512 | m32x8, m16x8, m8x8);
@@ -83,9 +83,11 @@ macro_rules! from_bits_x86 {
     };
 }
-#[cfg(all(target_arch = "arm", target_feature = "neon",
-          target_feature = "v7"))]
-use coresimd::arch::arm::{// FIXME: float16x4_t,
+#[cfg(
+    all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
+)]
+use coresimd::arch::arm::{
+    // FIXME: float16x4_t,
     float32x2_t,
     int16x4_t,
     int32x2_t,
@@ -96,10 +98,12 @@ use coresimd::arch::arm::{// FIXME: float16x4_t,
     uint16x4_t,
     uint32x2_t,
     uint64x1_t,
-    uint8x8_t};
+    uint8x8_t,
+};
 #[cfg(target_arch = "aarch64")]
-use coresimd::arch::aarch64::{// FIXME: float16x4_t,
+use coresimd::arch::aarch64::{
+    // FIXME: float16x4_t,
     float32x2_t,
     float64x1_t,
     int16x4_t,
@@ -111,13 +115,21 @@ use coresimd::arch::aarch64::{// FIXME: float16x4_t,
     uint16x4_t,
     uint32x2_t,
     uint64x1_t,
-    uint8x8_t};
+    uint8x8_t,
+};
 macro_rules! from_bits_arm {
     ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
-        #[cfg(any(all(target_arch = "arm", target_feature = "neon",
-                      target_feature = "v7"),
-                  target_arch = "aarch64"))]
+        #[cfg(
+            any(
+                all(
+                    target_arch = "arm",
+                    target_feature = "neon",
+                    target_feature = "v7"
+                ),
+                target_arch = "aarch64"
+            )
+        )]
         impl_from_bits_!(
             $id: int64x1_t,
             uint64x1_t,
@@ -151,12 +163,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u32x2, u32, u32x2_from_bits_x86);
-from_bits_arm!(
-    u32x2,
-    u32,
-    u32x2_from_bits_arm,
-    u32x2_from_bits_aarch64
-);
+from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64);
 impl_from_bits!(
     i32x2: i32,
@@ -172,12 +179,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i32x2, i32, i32x2_from_bits_x86);
-from_bits_arm!(
-    i32x2,
-    i32,
-    i32x2_from_bits_arm,
-    i32x2_from_bits_aarch64
-);
+from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64);
 impl_from_bits!(
     f32x2: f32,
@@ -193,12 +195,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(f32x2, f32, f32x2_from_bits_x86);
-from_bits_arm!(
-    f32x2,
-    f32,
-    f32x2_from_bits_arm,
-    f32x2_from_bits_aarch64
-);
+from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64);
 impl_from_bits!(
     u16x4: u16,
@@ -213,12 +210,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u16x4, u16, u16x4_from_bits_x86);
-from_bits_arm!(
-    u16x4,
-    u16,
-    u16x4_from_bits_arm,
-    u16x4_from_bits_aarch64
-);
+from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64);
 impl_from_bits!(
     i16x4: i16,
@@ -233,12 +225,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i16x4, i16, i16x4_from_bits_x86);
-from_bits_arm!(
-    i16x4,
-    i16,
-    i16x4_from_bits_arm,
-    i16x4_from_bits_aarch64
-);
+from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64);
 impl_from_bits!(
     u8x8: u8,
@@ -253,12 +240,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u8x8, u8, u8x8_from_bits_x86);
-from_bits_arm!(
-    u8x8,
-    u8,
-    u8x8_from_bits_arm,
-    u8x8_from_bits_aarch64
-);
+from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64);
 impl_from_bits!(
     i8x8: i8,
@@ -273,12 +255,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i8x8, i8, i8x8_from_bits_x86);
-from_bits_arm!(
-    i8x8,
-    i8,
-    i8x8_from_bits_arm,
-    i8x8_from_bits_aarch64
-);
+from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64);
 impl_from!(
     f32x2: f32,
@@ -404,26 +381,8 @@ impl_from!(
     m8x8
 );
-impl_from!(
-    m8x8: i8,
-    m8x8_from,
-    test_v64 | m1x8,
-    m32x8,
-    m16x8
-);
+impl_from!(m8x8: i8, m8x8_from, test_v64 | m1x8, m32x8, m16x8);
-impl_from!(
-    m16x4: i16,
-    m16x4_from,
-    test_v64 | m64x4,
-    m32x4,
-    m8x4
-);
+impl_from!(m16x4: i16, m16x4_from, test_v64 | m64x4, m32x4, m8x4);
-impl_from!(
-    m32x2: i32,
-    m32x2_from,
-    test_v64 | m64x2,
-    m16x2,
-    m8x2
-);
+impl_from!(m32x2: i32, m32x2_from, test_v64 | m64x2, m16x2, m8x2);
@@ -1387,7 +1387,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_ps(
-    a: __m256, b: __m256, imm8: i32
+    a: __m256, b: __m256, imm8: i32,
 ) -> __m256 {
     macro_rules! call {
         ($imm8:expr) => {
@@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_permute2f128_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_pd(
-    a: __m256d, b: __m256d, imm8: i32
+    a: __m256d, b: __m256d, imm8: i32,
 ) -> __m256d {
     macro_rules! call {
         ($imm8:expr) => {
@@ -1427,7 +1427,7 @@ pub unsafe fn _mm256_permute2f128_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_si256(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i32x8();
     let b = b.as_i32x8();
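The `imm8` of the three `permute2f128` intrinsics selects whole 128-bit halves rather than individual elements: bits 1:0 pick the source of the result's low half and bits 5:4 the high half (0/1 = halves of `a`, 2/3 = halves of `b`; bits 3 and 7 can zero a half outright). A hedged usage sketch, following the `i32` signatures shown in this hunk:

use std::arch::x86_64::*;

// Sketch only: swap the two 128-bit halves of `a`.
unsafe fn swap_halves(a: __m256) -> __m256 {
    // Selector 1 routes a's high half to the low lane; selector 0 in
    // bits 5:4 routes a's low half to the high lane.
    _mm256_permute2f128_ps(a, a, 0x01)
}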
@@ -1529,7 +1529,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_pd(
-    a: __m256d, b: __m128d, imm8: i32
+    a: __m256d, b: __m128d, imm8: i32,
 ) -> __m256d {
     match imm8 & 1 {
         0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
@@ -1547,7 +1547,7 @@ pub unsafe fn _mm256_insertf128_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_si256(
-    a: __m256i, b: __m128i, imm8: i32
+    a: __m256i, b: __m128i, imm8: i32,
 ) -> __m256i {
     let b = _mm256_castsi128_si256(b).as_i64x4();
     let dst: i64x4 = match imm8 & 1 {
@@ -1567,11 +1567,7 @@ pub unsafe fn _mm256_insertf128_si256(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
-    mem::transmute(simd_insert(
-        a.as_i8x32(),
-        (index as u32) & 31,
-        i,
-    ))
+    mem::transmute(simd_insert(a.as_i8x32(), (index as u32) & 31, i))
 }
 /// Copy `a` to result, and insert the 16-bit integer `i` into result
@@ -1584,11 +1580,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i {
-    mem::transmute(simd_insert(
-        a.as_i16x16(),
-        (index as u32) & 15,
-        i,
-    ))
+    mem::transmute(simd_insert(a.as_i16x16(), (index as u32) & 15, i))
 }
 /// Copy `a` to result, and insert the 32-bit integer `i` into result
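The collapsed bodies make the semantics easy to read off: `_mm256_insert_epi8`/`_mm256_insert_epi16` return a copy of `a` with the single lane selected by the masked `index` replaced by `i`, and nothing else changes. A small sketch:

use std::arch::x86_64::*;

unsafe fn tag_lane() -> __m256i {
    // All sixteen 16-bit lanes start at zero; only lane 3 becomes 42.
    _mm256_insert_epi16(_mm256_set1_epi16(0), 42, 3)
}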
@@ -1790,7 +1782,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_pd(
-    mem_addr: *const f64, mask: __m256i
+    mem_addr: *const f64, mask: __m256i,
 ) -> __m256d {
     maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
 }
@@ -1804,7 +1796,7 @@ pub unsafe fn _mm256_maskload_pd(
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_pd(
-    mem_addr: *mut f64, mask: __m256i, a: __m256d
+    mem_addr: *mut f64, mask: __m256i, a: __m256d,
 ) {
     maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
 }
@@ -1844,7 +1836,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_ps(
-    mem_addr: *const f32, mask: __m256i
+    mem_addr: *const f32, mask: __m256i,
 ) -> __m256 {
     maskloadps256(mem_addr as *const i8, mask.as_i32x8())
 }
@@ -1858,7 +1850,7 @@ pub unsafe fn _mm256_maskload_ps(
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_ps(
-    mem_addr: *mut f32, mask: __m256i, a: __m256
+    mem_addr: *mut f32, mask: __m256i, a: __m256,
 ) {
     maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
 }
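For the maskload/maskstore family, a lane participates only when the most significant bit of the corresponding mask element is set; masked-out lanes read as zero and are skipped on store. A hedged round-trip sketch:

use std::arch::x86_64::*;

unsafe fn masked_copy(src: &[f64; 4], dst: &mut [f64; 4]) {
    // Lanes 0 and 2 have the MSB set (-1); lanes 1 and 3 are left
    // untouched in `dst` and come back as 0.0 from the load.
    let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
    let v = _mm256_maskload_pd(src.as_ptr(), mask);
    _mm256_maskstore_pd(dst.as_mut_ptr(), mask, v);
}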
@@ -2383,7 +2375,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_set_ps(
-    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
+    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
 ) -> __m256 {
     _mm256_setr_ps(h, g, f, e, d, c, b, a)
 }
@@ -2440,7 +2432,7 @@ pub unsafe fn _mm256_set_epi16(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_set_epi32(
-    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
+    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
 ) -> __m256i {
     _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
 }
@@ -2477,7 +2469,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setr_ps(
-    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
+    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
 ) -> __m256 {
     __m256(a, b, c, d, e, f, g, h)
 }
@@ -2536,7 +2528,7 @@ pub unsafe fn _mm256_setr_epi16(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setr_epi32(
-    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
+    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
 ) -> __m256i {
     mem::transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
 }
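The bodies above also document the `set` vs `setr` convention: `_mm256_set_*` takes arguments from the highest lane down and simply reverses into `_mm256_setr_*`, which takes them in memory (lowest-lane-first) order. So these two calls build the same vector:

use std::arch::x86_64::*;

unsafe fn equivalent() -> (__m256, __m256) {
    let a = _mm256_set_ps(8., 7., 6., 5., 4., 3., 2., 1.);
    let b = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
    (a, b) // identical lane contents
}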
@@ -2950,7 +2942,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128(
-    hiaddr: *const f32, loaddr: *const f32
+    hiaddr: *const f32, loaddr: *const f32,
 ) -> __m256 {
     let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
     _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
@@ -2967,7 +2959,7 @@ pub unsafe fn _mm256_loadu2_m128(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128d(
-    hiaddr: *const f64, loaddr: *const f64
+    hiaddr: *const f64, loaddr: *const f64,
 ) -> __m256d {
     let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
     _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
@@ -2983,7 +2975,7 @@ pub unsafe fn _mm256_loadu2_m128d(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128i(
-    hiaddr: *const __m128i, loaddr: *const __m128i
+    hiaddr: *const __m128i, loaddr: *const __m128i,
 ) -> __m256i {
     let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
     _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
@@ -3000,7 +2992,7 @@ pub unsafe fn _mm256_loadu2_m128i(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128(
-    hiaddr: *mut f32, loaddr: *mut f32, a: __m256
+    hiaddr: *mut f32, loaddr: *mut f32, a: __m256,
 ) {
     let lo = _mm256_castps256_ps128(a);
     _mm_storeu_ps(loaddr, lo);
@@ -3019,7 +3011,7 @@ pub unsafe fn _mm256_storeu2_m128(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128d(
-    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d
+    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d,
 ) {
     let lo = _mm256_castpd256_pd128(a);
     _mm_storeu_pd(loaddr, lo);
@@ -3037,7 +3029,7 @@ pub unsafe fn _mm256_storeu2_m128d(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128i(
-    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i
+    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i,
 ) {
     let lo = _mm256_castsi256_si128(a);
     _mm_storeu_si128(loaddr, lo);
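The `loadu2`/`storeu2` helpers stitch a 256-bit vector out of two independent, unaligned 128-bit halves (note the cast-then-insertf128 bodies); neither pointer needs 32-byte alignment. A usage sketch:

use std::arch::x86_64::*;

unsafe fn from_two_slices(lo: &[f32; 4], hi: &[f32; 4]) -> __m256 {
    // The high-half pointer comes first in the argument list.
    _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr())
}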
@@ -3500,20 +3492,11 @@ mod tests {
         let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
         let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
         let r = _mm256_blend_ps(a, b, 0x0);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
         let r = _mm256_blend_ps(a, b, 0x3);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
         let r = _mm256_blend_ps(a, b, 0xF);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
     }
     #[simd_test(enable = "avx")]
@@ -3544,16 +3527,8 @@ mod tests {
         let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
         let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
         let r = _mm256_dp_ps(a, b, 0xFF);
-        let e = _mm256_setr_ps(
-            200.,
-            200.,
-            200.,
-            200.,
-            2387.,
-            2387.,
-            2387.,
-            2387.,
-        );
+        let e =
+            _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
         assert_eq_m256(r, e);
     }
@@ -4234,9 +4209,7 @@ mod tests {
             pub data: [f64; 4],
         }
         let a = _mm256_set1_pd(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 4],
-        };
+        let mut mem = Memory { data: [-1.0; 4] };
         _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
         for i in 0..4 {
@@ -4251,9 +4224,7 @@ mod tests {
             pub data: [f32; 8],
         }
         let a = _mm256_set1_ps(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 8],
-        };
+        let mut mem = Memory { data: [-1.0; 8] };
         _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
         for i in 0..8 {
@@ -4534,10 +4505,7 @@ mod tests {
     #[simd_test(enable = "avx")]
     unsafe fn test_mm256_set_ps() {
         let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
     }
     #[simd_test(enable = "avx")]
@@ -4595,10 +4563,7 @@ mod tests {
     #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_ps() {
         let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
     }
     #[simd_test(enable = "avx")]
@@ -413,7 +413,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_epi32(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let imm8 = (imm8 & 0xFF) as u8;
     let a = a.as_i32x8();
@@ -480,7 +480,7 @@ pub unsafe fn _mm256_blend_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_epi16(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let imm8 = (imm8 & 0xFF) as u8;
     let a = a.as_i16x16();
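In both blend intrinsics `imm8` is a per-lane selector: bit i set takes lane i of the result from `b`, clear takes it from `a` (for `_mm256_blend_epi16` the 8-bit mask is reused for each 128-bit half, which is what the blend3/blend4 machinery below encodes). A sketch with the signatures shown here:

use std::arch::x86_64::*;

unsafe fn low_from_b() -> __m256i {
    let a = _mm256_set1_epi32(0);
    let b = _mm256_set1_epi32(1);
    // 0x0F: lanes 0..=3 from `b`, lanes 4..=7 from `a`.
    _mm256_blend_epi32(a, b, 0x0F)
}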
@@ -531,76 +531,20 @@ pub unsafe fn _mm256_blend_epi16(
         ) => {
             match (imm8 >> 6) & 0b11 {
                 0b00 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    6,
-                    7,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    14,
-                    15
+                    $a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2,
+                    $f2, 14, 15
                 ),
                 0b01 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    22,
-                    7,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    30,
-                    15
+                    $a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 30, 15
                 ),
                 0b10 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    6,
-                    23,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    14,
-                    31
+                    $a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 14, 31
                 ),
                 _ => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    22,
-                    23,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    30,
-                    31
+                    $a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 30, 31
                 ),
             }
         };
@@ -618,60 +562,16 @@ pub unsafe fn _mm256_blend_epi16(
         ) => {
             match (imm8 >> 4) & 0b11 {
                 0b00 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    4,
-                    5,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    12,
-                    13
+                    $a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13
                 ),
                 0b01 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    20,
-                    5,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    28,
-                    13
+                    $a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13
                 ),
                 0b10 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    4,
-                    21,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    12,
-                    29
+                    $a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29
                 ),
                 _ => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    20,
-                    21,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    28,
-                    29
+                    $a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29
                 ),
             }
         };
@@ -703,13 +603,9 @@ pub unsafe fn _mm256_blend_epi16(
 #[cfg_attr(test, assert_instr(vpblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blendv_epi8(
-    a: __m256i, b: __m256i, mask: __m256i
+    a: __m256i, b: __m256i, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(pblendvb(
-        a.as_i8x32(),
-        b.as_i8x32(),
-        mask.as_i8x32(),
-    ))
+    mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
 }
 /// Broadcast the low packed 8-bit integer from `a` to all elements of
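Unlike the immediate blends above, `_mm256_blendv_epi8` takes its selector as a runtime vector: for each of the 32 bytes, a set MSB in `mask` picks the byte from `b`, otherwise from `a`. A sketch:

use std::arch::x86_64::*;

unsafe fn interleave_choice(a: __m256i, b: __m256i) -> __m256i {
    // Alternating selector: even bytes from `b` (MSB set), odd from `a`.
    let mask = _mm256_setr_epi8(
        -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
        -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
    );
    _mm256_blendv_epi8(a, b, mask)
}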
@@ -1226,7 +1122,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_epi32(
-    slice: *const i32, offsets: __m128i, scale: i32
+    slice: *const i32, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi32(-1).as_i32x4();
@@ -1280,7 +1176,7 @@ pub unsafe fn _mm_mask_i32gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_epi32(
-    slice: *const i32, offsets: __m256i, scale: i32
+    slice: *const i32, offsets: __m256i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i32x8();
     let neg_one = _mm256_set1_epi32(-1).as_i32x8();
@@ -1334,7 +1230,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_ps(
-    slice: *const f32, offsets: __m128i, scale: i32
+    slice: *const f32, offsets: __m128i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1360,7 +1256,7 @@ pub unsafe fn _mm_i32gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mask_i32gather_ps(
-    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i32x4();
     let slice = slice as *const i8;
@@ -1383,7 +1279,7 @@ pub unsafe fn _mm_mask_i32gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_ps(
-    slice: *const f32, offsets: __m256i, scale: i32
+    slice: *const f32, offsets: __m256i, scale: i32,
 ) -> __m256 {
     let zero = _mm256_setzero_ps();
     let neg_one = _mm256_set1_ps(-1.0);
@@ -1409,7 +1305,7 @@ pub unsafe fn _mm256_i32gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mask_i32gather_ps(
-    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32
+    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32,
 ) -> __m256 {
     let offsets = offsets.as_i32x8();
     let slice = slice as *const i8;
@@ -1432,7 +1328,7 @@ pub unsafe fn _mm256_mask_i32gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i64x2();
     let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1486,7 +1382,7 @@ pub unsafe fn _mm_mask_i32gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i64x4();
     let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1540,7 +1436,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m128d {
     let zero = _mm_setzero_pd();
     let neg_one = _mm_set1_pd(-1.0);
@@ -1590,7 +1486,7 @@ pub unsafe fn _mm_mask_i32gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m256d {
     let zero = _mm256_setzero_pd();
     let neg_one = _mm256_set1_pd(-1.0);
@@ -1640,7 +1536,7 @@ pub unsafe fn _mm256_mask_i32gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_epi32(
-    slice: *const i32, offsets: __m128i, scale: i32
+    slice: *const i32, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1694,7 +1590,7 @@ pub unsafe fn _mm_mask_i64gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_epi32(
-    slice: *const i32, offsets: __m256i, scale: i32
+    slice: *const i32, offsets: __m256i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1748,7 +1644,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_ps(
-    slice: *const f32, offsets: __m128i, scale: i32
+    slice: *const f32, offsets: __m128i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1774,7 +1670,7 @@ pub unsafe fn _mm_i64gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mask_i64gather_ps(
-    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i64x2();
     let slice = slice as *const i8;
@@ -1797,7 +1693,7 @@ pub unsafe fn _mm_mask_i64gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_ps(
-    slice: *const f32, offsets: __m256i, scale: i32
+    slice: *const f32, offsets: __m256i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1823,7 +1719,7 @@ pub unsafe fn _mm256_i64gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mask_i64gather_ps(
-    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i64x4();
     let slice = slice as *const i8;
@@ -1846,7 +1742,7 @@ pub unsafe fn _mm256_mask_i64gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i64x2();
     let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1900,7 +1796,7 @@ pub unsafe fn _mm_mask_i64gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_epi64(
-    slice: *const i64, offsets: __m256i, scale: i32
+    slice: *const i64, offsets: __m256i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i64x4();
     let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1954,7 +1850,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m128d {
     let zero = _mm_setzero_pd();
     let neg_one = _mm_set1_pd(-1.0);
@@ -2004,7 +1900,7 @@ pub unsafe fn _mm_mask_i64gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_pd(
-    slice: *const f64, offsets: __m256i, scale: i32
+    slice: *const f64, offsets: __m256i, scale: i32,
 ) -> __m256d {
     let zero = _mm256_setzero_pd();
     let neg_one = _mm256_set1_pd(-1.0);
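All of these gathers share one shape: lane i of the result is loaded from `slice_ptr + offsets[i] * scale` (with `scale` in bytes and, per the `rustc_args_required_const` attributes, required to be a constant), and the `mask` variants substitute `src` lanes where the mask MSB is clear. A hedged sketch with the plain variant:

use std::arch::x86_64::*;

unsafe fn every_other(table: &[i32]) -> __m128i {
    // Gathers table[0], table[2], table[4], table[6]: scale 4 converts
    // the element offsets into byte addresses.
    let offsets = _mm_setr_epi32(0, 2, 4, 6);
    _mm_i32gather_epi32(table.as_ptr(), offsets, 4)
}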
@@ -2053,7 +1949,7 @@ pub unsafe fn _mm256_mask_i64gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_inserti128_si256(
-    a: __m256i, b: __m128i, imm8: i32
+    a: __m256i, b: __m128i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i64x4();
     let b = _mm256_castsi128_si256(b).as_i64x4();
@@ -2101,12 +1997,9 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi32(
-    mem_addr: *const i32, mask: __m128i
+    mem_addr: *const i32, mask: __m128i,
 ) -> __m128i {
-    mem::transmute(maskloadd(
-        mem_addr as *const i8,
-        mask.as_i32x4(),
-    ))
+    mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
 }
 /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2119,12 +2012,9 @@ pub unsafe fn _mm_maskload_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi32(
-    mem_addr: *const i32, mask: __m256i
+    mem_addr: *const i32, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(maskloadd256(
-        mem_addr as *const i8,
-        mask.as_i32x8(),
-    ))
+    mem::transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
 }
 /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2137,12 +2027,9 @@ pub unsafe fn _mm256_maskload_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi64(
-    mem_addr: *const i64, mask: __m128i
+    mem_addr: *const i64, mask: __m128i,
 ) -> __m128i {
-    mem::transmute(maskloadq(
-        mem_addr as *const i8,
-        mask.as_i64x2(),
-    ))
+    mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
 }
 /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2155,12 +2042,9 @@ pub unsafe fn _mm_maskload_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi64(
-    mem_addr: *const i64, mask: __m256i
+    mem_addr: *const i64, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(maskloadq256(
-        mem_addr as *const i8,
-        mask.as_i64x4(),
-    ))
+    mem::transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
 }
 /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2173,13 +2057,9 @@ pub unsafe fn _mm256_maskload_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi32(
-    mem_addr: *mut i32, mask: __m128i, a: __m128i
+    mem_addr: *mut i32, mask: __m128i, a: __m128i,
 ) {
-    maskstored(
-        mem_addr as *mut i8,
-        mask.as_i32x4(),
-        a.as_i32x4(),
-    )
+    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
 }
 /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2192,13 +2072,9 @@ pub unsafe fn _mm_maskstore_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi32(
-    mem_addr: *mut i32, mask: __m256i, a: __m256i
+    mem_addr: *mut i32, mask: __m256i, a: __m256i,
 ) {
-    maskstored256(
-        mem_addr as *mut i8,
-        mask.as_i32x8(),
-        a.as_i32x8(),
-    )
+    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
 }
 /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2211,13 +2087,9 @@ pub unsafe fn _mm256_maskstore_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi64(
-    mem_addr: *mut i64, mask: __m128i, a: __m128i
+    mem_addr: *mut i64, mask: __m128i, a: __m128i,
 ) {
-    maskstoreq(
-        mem_addr as *mut i8,
-        mask.as_i64x2(),
-        a.as_i64x2(),
-    )
+    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
 }
 /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2230,13 +2102,9 @@ pub unsafe fn _mm_maskstore_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi64(
-    mem_addr: *mut i64, mask: __m256i, a: __m256i
+    mem_addr: *mut i64, mask: __m256i, a: __m256i,
 ) {
-    maskstoreq256(
-        mem_addr as *mut i8,
-        mask.as_i64x4(),
-        a.as_i64x4(),
-    )
+    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
 }
 /// Compare packed 16-bit integers in `a` and `b`, and return the packed
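The integer maskload/maskstore pairs follow the same MSB-per-lane convention as the AVX floating-point ones earlier in this commit. A sketch:

use std::arch::x86_64::*;

unsafe fn load_first_three(src: &[i32; 4]) -> __m128i {
    // Lane 3's mask MSB is clear, so that element is never read and the
    // lane comes back as zero.
    let mask = _mm_setr_epi32(-1, -1, -1, 0);
    _mm_maskload_epi32(src.as_ptr(), mask)
}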
@@ -2410,7 +2278,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mpsadbw_epu8(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_u8x32();
     let b = b.as_u8x32();
@@ -2656,7 +2524,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2x128_si256(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i64x4();
     let b = b.as_i64x4();
@@ -3559,16 +3427,23 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
 /// # if is_x86_feature_detected!("avx2") {
 /// # #[target_feature(enable = "avx2")]
 /// # unsafe fn worker() {
-/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
-///     -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
 ///
 /// let c = _mm256_unpackhi_epi8(a, b);
 ///
-/// let expected = _mm256_setr_epi8(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13,
-///     14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30,
-///     31,-31);
+/// let expected = _mm256_setr_epi8(
+///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
+///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
+///     -31,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// # }
@@ -3612,15 +3487,22 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
 /// # if is_x86_feature_detected!("avx2") {
 /// # #[target_feature(enable = "avx2")]
 /// # unsafe fn worker() {
-/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
-///     -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
 ///
 /// let c = _mm256_unpacklo_epi8(a, b);
 ///
-/// let expected = _mm256_setr_epi8(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7,
-///     16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
+/// let expected = _mm256_setr_epi8(
+///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
+///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// # }
@@ -3664,13 +3546,18 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
 /// # if is_x86_feature_detected!("avx2") {
 /// # #[target_feature(enable = "avx2")]
 /// # unsafe fn worker() {
-/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
 ///
 /// let c = _mm256_unpackhi_epi16(a, b);
 ///
-/// let expected = _mm256_setr_epi16(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14,
-///     15,-15);
+/// let expected = _mm256_setr_epi16(
+///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// # }
@@ -3688,9 +3575,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
     let r: i16x16 = simd_shuffle16(
         a.as_i16x16(),
         b.as_i16x16(),
-        [
-            4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31
-        ],
+        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
     );
     mem::transmute(r)
 }
@@ -3715,13 +3600,18 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
 /// # #[target_feature(enable = "avx2")]
 /// # unsafe fn worker() {
 ///
-/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
 ///
 /// let c = _mm256_unpacklo_epi16(a, b);
 ///
-/// let expected = _mm256_setr_epi16(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10,
-///     11,-11);
+/// let expected = _mm256_setr_epi16(
+///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// # }
@@ -3739,9 +3629,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
     let r: i16x16 = simd_shuffle16(
         a.as_i16x16(),
         b.as_i16x16(),
-        [
-            0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27
-        ],
+        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
     );
     mem::transmute(r)
 }
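The shuffle indices made visible by this reformat (4, 20, 5, 21, ..., 12, 28, ...) show why the doc examples above look "split": the unpack intrinsics interleave within each 128-bit lane independently and never mix the low and high halves of the 256-bit register. A sketch of the consequence:

use std::arch::x86_64::*;

unsafe fn lanewise() -> __m256i {
    let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = _mm256_set1_epi16(-1);
    // Result lanes: 4, -1, 5, -1, 6, -1, 7, -1, 12, -1, 13, -1, 14, -1, 15, -1.
    _mm256_unpackhi_epi16(a, b)
}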
@@ -3832,11 +3720,8 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let r: i32x8 = simd_shuffle8(
-        a.as_i32x8(),
-        b.as_i32x8(),
-        [0, 8, 1, 9, 4, 12, 5, 13],
-    );
+    let r: i32x8 =
+        simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
     mem::transmute(r)
 }
@@ -4183,35 +4068,35 @@ extern "C" {
     fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(
-        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.d.d.256"]
     fn vpgatherdd(
-        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8
+        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8,
     ) -> i32x8;
     #[link_name = "llvm.x86.avx2.gather.d.q"]
     fn pgatherdq(
-        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8
+        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8,
     ) -> i64x2;
     #[link_name = "llvm.x86.avx2.gather.d.q.256"]
     fn vpgatherdq(
-        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8
+        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8,
     ) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.q.d"]
     fn pgatherqd(
-        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.q.d.256"]
     fn vpgatherqd(
-        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.q.q"]
     fn pgatherqq(
-        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8
+        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8,
     ) -> i64x2;
     #[link_name = "llvm.x86.avx2.gather.q.q.256"]
     fn vpgatherqq(
-        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8
+        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8,
     ) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.pd"]
     fn pgatherdpd(
@@ -4235,19 +4120,19 @@ extern "C" {
     ) -> __m256d;
     #[link_name = "llvm.x86.avx2.gather.d.ps"]
     fn pgatherdps(
-        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
     fn vpgatherdps(
-        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8
+        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8,
     ) -> __m256;
     #[link_name = "llvm.x86.avx2.gather.q.ps"]
     fn pgatherqps(
-        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
     fn vpgatherqps(
-        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.psll.dq"]
     fn vpslldq(a: i64x4, b: i32) -> i64x4;
@@ -4718,10 +4603,7 @@ mod tests {
             7, 6, 5, 4, 3, 2, 1, 0,
         );
         let r = _mm256_cmpeq_epi8(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
     }
     #[simd_test(enable = "avx2")]
@@ -4737,10 +4619,7 @@ mod tests {
             7, 6, 5, 4, 3, 2, 1, 0,
         );
         let r = _mm256_cmpeq_epi16(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
     }
     #[simd_test(enable = "avx2")]
@@ -4758,10 +4637,7 @@ mod tests {
         let a = _mm256_setr_epi64x(0, 1, 2, 3);
         let b = _mm256_setr_epi64x(3, 2, 2, 0);
         let r = _mm256_cmpeq_epi64(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
     }
     #[simd_test(enable = "avx2")]
@@ -4769,10 +4645,7 @@ mod tests {
         let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
         let b = _mm256_set1_epi8(0);
         let r = _mm256_cmpgt_epi8(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
     }
     #[simd_test(enable = "avx2")]
@@ -4780,10 +4653,7 @@ mod tests {
         let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
         let b = _mm256_set1_epi16(0);
         let r = _mm256_cmpgt_epi16(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
     }
     #[simd_test(enable = "avx2")]
@@ -4791,10 +4661,7 @@ mod tests {
         let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
         let b = _mm256_set1_epi32(0);
         let r = _mm256_cmpgt_epi32(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
     }
     #[simd_test(enable = "avx2")]
@@ -4802,10 +4669,7 @@ mod tests {
         let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
         let b = _mm256_set1_epi64x(0);
         let r = _mm256_cmpgt_epi64(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
     }
     #[simd_test(enable = "avx2")]
@@ -5997,16 +5861,7 @@ mod tests {
         );
         assert_eq_m256(
             r,
-            _mm256_setr_ps(
-                0.0,
-                16.0,
-                64.0,
-                256.0,
-                256.0,
-                256.0,
-                256.0,
-                256.0,
-            ),
+            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
         );
     }
@@ -21,17 +21,14 @@ use stdsimd_test::assert_instr;
#[cfg_attr(test, assert_instr(bextr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
    _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32))
}
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits `[7,0]` of `control` specify the index to the first bit in the range
/// to be extracted, and bits `[15,8]` specify the length of the range.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
#[inline]
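// Editor's note: an illustrative sketch, not part of this commit, showing
// how the packed control word built above behaves in plain Rust.
fn bextr_control_demo() {
    let (a, start, len) = (0xABCD_u32, 8u32, 4u32);
    let control = (start & 0xff) | ((len & 0xff) << 8);
    assert_eq!(control, 0x0408);
    // Software equivalent of BEXTR for len < 32:
    let extracted = (a >> start) & ((1u32 << len) - 1);
    assert_eq!(extracted, 0xB); // bits [11:8] of 0xABCD
}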
@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor(mem_addr: *const u8) {
#[cfg(test)]
mod tests {
use coresimd::x86::*;
use std::{cmp::PartialEq, fmt};
use stdsimd_test::simd_test;
#[repr(align(16))]
@@ -380,7 +380,7 @@ pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 {
#[inline]
#[target_feature(enable = "mmx")]
pub unsafe fn _mm_set_pi8(
    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
) -> __m64 {
    _mm_setr_pi8(e0, e1, e2, e3, e4, e5, e6, e7)
}
@@ -426,7 +426,7 @@ pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 {
#[inline]
#[target_feature(enable = "mmx")]
pub unsafe fn _mm_setr_pi8(
    e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8,
) -> __m64 {
    mem::transmute(i8x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}
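// Editor's note: illustration only, not part of this commit. `_mm_set_pi8`
// takes lanes high-to-low and `_mm_setr_pi8` low-to-high, which is why the
// bodies above simply reverse their arguments (assert helper assumed from
// this crate's own tests).
#[target_feature(enable = "mmx")]
unsafe fn set_vs_setr_demo() {
    let x = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    let y = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    assert_eq_m64(x, y); // same vector, two argument orders
}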
@@ -514,12 +514,8 @@ mod tests {
    -30001,
    i16::max_value() - 1,
);
let e =
    _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value());
assert_eq_m64(e, _mm_add_pi16(a, b));
assert_eq_m64(e, _m_paddw(a, b));
}
@@ -537,16 +533,8 @@ mod tests {
unsafe fn test_mm_adds_pi8() {
let a = _mm_setr_pi8(-100, -1, 1, 100, -1, 0, 1, 0);
let b = _mm_setr_pi8(-100, 1, -1, 100, 0, -1, 0, 1);
let e =
    _mm_setr_pi8(i8::min_value(), 0, 0, i8::max_value(), -1, -1, 1, 1);
assert_eq_m64(e, _mm_adds_pi8(a, b));
assert_eq_m64(e, _m_paddsb(a, b));
}
@@ -444,11 +444,12 @@ impl m256iExt for __m256i {
}
}
use coresimd::simd::{
    f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8, i32x2, i32x4,
    i32x8, i64x2, i64x4, i8x16, i8x32, i8x8, m16x16, m16x4, m16x8, m32x2,
    m32x4, m32x8, m64x2, m64x4, m8x16, m8x32, m8x8, u16x16, u16x4, u16x8,
    u32x2, u32x4, u32x8, u64x2, u64x4, u8x16, u8x32, u8x8,
};
impl_from_bits_!(
    __m64: u32x2,
@@ -25,20 +25,25 @@ extern "C" {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
#[inline]
#[target_feature(enable = "pclmulqdq")]
#[cfg_attr(
    all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16)
)]
#[cfg_attr(
    all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17)
)]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clmulepi64_si128(
    a: __m128i, b: __m128i, imm8: i32,
) -> __m128i {
    macro_rules! call {
        ($imm8:expr) => {
@@ -1,4 +1,3 @@
//! RDRAND and RDSEED instructions for returning random numbers from an Intel
//! on-chip hardware random number generator which has been seeded by an
//! on-chip entropy source.
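// Editor's note: a hedged usage sketch, not part of this commit. RDRAND
// can transiently fail, so callers usually retry; this assumes the
// `_rdrand32_step(&mut u32) -> i32` signature stabilized in 1.27
// (returns 1 on success).
#[target_feature(enable = "rdrand")]
unsafe fn next_random_u32() -> Option<u32> {
    let mut v = 0u32;
    for _ in 0..10 {
        if _rdrand32_step(&mut v) == 1 {
            return Some(v);
        }
    }
    None // persistent failure; fall back to another entropy source
}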
@@ -75,7 +75,7 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sha1rnds4_epu32(
    a: __m128i, b: __m128i, func: i32,
) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
@@ -126,13 +126,9 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(sha256rnds2))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sha256rnds2_epu32(
    a: __m128i, b: __m128i, k: __m128i,
) -> __m128i {
    mem::transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4()))
}
#[cfg(test)]
@@ -230,8 +230,10 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -249,8 +251,10 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -265,8 +269,10 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -281,8 +287,10 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
@@ -1132,10 +1140,14 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))]
// 32-bit codegen does not generate `movhps` or `movhpd`, but instead
// `movsd` followed by `unpcklpd` (or `movss`/`unpcklps` if there's no SSE2).
#[cfg_attr(
    all(test, target_arch = "x86", target_feature = "sse2"),
    assert_instr(movlhps)
)]
#[cfg_attr(
    all(test, target_arch = "x86", not(target_feature = "sse2")),
    assert_instr(unpcklps)
)]
// TODO: This function is actually not limited to floats, but that's what
// matches the C type most closely: (__m128, *const __m64) -> __m128
pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
@@ -1185,11 +1197,15 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
// #[cfg_attr(test, assert_instr(movlps))]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
// On 32-bit targets with SSE2, it just generates two `movsd`.
#[cfg_attr(
    all(test, target_arch = "x86", target_feature = "sse2"),
    assert_instr(movsd)
)]
// It should really generate "movlps", but oh well...
#[cfg_attr(
    all(test, target_arch = "x86", not(target_feature = "sse2")),
    assert_instr(movss)
)]
// TODO: Like _mm_loadh_pi, this also isn't limited to floats.
pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 {
    let q = p as *const f32x2;
@@ -1321,8 +1337,10 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's
// fine.
// On i586 (no SSE2) it just generates plain MOV instructions.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(movhpd)
)]
pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
    #[cfg(target_arch = "x86")]
    {
@@ -1349,8 +1367,10 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
#[inline]
#[target_feature(enable = "sse")]
// On i586 the codegen just generates plain MOVs. No need to test for that.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(movlps)
)]
pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) {
    #[cfg(target_arch = "x86")]
    {
@@ -1929,7 +1949,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 {
#[target_feature(enable = "sse")] #[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")] #[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_TRANSPOSE4_PS( pub unsafe fn _MM_TRANSPOSE4_PS(
row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128 row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128,
) { ) {
let tmp0 = _mm_unpacklo_ps(*row0, *row1); let tmp0 = _mm_unpacklo_ps(*row0, *row1);
let tmp2 = _mm_unpacklo_ps(*row2, *row3); let tmp2 = _mm_unpacklo_ps(*row2, *row3);
@@ -2734,12 +2754,8 @@ mod tests {
let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
let e2: u32x4 = transmute(_mm_setr_ps( let e2: u32x4 =
transmute(0xffffffffu32), transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
2.0,
3.0,
4.0,
));
assert_eq!(r2, e2); assert_eq!(r2, e2);
} }
@@ -3453,22 +3469,9 @@ mod tests {
#[simd_test(enable = "sse")] #[simd_test(enable = "sse")]
unsafe fn test_mm_cvtss_si32() { unsafe fn test_mm_cvtss_si32() {
let inputs = &[ let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
42.0f32, let result =
-3.1, &[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520];
4.0e10,
4.0e-20,
NAN,
2147483500.1,
];
let result = &[
42i32,
-3,
i32::min_value(),
0,
i32::min_value(),
2147483520,
];
for i in 0..inputs.len() { for i in 0..inputs.len() {
let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
let e = result[i]; let e = result[i];
@@ -3672,10 +3675,8 @@ mod tests {
}
let r = _mm_load_ps(p);
let e =
    _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
assert_eq_m128(r, e);
}
@@ -3705,10 +3706,8 @@ mod tests {
}
let r = _mm_loadr_ps(p);
let e =
    _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
assert_eq_m128(r, e);
}
@@ -3947,9 +3946,7 @@ mod tests {
#[simd_test(enable = "sse")] #[simd_test(enable = "sse")]
unsafe fn test_mm_stream_ps() { unsafe fn test_mm_stream_ps() {
let a = _mm_set1_ps(7.0); let a = _mm_set1_ps(7.0);
let mut mem = Memory { let mut mem = Memory { data: [-1.0; 4] };
data: [-1.0; 4],
};
_mm_stream_ps(&mut mem.data[0] as *mut f32, a); _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
for i in 0..4 { for i in 0..4 {
@@ -4157,12 +4154,8 @@ mod tests {
#[simd_test(enable = "sse,mmx")] #[simd_test(enable = "sse,mmx")]
unsafe fn test_mm_movemask_pi8() { unsafe fn test_mm_movemask_pi8() {
let a = _mm_setr_pi16( let a =
0b1000_0000, _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
0b0100_0000,
0b1000_0000,
0b0100_0000,
);
let r = _mm_movemask_pi8(a); let r = _mm_movemask_pi8(a);
assert_eq!(r, 0b10001); assert_eq!(r, 0b10001);
@@ -1010,7 +1010,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> __m128i {
    mem::transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}
@@ -1095,7 +1095,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi16(
    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}
@@ -1134,10 +1134,15 @@ pub unsafe fn _mm_setzero_si128() -> __m128i {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movsd on windows
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, simd_extract((*mem_addr).as_i64x2(), 0))
@@ -1190,7 +1195,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(
    a: __m128i, mask: __m128i, mem_addr: *mut i8,
) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}
@@ -1229,10 +1234,15 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME mov on windows, movlps on i686
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(
@@ -1275,8 +1285,9 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on windows, movd on i686
#[cfg_attr(
    all(test, not(windows), target_arch = "x86_64"), assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
@@ -1341,11 +1352,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16))
}
/// Return a mask of the most significant bit of each element in `a`.
@@ -1443,16 +1450,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i {
simd_shuffle8(
    a,
    a,
    [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4],
)
};
}
@@ -1567,9 +1565,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i8x16, _>(simd_shuffle16(
    a.as_i8x16(),
    b.as_i8x16(),
    [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
))
}
@@ -1630,9 +1626,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
mem::transmute::<i8x16, _>(simd_shuffle16(
    a.as_i8x16(),
    b.as_i8x16(),
    [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
))
}
@@ -1644,11 +1638,8 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    let x =
        simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
    mem::transmute::<i16x8, _>(x)
}
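// Editor's note: illustration only, not part of this commit. The shuffle
// indices [0, 8, 1, 9, 2, 10, 3, 11] interleave the low halves of `a` and
// `b`:
#[target_feature(enable = "sse2")]
unsafe fn unpacklo_epi16_demo() {
    let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    let r = _mm_unpacklo_epi16(a, b);
    assert_eq_m128i(r, _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11));
}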
@@ -1947,11 +1938,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}
/// Return a new vector with the low element of `a` replaced by the
@@ -1963,11 +1950,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}
/// Return a new vector with the low element of `a` replaced by the result
@@ -2042,11 +2025,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}
/// Return a new vector with the low element of `a` replaced by the
@@ -2058,11 +2037,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}
/// Compare corresponding elements in `a` and `b` for equality.
@@ -2881,8 +2856,9 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
/// The resulting `__m128d` element is composed by the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
///   input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first
///   input
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
#[inline]
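// Editor's note: illustration only, not part of this commit. The lane
// selection described above yields [a.high, b.high]:
#[target_feature(enable = "sse2")]
unsafe fn unpackhi_pd_demo() {
    let a = _mm_setr_pd(1.0, 2.0); // low = 1.0, high = 2.0
    let b = _mm_setr_pd(3.0, 4.0); // low = 3.0, high = 4.0
    let r = _mm_unpackhi_pd(a, b);
    assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
}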
@@ -3223,22 +3199,7 @@ mod tests {
#[simd_test(enable = "sse2")] #[simd_test(enable = "sse2")]
unsafe fn test_mm_add_epi8() { unsafe fn test_mm_add_epi8() {
let a = _mm_setr_epi8( let a = _mm_setr_epi8(
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
); );
#[cfg_attr(rustfmt, rustfmt_skip)] #[cfg_attr(rustfmt, rustfmt_skip)]
let b = _mm_setr_epi8( let b = _mm_setr_epi8(
@@ -3290,22 +3251,7 @@ mod tests {
#[simd_test(enable = "sse2")] #[simd_test(enable = "sse2")]
unsafe fn test_mm_adds_epi8() { unsafe fn test_mm_adds_epi8() {
let a = _mm_setr_epi8( let a = _mm_setr_epi8(
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
); );
#[cfg_attr(rustfmt, rustfmt_skip)] #[cfg_attr(rustfmt, rustfmt_skip)]
let b = _mm_setr_epi8( let b = _mm_setr_epi8(
@@ -3363,22 +3309,7 @@ mod tests {
#[simd_test(enable = "sse2")] #[simd_test(enable = "sse2")]
unsafe fn test_mm_adds_epu8() { unsafe fn test_mm_adds_epu8() {
let a = _mm_setr_epi8( let a = _mm_setr_epi8(
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
); );
#[cfg_attr(rustfmt, rustfmt_skip)] #[cfg_attr(rustfmt, rustfmt_skip)]
let b = _mm_setr_epi8( let b = _mm_setr_epi8(
@@ -3629,22 +3560,7 @@ mod tests {
); );
let r = _mm_slli_si128(a, 1); let r = _mm_slli_si128(a, 1);
let e = _mm_setr_epi8( let e = _mm_setr_epi8(
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
); );
assert_eq_m128i(r, e); assert_eq_m128i(r, e);
@@ -3888,41 +3804,10 @@ mod tests {
#[simd_test(enable = "sse2")] #[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpeq_epi8() { unsafe fn test_mm_cmpeq_epi8() {
let a = _mm_setr_epi8( let a = _mm_setr_epi8(
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
);
let b = _mm_setr_epi8(
15,
14,
2,
12,
11,
10,
9,
8,
7,
6,
5,
4,
3,
2,
1,
0,
); );
let b =
_mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let r = _mm_cmpeq_epi8(a, b); let r = _mm_cmpeq_epi8(a, b);
#[cfg_attr(rustfmt, rustfmt_skip)] #[cfg_attr(rustfmt, rustfmt_skip)]
assert_eq_m128i( assert_eq_m128i(
@@ -4869,9 +4754,7 @@ mod tests {
pub data: [f64; 2],
}
let a = _mm_set1_pd(7.0);
let mut mem = Memory { data: [-1.0; 2] };
_mm_stream_pd(&mut mem.data[0] as *mut f64, a);
for i in 0..2 {
@@ -4889,9 +4772,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_store_pd() {
let mut mem = Memory { data: [0.0f64; 4] };
let vals = &mut mem.data;
let a = _mm_setr_pd(1.0, 2.0);
let d = vals.as_mut_ptr();
@@ -4903,9 +4784,7 @@ mod tests {
#[simd_test(enable = "sse")]
unsafe fn test_mm_storeu_pd() {
let mut mem = Memory { data: [0.0f64; 4] };
let vals = &mut mem.data;
let a = _mm_setr_pd(1.0, 2.0);
@@ -4929,9 +4808,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_store1_pd() {
let mut mem = Memory { data: [0.0f64; 4] };
let vals = &mut mem.data;
let a = _mm_setr_pd(1.0, 2.0);
let d = vals.as_mut_ptr();
@@ -4943,9 +4820,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_store_pd1() {
let mut mem = Memory { data: [0.0f64; 4] };
let vals = &mut mem.data;
let a = _mm_setr_pd(1.0, 2.0);
let d = vals.as_mut_ptr();
@@ -4957,9 +4832,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_storer_pd() {
let mut mem = Memory { data: [0.0f64; 4] };
let vals = &mut mem.data;
let a = _mm_setr_pd(1.0, 2.0);
let d = vals.as_mut_ptr();
@@ -5013,10 +4886,7 @@ mod tests {
}
let r = _mm_loadu_pd(d);
let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
assert_eq_m128d(r, e);
}
@@ -5091,12 +4961,8 @@ mod tests {
assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
let a =
    _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
let b = _mm_setr_pd(f64::INFINITY, -5.0);
let r = _mm_cvtsd_ss(a, b);
@@ -5161,12 +5027,8 @@ mod tests {
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
let a =
    _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(
    r,
@@ -66,13 +66,9 @@ pub const _MM_FROUND_NEARBYINT: i32 =
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(
    a: __m128i, b: __m128i, mask: __m128i,
) -> __m128i {
    mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}
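// Editor's note: illustration only, not part of this commit. Bytes of
// `mask` with the top bit set pick from `b`, the rest from `a`:
#[target_feature(enable = "sse4.1")]
unsafe fn blendv_epi8_demo() {
    let a = _mm_set1_epi8(1);
    let b = _mm_set1_epi8(2);
    let m = _mm_setr_epi8(0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
    let r = _mm_blendv_epi8(a, b, m);
    let e = _mm_setr_epi8(1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2);
    assert_eq_m128i(r, e);
}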
/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
@@ -250,11 +246,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
}
/// Return a copy of `a` with the 32-bit integer from `i` inserted at a
@@ -267,11 +259,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
}
/// Compare packed 8-bit integers in `a` and `b` and return packed maximum
@@ -1778,16 +1766,12 @@ mod tests {
}
{
let a = _mm_setr_epi32(
    15, 2, /* ignored */
    1234567, 4, /* ignored */
);
let b = _mm_setr_epi32(
    -20, -256, /* ignored */
    666666, 666666, /* ignored */
);
let r = _mm_mul_epi32(a, b);
let e = _mm_setr_epi64x(-300, 823043843622);
@@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrm(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -544,7 +544,7 @@ pub unsafe fn _mm_cmpestrm(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestri(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -567,7 +567,7 @@ pub unsafe fn _mm_cmpestri(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrz(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -590,7 +590,7 @@ pub unsafe fn _mm_cmpestrz(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrc(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -613,7 +613,7 @@ pub unsafe fn _mm_cmpestrc(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestrs(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -636,7 +636,7 @@ pub unsafe fn _mm_cmpestrs(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestro(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -660,7 +660,7 @@ pub unsafe fn _mm_cmpestro(
#[rustc_args_required_const(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpestra(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@@ -917,13 +917,8 @@ mod tests {
unsafe fn test_mm_cmpestra() {
let a = str_to_m128i(b"Cannot match a");
let b = str_to_m128i(b"Null after 14");
let i =
    _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK);
assert_eq!(1, i);
}
@@ -25,8 +25,8 @@ extern "C" {
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
///
/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
/// other bits are ignored.
///
/// If the length is zero, it is interpreted as `64`. If the length and index
/// are zero, the lower 64 bits of `x` are extracted.
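// Editor's note: a plain-Rust model of the selection rule above, added for
// illustration only (not part of this commit; hardware uses the TBM
// `bextri` instruction).
fn bextri_model(x: u64, y: u64) -> u64 {
    let start = (y >> 8) & 0x3f; // bits [13:8]: start index
    let len = y & 0x3f; // bits [5:0]: length, zero meaning 64
    if len == 0 {
        return x >> start; // length 64: everything from `start` up
    }
    (x >> start) & ((1u64 << len) - 1)
}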
@@ -596,24 +596,8 @@ mod tests {
    12, 5, 5, 10,
    4, 1, 8, 0,
);
let expected =
    _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
let r = _mm_shuffle_epi8(a, b);
assert_eq_m128i(r, expected);
}
@@ -121,7 +121,7 @@ mod x86_polyfill {
#[target_feature(enable = "avx2")] #[target_feature(enable = "avx2")]
pub unsafe fn _mm256_insert_epi64( pub unsafe fn _mm256_insert_epi64(
a: __m256i, val: i64, idx: i32 a: __m256i, val: i64, idx: i32,
) -> __m256i { ) -> __m256i {
union A { union A {
a: __m256i, a: __m256i,
@@ -38,11 +38,7 @@ extern "C" {
#[cfg_attr(test, assert_instr(xsave))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
    xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
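// Editor's note: illustration only, not part of this commit. Every wrapper
// in this family splits the 64-bit mask into the EDX:EAX pair that the
// XSAVE-family instructions expect:
fn xsave_mask_split_demo() {
    let save_mask: u64 = 0x0000_0003_0000_0007;
    let hi = (save_mask >> 32) as u32; // EDX
    let lo = save_mask as u32; // EAX
    assert_eq!((hi, lo), (3, 7));
}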
/// Perform a full or partial restore of the enabled processor states using
@@ -110,11 +106,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
#[cfg_attr(test, assert_instr(xsaveopt))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial save of the enabled processor states to memory
@@ -130,11 +122,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsavec))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
    xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial save of the enabled processor states to memory at
@@ -151,11 +139,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsaves))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
    xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial restore of the enabled processor states using the
@@ -196,9 +180,7 @@ mod tests {
impl XsaveArea {
fn new() -> XsaveArea {
    XsaveArea { data: [0; 2560] }
}
fn ptr(&mut self) -> *mut u8 {
    &mut self.data[0] as *mut _ as *mut u8
@@ -28,8 +28,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
/// Extracts bits of `a` specified by `control` into
/// the least significant bits of the result.
///
/// Bits `[7,0]` of `control` specify the index to the first bit in the range
/// to be extracted, and bits `[15,8]` specify the length of the range.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
#[inline]
@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor64(mem_addr: *const u8) {
#[cfg(test)]
mod tests {
use coresimd::x86_64::*;
use std::{cmp::PartialEq, fmt};
use stdsimd_test::simd_test;
#[repr(align(16))]
@@ -36,11 +36,7 @@ extern "C" {
#[cfg_attr(test, assert_instr(xsave64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
    xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial restore of the enabled processor states using
@@ -73,11 +69,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
#[cfg_attr(test, assert_instr(xsaveopt64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial save of the enabled processor states to memory
@@ -93,11 +85,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsavec64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
    xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial save of the enabled processor states to memory at
@@ -114,11 +102,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
#[cfg_attr(test, assert_instr(xsaves64))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
    xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
}
/// Perform a full or partial restore of the enabled processor states using the
@@ -21,7 +21,7 @@ use proc_macro2::TokenStream;
#[proc_macro_attribute]
pub fn assert_instr(
    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
) -> proc_macro::TokenStream {
    let invoc = syn::parse::<Invoc>(attr)
        .expect("expected #[assert_instr(instr, a = b, ...)]");
@@ -36,9 +36,10 @@ pub fn assert_instr(
let name = &func.ident;
// Disable assert_instr for x86 targets compiled with avx enabled, which
// causes LLVM to generate different instructions than the ones we are
// testing for.
let disable_assert_instr =
    std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
    TokenStream::new()
} else {
@@ -72,11 +73,7 @@ pub fn assert_instr(
syn::Pat::Ident(ref i) => &i.ident,
_ => panic!("must have bare arguments"),
};
match invoc.args.iter().find(|a| *ident == a.0) {
    Some(&(_, ref tts)) => {
        input_vals.push(quote! { #tts });
    }
@@ -87,7 +84,8 @@ pub fn assert_instr(
};
}
let attrs = func
    .attrs
    .iter()
    .filter(|attr| {
        attr.path
@@ -142,9 +140,8 @@ pub fn assert_instr(
}
}.into();
// why? necessary now to get tests to work?
let tts: TokenStream =
    tts.to_string().parse().expect("cannot parse tokenstream");
let tts: TokenStream = quote! {
    #item
@@ -1,8 +1,5 @@
use std::env;
fn main() {
    println!("cargo:rustc-env=TARGET={}", env::var("TARGET").unwrap());
}
@@ -9,29 +9,35 @@
#![cfg_attr(stdsimd_strict, deny(warnings))]
#![allow(dead_code)]
#![allow(unused_features)]
#![feature(
    const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
    asm, proc_macro_gen, integer_atomics, stmt_expr_attributes,
    core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs,
    stdsimd, staged_api, core_float, core_slice_ext, align_offset, doc_cfg,
    mmx_target_feature, tbm_target_feature, sse4a_target_feature,
    arm_target_feature, aarch64_target_feature, mips_target_feature,
    powerpc_target_feature
)]
#![cfg_attr(
    test,
    feature(proc_macro, test, attr_literals, abi_vectorcall, untagged_unions)
)]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(
        inline_always, too_many_arguments, cast_sign_loss, cast_lossless,
        cast_possible_wrap, cast_possible_truncation, cast_precision_loss,
        shadow_reuse, cyclomatic_complexity, similar_names,
        many_single_char_names
    )
)]
#![cfg_attr(test, allow(unused_imports))]
#![no_core]
#![unstable(feature = "stdsimd", issue = "27731")]
#![doc(
    test(attr(deny(warnings))),
    test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
)]
#[cfg_attr(not(test), macro_use)]
extern crate core as _core;
@@ -1,7 +1,9 @@
#![feature(stdsimd)]
#![cfg_attr(stdsimd_strict, deny(warnings))]
#![cfg_attr(
    feature = "cargo-clippy",
    allow(option_unwrap_used, print_stdout, use_debug)
)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[macro_use]
@@ -14,53 +16,20 @@ fn x86_all() {
println!("sse2: {:?}", is_x86_feature_detected!("sse2")); println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
println!("sse3: {:?}", is_x86_feature_detected!("sse3")); println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
println!("ssse3: {:?}", is_x86_feature_detected!("ssse3")); println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
println!( println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
"sse4.1: {:?}", println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
is_x86_feature_detected!("sse4.1")
);
println!(
"sse4.2: {:?}",
is_x86_feature_detected!("sse4.2")
);
println!("sse4a: {:?}", is_x86_feature_detected!("sse4a")); println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
println!("avx: {:?}", is_x86_feature_detected!("avx")); println!("avx: {:?}", is_x86_feature_detected!("avx"));
println!("avx2: {:?}", is_x86_feature_detected!("avx2")); println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
println!( println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
"avx512f {:?}", println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
is_x86_feature_detected!("avx512f") println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
); println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
println!( println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
"avx512cd {:?}", println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
is_x86_feature_detected!("avx512cd") println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
); println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
println!( println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
"avx512er {:?}",
is_x86_feature_detected!("avx512er")
);
println!(
"avx512pf {:?}",
is_x86_feature_detected!("avx512pf")
);
println!(
"avx512bw {:?}",
is_x86_feature_detected!("avx512bw")
);
println!(
"avx512dq {:?}",
is_x86_feature_detected!("avx512dq")
);
println!(
"avx512vl {:?}",
is_x86_feature_detected!("avx512vl")
);
println!(
"avx512_ifma {:?}",
is_x86_feature_detected!("avx512ifma")
);
println!(
"avx512_vbmi {:?}",
is_x86_feature_detected!("avx512vbmi")
);
println!( println!(
"avx512_vpopcntdq {:?}", "avx512_vpopcntdq {:?}",
is_x86_feature_detected!("avx512vpopcntdq") is_x86_feature_detected!("avx512vpopcntdq")
@@ -70,23 +39,11 @@ fn x86_all() {
println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
println!("tbm: {:?}", is_x86_feature_detected!("tbm")); println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
println!( println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
"popcnt: {:?}",
is_x86_feature_detected!("popcnt")
);
println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt")); println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
println!("fxsr: {:?}", is_x86_feature_detected!("fxsr")); println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
println!("xsave: {:?}", is_x86_feature_detected!("xsave")); println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
println!( println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
"xsaveopt: {:?}", println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
is_x86_feature_detected!("xsaveopt") println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
);
println!(
"xsaves: {:?}",
is_x86_feature_detected!("xsaves")
);
println!(
"xsavec: {:?}",
is_x86_feature_detected!("xsavec")
);
} }
View File
@@ -253,11 +253,7 @@ macro_rules! product_nan_test {
             }
         }
         let v = $id::splat(n0);
-        assert!(
-            v.product().is_nan(),
-            "all nans | {:?}",
-            v
-        );
+        assert!(v.product().is_nan(), "all nans | {:?}", v);
     }
     unsafe { test_fn() };
 }
@@ -355,8 +351,7 @@ mod offset {
                 // tolerate 1 ULP difference:
                 if vsum.as_int() > tsum.as_int() {
                     assert!(
-                        vsum.as_int() - tsum.as_int()
-                            < 2,
+                        vsum.as_int() - tsum.as_int() < 2,
                         "v: {:?} | vsum: {} | tsum: {}",
                         v,
                         vsum,
@@ -364,8 +359,7 @@ mod offset {
                     );
                 } else {
                     assert!(
-                        tsum.as_int() - vsum.as_int()
-                            < 2,
+                        tsum.as_int() - vsum.as_int() < 2,
                         "v: {:?} | vsum: {} | tsum: {}",
                         v,
                         vsum,
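Context for the two hunks above: the test tolerates a 1-ULP difference by comparing the integer (bit) representations of the two sums. A self-contained sketch of the same idea, assuming the test's `as_int` helper is equivalent to `f64::to_bits`:

    // For finite floats of the same sign, adjacent representable values
    // have adjacent bit patterns, so a bit-distance below 2 means the
    // values are at most 1 ULP apart.
    fn within_one_ulp(a: f64, b: f64) -> bool {
        let (ai, bi) = (a.to_bits() as i64, b.to_bits() as i64);
        (ai - bi).abs() < 2
    }

    fn main() {
        assert!(within_one_ulp(0.1 + 0.2, 0.3)); // differ by exactly 1 ULP
    }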
View File
@@ -12,7 +12,7 @@ extern crate quote;
 use std::env;

-use proc_macro2::{Literal, Span, Ident, TokenStream, TokenTree};
+use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};

 fn string(s: &str) -> TokenTree {
     Literal::string(s).into()
@@ -20,11 +20,9 @@ fn string(s: &str) -> TokenTree {
 #[proc_macro_attribute]
 pub fn simd_test(
-    attr: proc_macro::TokenStream, item: proc_macro::TokenStream
+    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
 ) -> proc_macro::TokenStream {
-    let tokens = TokenStream::from(attr)
-        .into_iter()
-        .collect::<Vec<_>>();
+    let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
     if tokens.len() != 3 {
         panic!("expected #[simd_test(enable = \"feature\")]");
     }
@@ -53,18 +51,19 @@ pub fn simd_test(
     let item = TokenStream::from(item);
     let name = find_name(item.clone());

-    let name: TokenStream = name.to_string().parse().expect(&format!(
-        "failed to parse name: {}",
-        name.to_string()
-    ));
+    let name: TokenStream = name
+        .to_string()
+        .parse()
+        .expect(&format!("failed to parse name: {}", name.to_string()));

     let target = env::var("TARGET")
         .expect("TARGET environment variable should be set for rustc");
     let mut force_test = false;
-    let macro_test = match target.split('-').next().expect(&format!(
-        "target triple contained no \"-\": {}",
-        target
-    )) {
+    let macro_test = match target
+        .split('-')
+        .next()
+        .expect(&format!("target triple contained no \"-\": {}", target))
+    {
         "i686" | "x86_64" | "i586" => "is_x86_feature_detected",
         "arm" | "armv7" => "is_arm_feature_detected",
         "aarch64" => "is_aarch64_feature_detected",
View File
@@ -5,8 +5,10 @@
 //! assertions about the disassembly of a function.

 #![feature(proc_macro)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(missing_docs_in_private_items, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(missing_docs_in_private_items, print_stdout)
+)]

 extern crate assert_instr_macro;
 extern crate backtrace;
@@ -25,7 +27,8 @@ pub use assert_instr_macro::*;
 pub use simd_test_macro::*;

 lazy_static! {
-    static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
+    static ref DISASSEMBLY: HashMap<String, Vec<Function>> =
+        disassemble_myself();
 }

 struct Function {
@@ -39,14 +42,16 @@ struct Instruction {
 fn disassemble_myself() -> HashMap<String, Vec<Function>> {
     let me = env::current_exe().expect("failed to get current exe");

-    if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows")
+    if cfg!(target_arch = "x86_64")
+        && cfg!(target_os = "windows")
         && cfg!(target_env = "msvc")
     {
         let mut cmd = cc::windows_registry::find(
             "x86_64-pc-windows-msvc",
             "dumpbin.exe",
         ).expect("failed to find `dumpbin` tool");
-        let output = cmd.arg("/DISASM")
+        let output = cmd
+            .arg("/DISASM")
             .arg(&me)
             .output()
             .expect("failed to execute dumpbin");
@@ -257,9 +262,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
     // in the disassembly.
     let mut sym = None;
     backtrace::resolve(fnptr as *mut _, |name| {
-        sym = name.name()
-            .and_then(|s| s.as_str())
-            .map(normalize);
+        sym = name.name().and_then(|s| s.as_str()).map(normalize);
     });

     let functions =
@@ -270,26 +273,17 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
println!("assumed symbol name: `{}`", sym); println!("assumed symbol name: `{}`", sym);
} }
println!("maybe related functions"); println!("maybe related functions");
for f in DISASSEMBLY for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) {
.keys()
.filter(|k| k.contains(fnname))
{
println!("\t- {}", f); println!("\t- {}", f);
} }
panic!( panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
"failed to find disassembly of {:#x} ({})",
fnptr, fnname
);
}; };
assert_eq!(functions.len(), 1); assert_eq!(functions.len(), 1);
let function = &functions[0]; let function = &functions[0];
let mut instrs = &function.instrs[..]; let mut instrs = &function.instrs[..];
while instrs while instrs.last().map_or(false, |s| s.parts == ["nop"]) {
.last()
.map_or(false, |s| s.parts == ["nop"])
{
instrs = &instrs[..instrs.len() - 1]; instrs = &instrs[..instrs.len() - 1];
} }
@@ -400,10 +394,7 @@ pub fn assert_skip_test_ok(name: &str) {
     if env::var("STDSIMD_TEST_EVERYTHING").is_err() {
         return;
     }
-    panic!(
-        "skipped test `{}` when it shouldn't be skipped",
-        name
-    );
+    panic!("skipped test `{}` when it shouldn't be skipped", name);
 }

 // See comment in `assert-instr-macro` crate for why this exists
View File
@@ -1,9 +1,12 @@
 #![feature(proc_macro)]
 #![allow(bad_style)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(shadow_reuse, cast_lossless, match_same_arms,
-                  nonminimal_bool, print_stdout, use_debug, eq_op,
-                  useless_format))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        shadow_reuse, cast_lossless, match_same_arms, nonminimal_bool,
+        print_stdout, use_debug, eq_op, useless_format
+    )
+)]

 #[macro_use]
 extern crate serde_derive;
@@ -249,10 +252,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
                 .flat_map(|c| c.to_lowercase())
                 .collect::<String>();

-            let rust_feature = rust.target_feature.expect(&format!(
-                "no target feature listed for {}",
-                rust.name
-            ));
+            let rust_feature = rust
+                .target_feature
+                .expect(&format!("no target feature listed for {}", rust.name));

             if rust_feature.contains(&cpuid) {
                 continue;
             }
@@ -314,24 +316,19 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
         if rust.arguments.len() != intel.parameters.len() {
             bail!("wrong number of arguments on {}", rust.name)
         }
-        for (i, (a, b)) in intel
-            .parameters
-            .iter()
-            .zip(rust.arguments)
-            .enumerate()
-        {
+        for (i, (a, b)) in
+            intel.parameters.iter().zip(rust.arguments).enumerate()
+        {
             let is_const = rust.required_const.contains(&i);
             equate(b, &a.type_, &intel.name, is_const)?;
         }
     }

-    let any_i64 = rust.arguments
-        .iter()
-        .cloned()
-        .chain(rust.ret)
-        .any(|arg| match *arg {
-            Type::PrimSigned(64) | Type::PrimUnsigned(64) => true,
-            _ => false,
-        });
+    let any_i64 = rust.arguments.iter().cloned().chain(rust.ret).any(|arg| {
+        match *arg {
+            Type::PrimSigned(64) | Type::PrimUnsigned(64) => true,
+            _ => false,
+        }
+    });

     let any_i64_exempt = match rust.name {
         // These intrinsics have all been manually verified against Clang's
@@ -363,7 +360,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
 }

 fn equate(
-    t: &Type, intel: &str, intrinsic: &str, is_const: bool
+    t: &Type, intel: &str, intrinsic: &str, is_const: bool,
 ) -> Result<(), String> {
     let intel = intel.replace(" *", "*");
     let intel = intel.replace(" const*", "*");
@@ -371,9 +368,7 @@ fn equate(
         if is_const {
             return Ok(());
         }
-        Err(format!(
-            "argument required to be const but isn't"
-        ))
+        Err(format!("argument required to be const but isn't"))
     };
     match (t, &intel[..]) {
         (&Type::PrimFloat(32), "float") => {}
View File
@@ -1,11 +1,20 @@
 #![feature(stdsimd)]
 #![cfg_attr(stdsimd_strict, deny(warnings))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(option_unwrap_used, use_debug, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(option_unwrap_used, use_debug, print_stdout)
+)]

-#[cfg(any(target_arch = "arm", target_arch = "aarch64",
-          target_arch = "x86", target_arch = "x86_64",
-          target_arch = "powerpc", target_arch = "powerpc64"))]
+#[cfg(
+    any(
+        target_arch = "arm",
+        target_arch = "aarch64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    )
+)]
 #[macro_use]
 extern crate stdsimd;
@@ -22,51 +31,30 @@ fn aarch64_linux() {
println!("fp: {}", is_aarch64_feature_detected!("fp")); println!("fp: {}", is_aarch64_feature_detected!("fp"));
println!("fp16: {}", is_aarch64_feature_detected!("fp16")); println!("fp16: {}", is_aarch64_feature_detected!("fp16"));
println!("neon: {}", is_aarch64_feature_detected!("neon")); println!("neon: {}", is_aarch64_feature_detected!("neon"));
println!( println!("asimd: {}", is_aarch64_feature_detected!("asimd"));
"asimd: {}",
is_aarch64_feature_detected!("asimd")
);
println!("sve: {}", is_aarch64_feature_detected!("sve")); println!("sve: {}", is_aarch64_feature_detected!("sve"));
println!("crc: {}", is_aarch64_feature_detected!("crc")); println!("crc: {}", is_aarch64_feature_detected!("crc"));
println!( println!("crypto: {}", is_aarch64_feature_detected!("crypto"));
"crypto: {}",
is_aarch64_feature_detected!("crypto")
);
println!("lse: {}", is_aarch64_feature_detected!("lse")); println!("lse: {}", is_aarch64_feature_detected!("lse"));
println!("rdm: {}", is_aarch64_feature_detected!("rdm")); println!("rdm: {}", is_aarch64_feature_detected!("rdm"));
println!("rcpc: {}", is_aarch64_feature_detected!("rcpc")); println!("rcpc: {}", is_aarch64_feature_detected!("rcpc"));
println!( println!("dotprod: {}", is_aarch64_feature_detected!("dotprod"));
"dotprod: {}",
is_aarch64_feature_detected!("dotprod")
);
} }
#[test] #[test]
#[cfg(all(target_arch = "powerpc", target_os = "linux"))] #[cfg(all(target_arch = "powerpc", target_os = "linux"))]
fn powerpc_linux() { fn powerpc_linux() {
println!( println!("altivec: {}", is_powerpc_feature_detected!("altivec"));
"altivec: {}",
is_powerpc_feature_detected!("altivec")
);
println!("vsx: {}", is_powerpc_feature_detected!("vsx")); println!("vsx: {}", is_powerpc_feature_detected!("vsx"));
println!( println!("power8: {}", is_powerpc_feature_detected!("power8"));
"power8: {}",
is_powerpc_feature_detected!("power8")
);
} }
#[test] #[test]
#[cfg(all(target_arch = "powerpc64", target_os = "linux"))] #[cfg(all(target_arch = "powerpc64", target_os = "linux"))]
fn powerpc64_linux() { fn powerpc64_linux() {
println!( println!("altivec: {}", is_powerpc64_feature_detected!("altivec"));
"altivec: {}",
is_powerpc64_feature_detected!("altivec")
);
println!("vsx: {}", is_powerpc64_feature_detected!("vsx")); println!("vsx: {}", is_powerpc64_feature_detected!("vsx"));
println!( println!("power8: {}", is_powerpc64_feature_detected!("power8"));
"power8: {}",
is_powerpc64_feature_detected!("power8")
);
} }
#[test] #[test]
@@ -76,54 +64,21 @@ fn x86_all() {
println!("sse2: {:?}", is_x86_feature_detected!("sse2")); println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
println!("sse3: {:?}", is_x86_feature_detected!("sse3")); println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
println!("ssse3: {:?}", is_x86_feature_detected!("ssse3")); println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
println!( println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
"sse4.1: {:?}", println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
is_x86_feature_detected!("sse4.1")
);
println!(
"sse4.2: {:?}",
is_x86_feature_detected!("sse4.2")
);
println!("sse4a: {:?}", is_x86_feature_detected!("sse4a")); println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
println!("sha: {:?}", is_x86_feature_detected!("sha")); println!("sha: {:?}", is_x86_feature_detected!("sha"));
println!("avx: {:?}", is_x86_feature_detected!("avx")); println!("avx: {:?}", is_x86_feature_detected!("avx"));
println!("avx2: {:?}", is_x86_feature_detected!("avx2")); println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
println!( println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
"avx512f {:?}", println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
is_x86_feature_detected!("avx512f") println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
); println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
println!( println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
"avx512cd {:?}", println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
is_x86_feature_detected!("avx512cd") println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
); println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
println!( println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
"avx512er {:?}",
is_x86_feature_detected!("avx512er")
);
println!(
"avx512pf {:?}",
is_x86_feature_detected!("avx512pf")
);
println!(
"avx512bw {:?}",
is_x86_feature_detected!("avx512bw")
);
println!(
"avx512dq {:?}",
is_x86_feature_detected!("avx512dq")
);
println!(
"avx512vl {:?}",
is_x86_feature_detected!("avx512vl")
);
println!(
"avx512_ifma {:?}",
is_x86_feature_detected!("avx512ifma")
);
println!(
"avx512_vbmi {:?}",
is_x86_feature_detected!("avx512vbmi")
);
println!( println!(
"avx512_vpopcntdq {:?}", "avx512_vpopcntdq {:?}",
is_x86_feature_detected!("avx512vpopcntdq") is_x86_feature_detected!("avx512vpopcntdq")
@@ -133,23 +88,11 @@ fn x86_all() {
println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
println!("tbm: {:?}", is_x86_feature_detected!("tbm")); println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
println!( println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
"popcnt: {:?}",
is_x86_feature_detected!("popcnt")
);
println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt")); println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
println!("fxsr: {:?}", is_x86_feature_detected!("fxsr")); println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
println!("xsave: {:?}", is_x86_feature_detected!("xsave")); println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
println!( println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
"xsaveopt: {:?}", println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
is_x86_feature_detected!("xsaveopt") println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
);
println!(
"xsaves: {:?}",
is_x86_feature_detected!("xsaves")
);
println!(
"xsavec: {:?}",
is_x86_feature_detected!("xsavec")
);
} }
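These tests only print detection results; for context, a minimal sketch of how the same macro supports run-time dispatch in user code (illustrative only, assuming the macro is in scope; not from this commit):

    fn add_one(xs: &mut [f32]) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("avx2") {
                // a #[target_feature(enable = "avx2")] implementation
                // would be selected here
            }
        }
        // scalar fallback
        for x in xs.iter_mut() {
            *x += 1.0;
        }
    }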
View File
@@ -14,10 +14,13 @@
 #![feature(stdsimd)]
 #![cfg_attr(test, feature(test))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(result_unwrap_used, print_stdout, option_unwrap_used,
-                  shadow_reuse, cast_possible_wrap, cast_sign_loss,
-                  missing_docs_in_private_items))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        result_unwrap_used, print_stdout, option_unwrap_used, shadow_reuse,
+        cast_possible_wrap, cast_sign_loss, missing_docs_in_private_items
+    )
+)]

 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[macro_use]
@@ -68,7 +71,7 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
 #[target_feature(enable = "avx2")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 unsafe fn hex_encode_avx2<'a>(
-    mut src: &[u8], dst: &'a mut [u8]
+    mut src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     let ascii_zero = _mm256_set1_epi8(b'0' as i8);
     let nines = _mm256_set1_epi8(9);
@@ -115,16 +118,14 @@ unsafe fn hex_encode_avx2<'a>(
     let i = i as usize;
     let _ = hex_encode_sse41(src, &mut dst[i * 2..]);

-    Ok(str::from_utf8_unchecked(
-        &dst[..src.len() * 2 + i * 2],
-    ))
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
 }

 // copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
 #[target_feature(enable = "sse4.1")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 unsafe fn hex_encode_sse41<'a>(
-    mut src: &[u8], dst: &'a mut [u8]
+    mut src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     let ascii_zero = _mm_set1_epi8(b'0' as i8);
     let nines = _mm_set1_epi8(9);
@@ -157,10 +158,7 @@ unsafe fn hex_encode_sse41<'a>(
         let res2 = _mm_unpackhi_epi8(masked2, masked1);

         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
-        _mm_storeu_si128(
-            dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
-            res2,
-        );
+        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
         src = &src[16..];
         i += 16;
     }
@@ -168,13 +166,11 @@ unsafe fn hex_encode_sse41<'a>(
     let i = i as usize;
     let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

-    Ok(str::from_utf8_unchecked(
-        &dst[..src.len() * 2 + i * 2],
-    ))
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
 }

 fn hex_encode_fallback<'a>(
-    src: &[u8], dst: &'a mut [u8]
+    src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     fn hex(byte: u8) -> u8 {
         static TABLE: &[u8] = b"0123456789abcdef";
@@ -199,10 +195,7 @@ mod tests {
     fn test(input: &[u8], output: &str) {
         let tmp = || vec![0; input.len() * 2];

-        assert_eq!(
-            hex_encode_fallback(input, &mut tmp()).unwrap(),
-            output
-        );
+        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
         assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);

         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -239,9 +232,7 @@ mod tests {
     fn odd() {
         test(
             &[0; 313],
-            &iter::repeat('0')
-                .take(313 * 2)
-                .collect::<String>(),
+            &iter::repeat('0').take(313 * 2).collect::<String>(),
         );
     }
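For orientation, the example's entry point encodes `src` into `dst` and returns the filled prefix as `&str`; the `Err(usize)` presumably reports a size problem when `dst` is too small. A hypothetical caller under those assumptions:

    let src = [0xde, 0xad, 0xbe, 0xef];
    let mut dst = [0u8; 8]; // two output bytes per input byte
    match hex_encode(&src, &mut dst) {
        Ok(s) => assert_eq!(s, "deadbeef"),
        Err(n) => panic!("destination buffer too small: {}", n),
    }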
View File
@@ -5,9 +5,13 @@
 #![cfg_attr(stdsimd_strict, deny(warnings))]
 #![feature(stdsimd)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(similar_names, missing_docs_in_private_items,
-                  shadow_reuse, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        similar_names, missing_docs_in_private_items, shadow_reuse,
+        print_stdout
+    )
+)]

 extern crate stdsimd;
 #[macro_use]
@@ -15,8 +19,6 @@ extern crate cfg_if;
 use stdsimd::simd::*;

 const PI: f64 = std::f64::consts::PI;
 const SOLAR_MASS: f64 = 4.0 * PI * PI;
 const DAYS_PER_YEAR: f64 = 365.24;
@@ -81,7 +83,7 @@ struct Body {
 impl Body {
     fn new(
-        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
+        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64,
     ) -> Self {
         Self {
             x: [x0, x1, x2],
View File
@@ -64,7 +64,8 @@ macro_rules! is_aarch64_feature_detected {
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_powerpc_feature_detected {
     ($t:tt) => {
-        compile_error!(r#"
+        compile_error!(
+            r#"
 is_powerpc_feature_detected can only be used on PowerPC targets.
 You can prevent it from being used in other architectures by
 guarding it behind a cfg(target_arch) as follows:
@@ -72,7 +73,8 @@ guarding it behind a cfg(target_arch) as follows:
#[cfg(target_arch = "powerpc")] { #[cfg(target_arch = "powerpc")] {
if is_powerpc_feature_detected(...) { ... } if is_powerpc_feature_detected(...) { ... }
} }
"#) "#
)
}; };
} }
@@ -81,7 +83,8 @@ guarding it behind a cfg(target_arch) as follows:
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_powerpc64_feature_detected {
     ($t:tt) => {
-        compile_error!(r#"
+        compile_error!(
+            r#"
 is_powerpc64_feature_detected can only be used on PowerPC64 targets.
 You can prevent it from being used in other architectures by
 guarding it behind a cfg(target_arch) as follows:
@@ -89,7 +92,8 @@ guarding it behind a cfg(target_arch) as follows:
#[cfg(target_arch = "powerpc64")] { #[cfg(target_arch = "powerpc64")] {
if is_powerpc64_feature_detected(...) { ... } if is_powerpc64_feature_detected(...) { ... }
} }
"#) "#
)
}; };
} }
View File
@@ -60,8 +60,8 @@ cfg_if! {
 }
 pub use self::arch::Feature;

-mod cache;
 mod bit;
+mod cache;

 cfg_if! {
     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {