Update codegen for simd wasm intrinsics with LLVM 13 (#1203)

Author: Alex Crichton
Date: 2021-08-22 19:45:59 -05:00
Committed by: GitHub
Parent: 870cf5751d
Commit: b10d00cae0


@@ -78,27 +78,6 @@ conversions! {
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.wasm.load32.zero"]
fn llvm_load32_zero(x: *const u32) -> simd::i32x4;
#[link_name = "llvm.wasm.load64.zero"]
fn llvm_load64_zero(x: *const u64) -> simd::i64x2;
#[link_name = "llvm.wasm.load8.lane"]
fn llvm_load8_lane(x: *const u8, v: simd::u8x16, l: usize) -> simd::u8x16;
#[link_name = "llvm.wasm.load16.lane"]
fn llvm_load16_lane(x: *const u16, v: simd::u16x8, l: usize) -> simd::u16x8;
#[link_name = "llvm.wasm.load32.lane"]
fn llvm_load32_lane(x: *const u32, v: simd::u32x4, l: usize) -> simd::u32x4;
#[link_name = "llvm.wasm.load64.lane"]
fn llvm_load64_lane(x: *const u64, v: simd::u64x2, l: usize) -> simd::u64x2;
#[link_name = "llvm.wasm.store8.lane"]
fn llvm_store8_lane(x: *mut u8, v: simd::u8x16, l: usize);
#[link_name = "llvm.wasm.store16.lane"]
fn llvm_store16_lane(x: *mut u16, v: simd::u16x8, l: usize);
#[link_name = "llvm.wasm.store32.lane"]
fn llvm_store32_lane(x: *mut u32, v: simd::u32x4, l: usize);
#[link_name = "llvm.wasm.store64.lane"]
fn llvm_store64_lane(x: *mut u64, v: simd::u64x2, l: usize);
#[link_name = "llvm.wasm.swizzle"]
fn llvm_swizzle(a: simd::i8x16, b: simd::i8x16) -> simd::i8x16;
@@ -109,7 +88,7 @@ extern "C" {
#[link_name = "llvm.wasm.alltrue.v16i8"]
fn llvm_i8x16_all_true(x: simd::i8x16) -> i32;
#[link_name = "llvm.wasm.popcnt"]
#[link_name = "llvm.ctpop.v16i8"]
fn llvm_popcnt(a: simd::i8x16) -> simd::i8x16;
#[link_name = "llvm.wasm.bitmask.v16i8"]
fn llvm_bitmask_i8x16(a: simd::i8x16) -> i32;
@@ -152,14 +131,6 @@ extern "C" {
fn llvm_i16x8_sub_sat_u(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
#[link_name = "llvm.wasm.avgr.unsigned.v8i16"]
fn llvm_avgr_u_i16x8(a: simd::i16x8, b: simd::i16x8) -> simd::i16x8;
#[link_name = "llvm.wasm.extmul.low.signed.v8i16"]
fn llvm_i16x8_extmul_low_i8x16_s(a: simd::i8x16, b: simd::i8x16) -> simd::i16x8;
#[link_name = "llvm.wasm.extmul.high.signed.v8i16"]
fn llvm_i16x8_extmul_high_i8x16_s(a: simd::i8x16, b: simd::i8x16) -> simd::i16x8;
#[link_name = "llvm.wasm.extmul.low.unsigned.v8i16"]
fn llvm_i16x8_extmul_low_i8x16_u(a: simd::i8x16, b: simd::i8x16) -> simd::i16x8;
#[link_name = "llvm.wasm.extmul.high.unsigned.v8i16"]
fn llvm_i16x8_extmul_high_i8x16_u(a: simd::i8x16, b: simd::i8x16) -> simd::i16x8;
#[link_name = "llvm.wasm.extadd.pairwise.signed.v16i8"]
fn llvm_i32x4_extadd_pairwise_i16x8_s(x: simd::i16x8) -> simd::i32x4;
@@ -171,27 +142,11 @@ extern "C" {
fn llvm_bitmask_i32x4(a: simd::i32x4) -> i32;
#[link_name = "llvm.wasm.dot"]
fn llvm_i32x4_dot_i16x8_s(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
#[link_name = "llvm.wasm.extmul.low.signed.v4i32"]
fn llvm_i32x4_extmul_low_i16x8_s(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
#[link_name = "llvm.wasm.extmul.high.signed.v4i32"]
fn llvm_i32x4_extmul_high_i16x8_s(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
#[link_name = "llvm.wasm.extmul.low.unsigned.v4i32"]
fn llvm_i32x4_extmul_low_i16x8_u(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
#[link_name = "llvm.wasm.extmul.high.unsigned.v4i32"]
fn llvm_i32x4_extmul_high_i16x8_u(a: simd::i16x8, b: simd::i16x8) -> simd::i32x4;
#[link_name = "llvm.wasm.alltrue.v2i64"]
fn llvm_i64x2_all_true(x: simd::i64x2) -> i32;
#[link_name = "llvm.wasm.bitmask.v2i64"]
fn llvm_bitmask_i64x2(a: simd::i64x2) -> i32;
#[link_name = "llvm.wasm.extmul.low.signed.v2i64"]
fn llvm_i64x2_extmul_low_i32x4_s(a: simd::i32x4, b: simd::i32x4) -> simd::i64x2;
#[link_name = "llvm.wasm.extmul.high.signed.v2i64"]
fn llvm_i64x2_extmul_high_i32x4_s(a: simd::i32x4, b: simd::i32x4) -> simd::i64x2;
#[link_name = "llvm.wasm.extmul.low.unsigned.v2i64"]
fn llvm_i64x2_extmul_low_i32x4_u(a: simd::i32x4, b: simd::i32x4) -> simd::i64x2;
#[link_name = "llvm.wasm.extmul.high.unsigned.v2i64"]
fn llvm_i64x2_extmul_high_i32x4_u(a: simd::i32x4, b: simd::i32x4) -> simd::i64x2;
#[link_name = "llvm.ceil.v4f32"]
fn llvm_f32x4_ceil(x: simd::f32x4) -> simd::f32x4;
@@ -209,10 +164,6 @@ extern "C" {
fn llvm_f32x4_min(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
#[link_name = "llvm.maximum.v4f32"]
fn llvm_f32x4_max(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
#[link_name = "llvm.wasm.pmin.v4f32"]
fn llvm_f32x4_pmin(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
#[link_name = "llvm.wasm.pmax.v4f32"]
fn llvm_f32x4_pmax(x: simd::f32x4, y: simd::f32x4) -> simd::f32x4;
#[link_name = "llvm.ceil.v2f64"]
fn llvm_f64x2_ceil(x: simd::f64x2) -> simd::f64x2;
@@ -230,10 +181,6 @@ extern "C" {
fn llvm_f64x2_min(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
#[link_name = "llvm.maximum.v2f64"]
fn llvm_f64x2_max(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
#[link_name = "llvm.wasm.pmin.v2f64"]
fn llvm_f64x2_pmin(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
#[link_name = "llvm.wasm.pmax.v2f64"]
fn llvm_f64x2_pmax(x: simd::f64x2, y: simd::f64x2) -> simd::f64x2;
#[link_name = "llvm.fptosi.sat.v4i32.v4f32"]
fn llvm_i32x4_trunc_sat_f32x4_s(x: simd::f32x4) -> simd::i32x4;
@@ -243,10 +190,6 @@ extern "C" {
fn llvm_i32x2_trunc_sat_f64x2_s(x: simd::f64x2) -> simd::i32x2;
#[link_name = "llvm.fptoui.sat.v2i32.v2f64"]
fn llvm_i32x2_trunc_sat_f64x2_u(x: simd::f64x2) -> simd::i32x2;
#[link_name = "llvm.wasm.demote.zero"]
fn llvm_f32x4_demote_f64x2_zero(x: simd::f64x2) -> simd::f32x4;
#[link_name = "llvm.wasm.promote.low"]
fn llvm_f64x2_promote_low_f32x4(x: simd::f32x4) -> simd::f64x2;
}
#[repr(packed)]
@@ -425,7 +368,7 @@ pub use i64x2_load_extend_u32x2 as u64x2_load_extend_u32x2;
#[doc(alias("v128.load8_splat"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load8_splat(m: *const u8) -> v128 {
simd::u8x16::splat(*m).v128()
u8x16_splat(*m)
}
/// Load a single element and splat to all lanes of a v128 vector.
@@ -446,8 +389,7 @@ pub unsafe fn v128_load8_splat(m: *const u8) -> v128 {
#[doc(alias("v128.load16_splat"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load16_splat(m: *const u16) -> v128 {
let m = ptr::read_unaligned(m);
simd::u16x8::splat(m).v128()
u16x8_splat(ptr::read_unaligned(m))
}
/// Load a single element and splat to all lanes of a v128 vector.
@@ -468,8 +410,7 @@ pub unsafe fn v128_load16_splat(m: *const u16) -> v128 {
#[doc(alias("v128.load32_splat"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load32_splat(m: *const u32) -> v128 {
let m = ptr::read_unaligned(m);
simd::u32x4::splat(m).v128()
u32x4_splat(ptr::read_unaligned(m))
}
/// Load a single element and splat to all lanes of a v128 vector.
@@ -490,18 +431,14 @@ pub unsafe fn v128_load32_splat(m: *const u32) -> v128 {
#[doc(alias("v128.load64_splat"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load64_splat(m: *const u64) -> v128 {
let m = ptr::read_unaligned(m);
simd::u64x2::splat(m).v128()
u64x2_splat(ptr::read_unaligned(m))
}
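The four splat loads above now route through the plain `*_splat` constructors on an unaligned read instead of target-specific LLVM intrinsics. A minimal sketch of that equivalence, assuming a wasm32 target built with `-C target-feature=+simd128`; the helper name is illustrative, everything else is the stable `std::arch::wasm32` API from this file:

#[cfg(target_arch = "wasm32")]
fn splat_load_equivalence() {
    use std::arch::wasm32::*;

    let data: u32 = 0x1122_3344;

    unsafe {
        // v128.load32_splat reads one (possibly unaligned) u32 and copies it
        // into every lane, just like u32x4_splat on the read value.
        let a = v128_load32_splat(&data);
        let b = u32x4_splat(std::ptr::read_unaligned(&data));
        assert_eq!(u32x4_extract_lane::<0>(a), 0x1122_3344);
        assert_eq!(u32x4_extract_lane::<3>(a), u32x4_extract_lane::<3>(b));
    }
}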
/// Load a 32-bit element into the low bits of the vector and set all other
/// bits to zero.
///
/// This intrinsic is provided for completeness and is equivalent to `u32x4(*m,
/// 0, 0, 0)` (which doesn't require `unsafe`). Note, though, that at the time
/// of this writing this equivalent pattern does not optimize to the same
/// WebAssembly instruction that this function generates. This will be fixed in
/// the LLVM 13 release.
/// 0, 0, 0)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -515,17 +452,14 @@ pub unsafe fn v128_load64_splat(m: *const u64) -> v128 {
#[doc(alias("v128.load32_zero"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load32_zero(m: *const u32) -> v128 {
llvm_load32_zero(m).v128()
u32x4(ptr::read_unaligned(m), 0, 0, 0)
}
/// Load a 64-bit element into the low bits of the vector and set all other
/// bits to zero.
///
/// This intrinsic is provided for completeness and is equivalent to `u64x2(*m,
/// 0)` (which doesn't require `unsafe`). Note, though, that at the time
/// of this writing this equivalent pattern does not optimize to the same
/// WebAssembly instruction that this function generates. This will be fixed in
/// the LLVM 13 release.
/// This intrinsic is provided for completeness and is equivalent to
/// `u64x2_replace_lane::<0>(u64x2(0, 0), *m)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -539,7 +473,7 @@ pub unsafe fn v128_load32_zero(m: *const u32) -> v128 {
#[doc(alias("v128.load64_zero"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load64_zero(m: *const u64) -> v128 {
llvm_load64_zero(m).v128()
u64x2_replace_lane::<0>(u64x2(0, 0), ptr::read_unaligned(m))
}
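The load-zero intrinsics likewise open-code the documented equivalents now. A small sketch of their lane behaviour under the same wasm32/simd128 assumptions (the helper name is made up for illustration):

#[cfg(target_arch = "wasm32")]
fn load_zero_equivalence() {
    use std::arch::wasm32::*;

    let word: u32 = 7;
    let dword: u64 = 9;

    unsafe {
        // v128.load32_zero: low lane from memory, remaining lanes zeroed.
        let a = v128_load32_zero(&word);
        assert_eq!(u32x4_extract_lane::<0>(a), 7);
        assert_eq!(u32x4_extract_lane::<1>(a), 0);

        // v128.load64_zero: same idea with two 64-bit lanes.
        let b = v128_load64_zero(&dword);
        assert_eq!(u64x2_extract_lane::<0>(b), 9);
        assert_eq!(u64x2_extract_lane::<1>(b), 0);
    }
}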
/// Stores a `v128` vector to the given heap address.
@@ -576,10 +510,7 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) {
/// Loads an 8-bit value from `m` and sets lane `L` of `v` to that value.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `u8x16_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `u8x16_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -593,17 +524,13 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) {
#[doc(alias("v128.load8_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load8_lane<const L: usize>(v: v128, m: *const u8) -> v128 {
static_assert!(L: usize where L < 16);
llvm_load8_lane(m, v.as_u8x16(), L).v128()
u8x16_replace_lane::<L>(v, *m)
}
/// Loads a 16-bit value from `m` and sets lane `L` of `v` to that value.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `u16x8_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `u16x8_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -617,17 +544,13 @@ pub unsafe fn v128_load8_lane<const L: usize>(v: v128, m: *const u8) -> v128 {
#[doc(alias("v128.load16_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load16_lane<const L: usize>(v: v128, m: *const u16) -> v128 {
static_assert!(L: usize where L < 8);
llvm_load16_lane(m, v.as_u16x8(), L).v128()
u16x8_replace_lane::<L>(v, ptr::read_unaligned(m))
}
/// Loads a 32-bit value from `m` and sets lane `L` of `v` to that value.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `u32x4_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `u32x4_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -641,17 +564,13 @@ pub unsafe fn v128_load16_lane<const L: usize>(v: v128, m: *const u16) -> v128 {
#[doc(alias("v128.load32_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load32_lane<const L: usize>(v: v128, m: *const u32) -> v128 {
static_assert!(L: usize where L < 4);
llvm_load32_lane(m, v.as_u32x4(), L).v128()
u32x4_replace_lane::<L>(v, ptr::read_unaligned(m))
}
/// Loads a 64-bit value from `m` and sets lane `L` of `v` to that value.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `u64x2_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `u64x2_replace_lane::<L>(v, *m)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -665,17 +584,13 @@ pub unsafe fn v128_load32_lane<const L: usize>(v: v128, m: *const u32) -> v128 {
#[doc(alias("v128.load64_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_load64_lane<const L: usize>(v: v128, m: *const u64) -> v128 {
static_assert!(L: usize where L < 2);
llvm_load64_lane(m, v.as_u64x2(), L).v128()
u64x2_replace_lane::<L>(v, ptr::read_unaligned(m))
}
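The load-lane intrinsics are now plain `*_replace_lane` calls on an unaligned read. A usage sketch under the same assumptions, with an illustrative helper name:

#[cfg(target_arch = "wasm32")]
fn load_lane_example() {
    use std::arch::wasm32::*;

    let v = u16x8(0, 1, 2, 3, 4, 5, 6, 7);
    let replacement: u16 = 0xbeef;

    unsafe {
        // v128.load16_lane: read one 16-bit value and drop it into lane 3,
        // leaving every other lane of `v` untouched.
        let w = v128_load16_lane::<3>(v, &replacement);
        assert_eq!(u16x8_extract_lane::<3>(w), 0xbeef);
        assert_eq!(u16x8_extract_lane::<2>(w), 2);
    }
}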
/// Stores the 8-bit value from lane `L` of `v` into `m`.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `*m = u8x16_extract_lane::<L>(v)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `*m = u8x16_extract_lane::<L>(v)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -689,17 +604,13 @@ pub unsafe fn v128_load64_lane<const L: usize>(v: v128, m: *const u64) -> v128 {
#[doc(alias("v128.store8_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_store8_lane<const L: usize>(v: v128, m: *mut u8) {
static_assert!(L: usize where L < 16);
llvm_store8_lane(m, v.as_u8x16(), L);
*m = u8x16_extract_lane::<L>(v);
}
/// Stores the 16-bit value from lane `L` of `v` into `m`.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `*m = u16x8_extract_lane::<L>(v)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `*m = u16x8_extract_lane::<L>(v)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -713,17 +624,13 @@ pub unsafe fn v128_store8_lane<const L: usize>(v: v128, m: *mut u8) {
#[doc(alias("v128.store16_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_store16_lane<const L: usize>(v: v128, m: *mut u16) {
static_assert!(L: usize where L < 8);
llvm_store16_lane(m, v.as_u16x8(), L)
ptr::write_unaligned(m, u16x8_extract_lane::<L>(v))
}
/// Stores the 32-bit value from lane `L` of `v` into `m`.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `*m = u32x4_extract_lane::<L>(v)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `*m = u32x4_extract_lane::<L>(v)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -737,17 +644,13 @@ pub unsafe fn v128_store16_lane<const L: usize>(v: v128, m: *mut u16) {
#[doc(alias("v128.store32_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_store32_lane<const L: usize>(v: v128, m: *mut u32) {
static_assert!(L: usize where L < 4);
llvm_store32_lane(m, v.as_u32x4(), L)
ptr::write_unaligned(m, u32x4_extract_lane::<L>(v))
}
/// Stores the 64-bit value from lane `L` of `v` into `m`.
///
/// This intrinsic is provided for completeness and is equivalent to
/// `*m = u64x2_extract_lane::<L>(v)` (which doesn't require `unsafe`). Note,
/// though, that at the time of this writing this equivalent pattern does not
/// optimize to the same WebAssembly instruction that this function generates.
/// This will be fixed in the LLVM 13 release.
/// `*m = u64x2_extract_lane::<L>(v)` (which doesn't require `unsafe`).
///
/// # Unsafety
///
@@ -761,8 +664,7 @@ pub unsafe fn v128_store32_lane<const L: usize>(v: v128, m: *mut u32) {
#[doc(alias("v128.store64_lane"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub unsafe fn v128_store64_lane<const L: usize>(v: v128, m: *mut u64) {
static_assert!(L: usize where L < 2);
llvm_store64_lane(m, v.as_u64x2(), L)
ptr::write_unaligned(m, u64x2_extract_lane::<L>(v))
}
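The store-lane intrinsics are the mirror image: extract the lane, then perform an unaligned write. A usage sketch, same assumptions, illustrative helper name:

#[cfg(target_arch = "wasm32")]
fn store_lane_example() {
    use std::arch::wasm32::*;

    let v = u32x4(10, 20, 30, 40);
    let mut out: u32 = 0;

    unsafe {
        // v128.store32_lane: write lane 2 of `v` to a (possibly unaligned) address.
        v128_store32_lane::<2>(v, &mut out);
    }
    assert_eq!(out, 30);
}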
/// Materializes a SIMD value from the provided operands.
@@ -3053,7 +2955,19 @@ pub fn u16x8_avgr(a: v128, b: v128) -> v128 {
#[doc(alias("i16x8.extmul_low_i8x16_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i16x8_extmul_low_i8x16(a: v128, b: v128) -> v128 {
unsafe { llvm_i16x8_extmul_low_i8x16_s(a.as_i8x16(), b.as_i8x16()).v128() }
unsafe {
let lhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
a.as_i8x16(),
a.as_i8x16(),
[0, 1, 2, 3, 4, 5, 6, 7],
));
let rhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
b.as_i8x16(),
b.as_i8x16(),
[0, 1, 2, 3, 4, 5, 6, 7],
));
simd_mul(lhs, rhs).v128()
}
}
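Each removed `llvm.wasm.extmul.*` intrinsic is replaced by an open-coded shuffle that selects one half of the input, a widening `simd_cast`, and an ordinary `simd_mul`, as above. A scalar model of what that computes, portable Rust with an illustrative function name; the high variants do the same over lanes 8..16, and the unsigned variants zero-extend instead of sign-extending:

// Scalar model of i16x8.extmul_low_i8x16_s: the low eight i8 lanes are
// sign-extended to i16 before multiplying, so the product cannot wrap.
fn extmul_low_i8x16_s_ref(a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for i in 0..8 {
        // Widen first, then multiply: e.g. -128 * 127 = -16_256 is exact in i16.
        out[i] = i16::from(a[i]) * i16::from(b[i]);
    }
    out
}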
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3066,7 +2980,19 @@ pub fn i16x8_extmul_low_i8x16(a: v128, b: v128) -> v128 {
#[doc(alias("i16x8.extmul_high_i8x16_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i16x8_extmul_high_i8x16(a: v128, b: v128) -> v128 {
unsafe { llvm_i16x8_extmul_high_i8x16_s(a.as_i8x16(), b.as_i8x16()).v128() }
unsafe {
let lhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
a.as_i8x16(),
a.as_i8x16(),
[8, 9, 10, 11, 12, 13, 14, 15],
));
let rhs = simd_cast::<simd::i8x8, simd::i16x8>(simd_shuffle8!(
b.as_i8x16(),
b.as_i8x16(),
[8, 9, 10, 11, 12, 13, 14, 15],
));
simd_mul(lhs, rhs).v128()
}
}
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3079,7 +3005,19 @@ pub fn i16x8_extmul_high_i8x16(a: v128, b: v128) -> v128 {
#[doc(alias("i16x8.extmul_low_i8x16_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i16x8_extmul_low_u8x16(a: v128, b: v128) -> v128 {
unsafe { llvm_i16x8_extmul_low_i8x16_u(a.as_i8x16(), b.as_i8x16()).v128() }
unsafe {
let lhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
a.as_u8x16(),
a.as_u8x16(),
[0, 1, 2, 3, 4, 5, 6, 7],
));
let rhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
b.as_u8x16(),
b.as_u8x16(),
[0, 1, 2, 3, 4, 5, 6, 7],
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3095,7 +3033,19 @@ pub use i16x8_extmul_low_u8x16 as u16x8_extmul_low_u8x16;
#[doc(alias("i16x8.extmul_high_i8x16_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i16x8_extmul_high_u8x16(a: v128, b: v128) -> v128 {
unsafe { llvm_i16x8_extmul_high_i8x16_u(a.as_i8x16(), b.as_i8x16()).v128() }
unsafe {
let lhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
a.as_u8x16(),
a.as_u8x16(),
[8, 9, 10, 11, 12, 13, 14, 15],
));
let rhs = simd_cast::<simd::u8x8, simd::u16x8>(simd_shuffle8!(
b.as_u8x16(),
b.as_u8x16(),
[8, 9, 10, 11, 12, 13, 14, 15],
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3412,7 +3362,19 @@ pub fn i32x4_dot_i16x8(a: v128, b: v128) -> v128 {
#[doc(alias("i32x4.extmul_low_i16x8_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i32x4_extmul_low_i16x8(a: v128, b: v128) -> v128 {
unsafe { llvm_i32x4_extmul_low_i16x8_s(a.as_i16x8(), b.as_i16x8()).v128() }
unsafe {
let lhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
a.as_i16x8(),
a.as_i16x8(),
[0, 1, 2, 3]
));
let rhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
b.as_i16x8(),
b.as_i16x8(),
[0, 1, 2, 3]
));
simd_mul(lhs, rhs).v128()
}
}
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3425,7 +3387,19 @@ pub fn i32x4_extmul_low_i16x8(a: v128, b: v128) -> v128 {
#[doc(alias("i32x4.extmul_high_i16x8_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i32x4_extmul_high_i16x8(a: v128, b: v128) -> v128 {
unsafe { llvm_i32x4_extmul_high_i16x8_s(a.as_i16x8(), b.as_i16x8()).v128() }
unsafe {
let lhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
a.as_i16x8(),
a.as_i16x8(),
[4, 5, 6, 7]
));
let rhs = simd_cast::<simd::i16x4, simd::i32x4>(simd_shuffle4!(
b.as_i16x8(),
b.as_i16x8(),
[4, 5, 6, 7]
));
simd_mul(lhs, rhs).v128()
}
}
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3438,7 +3412,19 @@ pub fn i32x4_extmul_high_i16x8(a: v128, b: v128) -> v128 {
#[doc(alias("i32x4.extmul_low_i16x8_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i32x4_extmul_low_u16x8(a: v128, b: v128) -> v128 {
unsafe { llvm_i32x4_extmul_low_i16x8_u(a.as_i16x8(), b.as_i16x8()).v128() }
unsafe {
let lhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
a.as_u16x8(),
a.as_u16x8(),
[0, 1, 2, 3]
));
let rhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
b.as_u16x8(),
b.as_u16x8(),
[0, 1, 2, 3]
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3454,7 +3440,19 @@ pub use i32x4_extmul_low_u16x8 as u32x4_extmul_low_u16x8;
#[doc(alias("i32x4.extmul_high_i16x8_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i32x4_extmul_high_u16x8(a: v128, b: v128) -> v128 {
unsafe { llvm_i32x4_extmul_high_i16x8_u(a.as_i16x8(), b.as_i16x8()).v128() }
unsafe {
let lhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
a.as_u16x8(),
a.as_u16x8(),
[4, 5, 6, 7]
));
let rhs = simd_cast::<simd::u16x4, simd::u32x4>(simd_shuffle4!(
b.as_u16x8(),
b.as_u16x8(),
[4, 5, 6, 7]
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
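The i32x4 extended multiplies (and the i64x2 ones further below) follow the exact same shuffle/cast/multiply pattern. For reference, a concrete-value sketch of their observable behaviour through the stable API, assuming wasm32 with simd128 enabled; the helper name is illustrative:

#[cfg(target_arch = "wasm32")]
fn extmul_usage() {
    use std::arch::wasm32::*;

    // 1000 * 1000 = 1_000_000 overflows i16 but fits in i32, which is the
    // whole point of the extended multiply.
    let a = i16x8(1000, -1000, 0, 0, 1, 2, 3, 4);
    let b = i16x8(1000, 1000, 0, 0, 5, 6, 7, 8);

    let lo = i32x4_extmul_low_i16x8(a, b);
    assert_eq!(i32x4_extract_lane::<0>(lo), 1_000_000);
    assert_eq!(i32x4_extract_lane::<1>(lo), -1_000_000);

    // The high variant covers lanes 4..8 of the inputs.
    let hi = i32x4_extmul_high_i16x8(a, b);
    assert_eq!(i32x4_extract_lane::<0>(hi), 5);
    assert_eq!(i32x4_extract_lane::<3>(hi), 32);
}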
@@ -3666,7 +3664,19 @@ pub use i64x2_mul as u64x2_mul;
#[doc(alias("i64x2.extmul_low_i32x4_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i64x2_extmul_low_i32x4(a: v128, b: v128) -> v128 {
unsafe { llvm_i64x2_extmul_low_i32x4_s(a.as_i32x4(), b.as_i32x4()).v128() }
unsafe {
let lhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
a.as_i32x4(),
a.as_i32x4(),
[0, 1]
));
let rhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
b.as_i32x4(),
b.as_i32x4(),
[0, 1]
));
simd_mul(lhs, rhs).v128()
}
}
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3679,7 +3689,19 @@ pub fn i64x2_extmul_low_i32x4(a: v128, b: v128) -> v128 {
#[doc(alias("i64x2.extmul_high_i32x4_s"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i64x2_extmul_high_i32x4(a: v128, b: v128) -> v128 {
unsafe { llvm_i64x2_extmul_high_i32x4_s(a.as_i32x4(), b.as_i32x4()).v128() }
unsafe {
let lhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
a.as_i32x4(),
a.as_i32x4(),
[2, 3]
));
let rhs = simd_cast::<simd::i32x2, simd::i64x2>(simd_shuffle2!(
b.as_i32x4(),
b.as_i32x4(),
[2, 3]
));
simd_mul(lhs, rhs).v128()
}
}
/// Lane-wise integer extended multiplication producing twice wider result than
@@ -3692,7 +3714,19 @@ pub fn i64x2_extmul_high_i32x4(a: v128, b: v128) -> v128 {
#[doc(alias("i64x2.extmul_low_i32x4_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i64x2_extmul_low_u32x4(a: v128, b: v128) -> v128 {
unsafe { llvm_i64x2_extmul_low_i32x4_u(a.as_i32x4(), b.as_i32x4()).v128() }
unsafe {
let lhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
a.as_u32x4(),
a.as_u32x4(),
[0, 1]
));
let rhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
b.as_u32x4(),
b.as_u32x4(),
[0, 1]
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3708,7 +3742,19 @@ pub use i64x2_extmul_low_u32x4 as u64x2_extmul_low_u32x4;
#[doc(alias("i64x2.extmul_high_i32x4_u"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn i64x2_extmul_high_u32x4(a: v128, b: v128) -> v128 {
unsafe { llvm_i64x2_extmul_high_i32x4_u(a.as_i32x4(), b.as_i32x4()).v128() }
unsafe {
let lhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
a.as_u32x4(),
a.as_u32x4(),
[2, 3]
));
let rhs = simd_cast::<simd::u32x2, simd::u64x2>(simd_shuffle2!(
b.as_u32x4(),
b.as_u32x4(),
[2, 3]
));
simd_mul(lhs, rhs).v128()
}
}
#[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3862,7 +3908,14 @@ pub fn f32x4_max(a: v128, b: v128) -> v128 {
#[doc(alias("f32x4.pmin"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f32x4_pmin(a: v128, b: v128) -> v128 {
unsafe { llvm_f32x4_pmin(a.as_f32x4(), b.as_f32x4()).v128() }
unsafe {
simd_select::<simd::m32x4, simd::f32x4>(
simd_lt(b.as_f32x4(), a.as_f32x4()),
b.as_f32x4(),
a.as_f32x4(),
)
.v128()
}
}
/// Lane-wise maximum value, defined as `a < b ? b : a`
@@ -3872,7 +3925,14 @@ pub fn f32x4_pmin(a: v128, b: v128) -> v128 {
#[doc(alias("f32x4.pmax"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f32x4_pmax(a: v128, b: v128) -> v128 {
unsafe { llvm_f32x4_pmax(a.as_f32x4(), b.as_f32x4()).v128() }
unsafe {
simd_select::<simd::m32x4, simd::f32x4>(
simd_lt(a.as_f32x4(), b.as_f32x4()),
b.as_f32x4(),
a.as_f32x4(),
)
.v128()
}
}
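`pmin` and `pmax` are now lowered with `simd_lt` plus `simd_select`, which implements the documented `b < a ? b : a` / `a < b ? b : a` semantics exactly, including the asymmetric NaN handling; the f64x2 versions below behave the same way. A sketch of how that differs from `f32x4_min`, same wasm32/simd128 assumptions, illustrative helper name:

#[cfg(target_arch = "wasm32")]
fn pmin_vs_min() {
    use std::arch::wasm32::*;

    let a = f32x4(1.0, f32::NAN, 4.0, 5.0);
    let b = f32x4(2.0, 3.0, -4.0, f32::NAN);

    // pmin is literally `b < a ? b : a`: whenever the comparison is false
    // (including when either operand is NaN) `a` is returned unchanged.
    let p = f32x4_pmin(a, b);
    assert_eq!(f32x4_extract_lane::<0>(p), 1.0);  // 2.0 < 1.0 is false -> a
    assert!(f32x4_extract_lane::<1>(p).is_nan()); // a is NaN and is kept
    assert_eq!(f32x4_extract_lane::<2>(p), -4.0); // -4.0 < 4.0 is true -> b
    assert_eq!(f32x4_extract_lane::<3>(p), 5.0);  // NaN in b is dropped here,
                                                  // whereas f32x4_min would
                                                  // propagate it
}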
/// Lane-wise rounding to the nearest integral value not smaller than the input.
@@ -4023,7 +4083,14 @@ pub fn f64x2_max(a: v128, b: v128) -> v128 {
#[doc(alias("f64x2.pmin"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f64x2_pmin(a: v128, b: v128) -> v128 {
unsafe { llvm_f64x2_pmin(a.as_f64x2(), b.as_f64x2()).v128() }
unsafe {
simd_select::<simd::m64x2, simd::f64x2>(
simd_lt(b.as_f64x2(), a.as_f64x2()),
b.as_f64x2(),
a.as_f64x2(),
)
.v128()
}
}
/// Lane-wise maximum value, defined as `a < b ? b : a`
@@ -4033,7 +4100,14 @@ pub fn f64x2_pmin(a: v128, b: v128) -> v128 {
#[doc(alias("f64x2.pmax"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f64x2_pmax(a: v128, b: v128) -> v128 {
unsafe { llvm_f64x2_pmax(a.as_f64x2(), b.as_f64x2()).v128() }
unsafe {
simd_select::<simd::m64x2, simd::f64x2>(
simd_lt(a.as_f64x2(), b.as_f64x2()),
b.as_f64x2(),
a.as_f64x2(),
)
.v128()
}
}
/// Converts a 128-bit vector interpreted as four 32-bit floating point numbers
@@ -4171,7 +4245,14 @@ pub fn f64x2_convert_low_u32x4(a: v128) -> v128 {
#[doc(alias("f32x4.demote_f64x2_zero"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 {
unsafe { llvm_f32x4_demote_f64x2_zero(a.as_f64x2()).v128() }
unsafe {
simd_cast::<simd::f64x4, simd::f32x4>(simd_shuffle4!(
a.as_f64x2(),
simd::f64x2::splat(0.0),
[0, 1, 2, 3]
))
.v128()
}
}
/// Conversion of the two lower single-precision floating point lanes to the two
@@ -4182,7 +4263,10 @@ pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 {
#[doc(alias("f32x4.promote_low_f32x4"))]
#[stable(feature = "wasm_simd", since = "1.54.0")]
pub fn f64x2_promote_low_f32x4(a: v128) -> v128 {
unsafe { llvm_f64x2_promote_low_f32x4(a.as_f32x4()).v128() }
unsafe {
simd_cast::<simd::f32x2, simd::f64x2>(simd_shuffle2!(a.as_f32x4(), a.as_f32x4(), [0, 1]))
.v128()
}
}
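The demote/promote conversions are likewise now a shuffle plus `simd_cast`: demote pads the two f64 lanes with a zero vector and casts the resulting f64x4 down, promote selects the low two f32 lanes and casts them up. A concrete-value sketch of the lane behaviour, same wasm32/simd128 assumptions, illustrative helper name:

#[cfg(target_arch = "wasm32")]
fn demote_promote_example() {
    use std::arch::wasm32::*;

    // f32x4.demote_f64x2_zero: the two f64 lanes are converted to f32 and the
    // upper two result lanes are zeroed.
    let d = f64x2(1.5, 2.5);
    let demoted = f32x4_demote_f64x2_zero(d);
    assert_eq!(f32x4_extract_lane::<0>(demoted), 1.5);
    assert_eq!(f32x4_extract_lane::<2>(demoted), 0.0);

    // f64x2.promote_low_f32x4: only the low two f32 lanes are widened.
    let s = f32x4(3.0, 4.0, 5.0, 6.0);
    let promoted = f64x2_promote_low_f32x4(s);
    assert_eq!(f64x2_extract_lane::<1>(promoted), 4.0);
}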
#[cfg(test)]