#!/usr/bin/env sh
set -ex
: "${TARGET?The TARGET environment variable must be set.}"
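# (Note: the `: "${TARGET?...}"` form is the POSIX idiom for "abort with this
# message if TARGET is unset"; callers are expected to export TARGET first,
# e.g. `TARGET=x86_64-unknown-linux-gnu` (example value, illustrative only).)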
# Tests are all super fast anyway, and they fault often enough on Travis that
# the extra debuggability from running on a single thread is worth it.
#export RUST_BACKTRACE=full
#export RUST_TEST_NOCAPTURE=1
#export RUST_TEST_THREADS=1
export RUSTFLAGS="${RUSTFLAGS} -D warnings -Z merge-functions=disabled -Z verify-llvm-ir"
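# (Best-effort note on the flags above: `-D warnings` denies warnings,
# `-Z merge-functions=disabled` stops LLVM from merging identical functions so
# the instruction assertions keep matching, and `-Z verify-llvm-ir` enables
# LLVM IR verification.)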
export HOST_RUSTFLAGS="${RUSTFLAGS}"
export PROFILE="${PROFILE:="--profile=release"}"
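# (`${PROFILE:="--profile=release"}` keeps a caller-provided PROFILE and falls
# back to the release profile otherwise; e.g. `PROFILE=--profile=dev` would run
# everything unoptimized instead (an illustrative override, not one used below).)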
case ${TARGET} in
# On Windows the linker performs identical COMDAT folding (ICF) by default
# in release mode, which removes identical COMDAT sections. This interferes
# with our instruction assertions just like LLVM's MergeFunctions pass, so
# we disable it.
*-pc-windows-msvc)
export RUSTFLAGS="${RUSTFLAGS} -Clink-args=/OPT:NOICF"
;;
# On 32-bit targets use a static relocation model, which avoids some extra
# instructions when dealing with static data, notably allowing some
# instruction assertion checks to pass below the 20-instruction limit. With
# the default dynamic relocation model, too many instructions are generated
# when we assert the instruction for a function, and that causes tests to fail.
i686-* | i586-*)
export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static"
;;
# Some x86_64 targets enable more features beyond SSE2 by default,
# which causes some instruction assertion checks to fail.
x86_64-*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=-sse3"
;;
# Unoptimized builds use fast-isel, which breaks with MSA.
mips-* | mipsel-*)
export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
;;
armv7-*eabihf | thumbv7-*eabihf)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
;;
# Some of our test dependencies use the deprecated `gcc` crate, which
# doesn't detect RISC-V compilers automatically, so do it manually here.
riscv*)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zks,+zbb,+zbc"
;;
esac
echo "RUSTFLAGS=${RUSTFLAGS}"
echo "OBJDUMP=${OBJDUMP}"
echo "STDARCH_DISABLE_ASSERT_INSTR=${STDARCH_DISABLE_ASSERT_INSTR}"
echo "STDARCH_TEST_EVERYTHING=${STDARCH_TEST_EVERYTHING}"
echo "STDARCH_TEST_SKIP_FEATURE=${STDARCH_TEST_SKIP_FEATURE}"
echo "STDARCH_TEST_SKIP_FUNCTION=${STDARCH_TEST_SKIP_FUNCTION}"
echo "PROFILE=${PROFILE}"
cargo_test() {
cmd="cargo"
subcmd="test"
if [ "$NORUN" = "1" ]; then
export subcmd="build"
fi
cmd="$cmd ${subcmd} --target=$TARGET $1"
cmd="$cmd -- $2"
case ${TARGET} in
# wasm targets can't catch panics, so if a test fails make sure the test
# harness isn't trying to capture output, otherwise we won't get any useful
# output.
wasm32*)
cmd="$cmd --nocapture"
;;
esac
$cmd
}
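# Illustrative expansion of cargo_test (not executed here): with
# TARGET=x86_64-unknown-linux-gnu and NORUN unset, a call like
#   cargo_test "--manifest-path=crates/core_arch/Cargo.toml --profile=release"
# ends up running roughly
#   cargo test --target=x86_64-unknown-linux-gnu \
#     --manifest-path=crates/core_arch/Cargo.toml --profile=release --
# with `test` replaced by `build` when NORUN=1 and `--nocapture` appended on wasm32.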
CORE_ARCH="--manifest-path=crates/core_arch/Cargo.toml"
STDARCH_EXAMPLES="--manifest-path=examples/Cargo.toml"
INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
cargo_test "${CORE_ARCH} ${PROFILE}"
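# Skip the std-dependent examples when NOSTD=1 (presumably set by the caller
# for targets without a hosted std; an assumption inferred from the name).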
if [ "$NOSTD" != "1" ]; then
cargo_test "${STDARCH_EXAMPLES} ${PROFILE}"
fi
# Test targets compiled with extra features.
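# (The STDARCH_DISABLE_ASSERT_INSTR exports below read as intentional: enabling
# extra target features changes the exact instructions the compiler emits, so
# the per-intrinsic instruction assertions would no longer match.)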
case ${TARGET} in
x86_64-unknown-linux-gnu)
export STDARCH_DISABLE_ASSERT_INSTR=1
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
cargo_test "${PROFILE}"
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx512f"
cargo_test "${PROFILE}"
;;
x86_64* | i686*)
export STDARCH_DISABLE_ASSERT_INSTR=1
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
cargo_test "${PROFILE}"
;;
# FIXME: these targets don't build anymore
#mips-*gnu* | mipsel-*gnu*)
# export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa,+fp64,+mips32r5"
# cargo_test "${PROFILE}"
# ;;
mips64*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+msa"
cargo_test "${PROFILE}"
;;
s390x*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+vector-enhancements-1"
cargo_test "${PROFILE}"
;;
powerpc64*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec"
cargo_test "${PROFILE}"
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+vsx"
cargo_test "${PROFILE}"
;;
powerpc*)
# qemu has a bug in PPC32 which leads to a crash when code is compiled with
# `vsx`, so only `altivec` is enabled here.
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec"
cargo_test "${PROFILE}"
;;
# Setup aarch64 & armv7 specific variables, the runner, along with some
|
2025-01-31 11:17:21 +00:00
|
|
|
# tests to skip
aarch64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}"
;;
aarch64_be-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}"
;;
armv7-unknown-linux-gnueabihf*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}"
;;
*)
;;
|
2019-01-21 16:59:10 +01:00
|
|
|
|
Workarounds for all/any mask reductions on x86, armv7, and aarch64 (#425)
* Work arounds for LLVM6 code-gen bugs in all/any reductions
This commit adds workarounds for the mask reductions: `all` and `any`.
64-bit wide mask types (`m8x8`, `m16x4`, `m32x2`)
`x86_64` with `MMX` enabled
```asm
all_8x8:
push rbp
mov rbp, rsp
movzx eax, byte, ptr, [rdi, +, 7]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 6]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 5]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 4]
movd xmm2, eax
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx eax, byte, ptr, [rdi, +, 3]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 2]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 1]
movd xmm0, eax
movzx eax, byte, ptr, [rdi]
movd xmm3, eax
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
pand xmm0, xmm3
pshufd xmm1, xmm0, 229
pand xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
pand xmm0, xmm1
movd eax, xmm0
and al, 1
pop rbp
ret
any_8x8:
push rbp
mov rbp, rsp
movzx eax, byte, ptr, [rdi, +, 7]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 6]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 5]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 4]
movd xmm2, eax
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx eax, byte, ptr, [rdi, +, 3]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 2]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 1]
movd xmm0, eax
movzx eax, byte, ptr, [rdi]
movd xmm3, eax
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
por xmm0, xmm3
pshufd xmm1, xmm0, 229
por xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
por xmm0, xmm1
movd eax, xmm0
and al, 1
pop rbp
ret
```
After this PR for `m8x8`, `m16x4`, `m32x2`:
```asm
all_8x8:
push rbp
mov rbp, rsp
movq mm0, qword, ptr, [rdi]
pmovmskb eax, mm0
cmp eax, 255
sete al
pop rbp
ret
any_8x8:
push rbp
mov rbp, rsp
movq mm0, qword, ptr, [rdi]
pmovmskb eax, mm0
test eax, eax
setne al
pop rbp
ret
```
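For reference, the check the new lowering performs once `pmovmskb` has packed the eight lane sign bits into a general-purpose register can be sketched in plain Rust (illustrative names only, not this crate's implementation):
```rust
// `pmovmskb` collects the sign bit of each of the 8 mask bytes into the low
// 8 bits of a scalar, so the reductions become simple integer compares.
fn all_8x8(movemask: u32) -> bool {
    movemask == 0xFF // every lane set: matches `cmp eax, 255; sete al`
}

fn any_8x8(movemask: u32) -> bool {
    movemask != 0 // at least one lane set: matches `test eax, eax; setne al`
}
```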
`x86` with `MMX` enabled
Before this PR:
```asm
all_8x8:
call L9$pb
L9$pb:
pop eax
mov ecx, dword, ptr, [esp, +, 4]
movzx edx, byte, ptr, [ecx, +, 7]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 6]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 5]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 4]
movd xmm2, edx
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx edx, byte, ptr, [ecx, +, 3]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 2]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 1]
movd xmm0, edx
movzx ecx, byte, ptr, [ecx]
movd xmm3, ecx
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
pand xmm0, xmm3
pshufd xmm1, xmm0, 229
pand xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
pand xmm0, xmm1
movd eax, xmm0
and al, 1
ret
any_8x8:
call L8$pb
L8$pb:
pop eax
mov ecx, dword, ptr, [esp, +, 4]
movzx edx, byte, ptr, [ecx, +, 7]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 6]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 5]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 4]
movd xmm2, edx
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx edx, byte, ptr, [ecx, +, 3]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 2]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 1]
movd xmm0, edx
movzx ecx, byte, ptr, [ecx]
movd xmm3, ecx
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
por xmm0, xmm3
pshufd xmm1, xmm0, 229
por xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
por xmm0, xmm1
movd eax, xmm0
and al, 1
ret
```
After this PR:
```asm
all_8x8:
mov eax, dword, ptr, [esp, +, 4]
movq mm0, qword, ptr, [eax]
pmovmskb eax, mm0
cmp eax, 255
sete al
ret
any_8x8:
mov eax, dword, ptr, [esp, +, 4]
movq mm0, qword, ptr, [eax]
pmovmskb eax, mm0
test eax, eax
setne al
ret
```
`aarch64`
Before this PR:
```asm
all_8x8:
ldr d0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
tst w8, #0xff
umov w10, v0.b[2]
cset w8, ne
tst w9, #0xff
cset w9, ne
tst w10, #0xff
umov w10, v0.b[3]
and w8, w8, w9
cset w9, ne
tst w10, #0xff
umov w10, v0.b[4]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[5]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[6]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[7]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
and w8, w9, w8
cset w9, ne
and w0, w9, w8
ret
any_8x8:
ldr d0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
orr w8, w8, w9
umov w9, v0.b[2]
orr w8, w8, w9
umov w9, v0.b[3]
orr w8, w8, w9
umov w9, v0.b[4]
orr w8, w8, w9
umov w9, v0.b[5]
orr w8, w8, w9
umov w9, v0.b[6]
orr w8, w8, w9
umov w9, v0.b[7]
orr w8, w8, w9
tst w8, #0xff
cset w0, ne
ret
```
After this PR:
```asm
all_8x8:
ldr d0, [x0]
mov v0.d[1], v0.d[0]
uminv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
any_8x8:
ldr d0, [x0]
mov v0.d[1], v0.d[0]
umaxv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
```
`ARMv7` + `neon`
Before this PR:
```asm
all_8x8:
vmov.i8 d0, #0x1
vldr d1, [r0]
vtst.8 d0, d1, d0
vext.8 d1, d0, d0, #4
vand d0, d0, d1
vext.8 d1, d0, d0, #2
vand d0, d0, d1
vdup.8 d1, d0[1]
vand d0, d0, d1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
any_8x8:
vmov.i8 d0, #0x1
vldr d1, [r0]
vtst.8 d0, d1, d0
vext.8 d1, d0, d0, #4
vorr d0, d0, d1
vext.8 d1, d0, d0, #2
vorr d0, d0, d1
vdup.8 d1, d0[1]
vorr d0, d0, d1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
```
After this PR:
```asm
all_8x8:
vldr d0, [r0]
b <m8x8 as All>::all
<m8x8 as All>::all:
vpmin.u8 d16, d0, d16
vpmin.u8 d16, d16, d16
vpmin.u8 d0, d16, d16
b m8x8::extract
any_8x8:
vldr d0, [r0]
b <m8x8 as Any>::any
<m8x8 as Any>::any:
vpmax.u8 d16, d0, d16
vpmax.u8 d16, d16, d16
vpmax.u8 d0, d16, d16
b m8x8::extract
```
(note: inlining does not work properly on ARMv7)
128-bit wide mask types (`m8x16`, `m16x8`, `m32x4`, `m64x2`)
`x86_64` with SSE2 enabled
Before this PR:
```asm
all_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
pcmpeqb xmm1, xmm0
pmovmskb eax, xmm1
xor ecx, ecx
cmp eax, 65535
mov eax, -1
cmovne eax, ecx
and al, 1
pop rbp
ret
any_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
pcmpeqb xmm1, xmm0
pmovmskb eax, xmm1
neg eax
sbb eax, eax
and al, 1
pop rbp
ret
```
After this PR:
```asm
all_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
cmp eax, 65535
sete al
pop rbp
ret
any_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
test eax, eax
setne al
pop rbp
ret
```
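The same reduction can be written directly against the stable SSE2 intrinsics; this is only a sketch of the pattern (it assumes every mask lane is all-zeros or all-ones, which the portable mask types guarantee), not the crate's actual implementation:
```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m128i, _mm_movemask_epi8};

// SAFETY of the unsafe blocks below: SSE2 is part of the x86_64 baseline.
#[cfg(target_arch = "x86_64")]
fn all_8x16(m: __m128i) -> bool {
    // pmovmskb packs the 16 lane sign bits into the low 16 bits of the result.
    unsafe { _mm_movemask_epi8(m) == 0xFFFF }
}

#[cfg(target_arch = "x86_64")]
fn any_8x16(m: __m128i) -> bool {
    unsafe { _mm_movemask_epi8(m) != 0 }
}
```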
`aarch64`
Before this PR:
```asm
all_8x16:
ldr q0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
tst w8, #0xff
umov w10, v0.b[2]
cset w8, ne
tst w9, #0xff
cset w9, ne
tst w10, #0xff
umov w10, v0.b[3]
and w8, w8, w9
cset w9, ne
tst w10, #0xff
umov w10, v0.b[4]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[5]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[6]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[7]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[8]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[9]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[10]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[11]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[12]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[13]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[14]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[15]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
and w8, w9, w8
cset w9, ne
and w0, w9, w8
ret
any_8x16:
ldr q0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
orr w8, w8, w9
umov w9, v0.b[2]
orr w8, w8, w9
umov w9, v0.b[3]
orr w8, w8, w9
umov w9, v0.b[4]
orr w8, w8, w9
umov w9, v0.b[5]
orr w8, w8, w9
umov w9, v0.b[6]
orr w8, w8, w9
umov w9, v0.b[7]
orr w8, w8, w9
umov w9, v0.b[8]
orr w8, w8, w9
umov w9, v0.b[9]
orr w8, w8, w9
umov w9, v0.b[10]
orr w8, w8, w9
umov w9, v0.b[11]
orr w8, w8, w9
umov w9, v0.b[12]
orr w8, w8, w9
umov w9, v0.b[13]
orr w8, w8, w9
umov w9, v0.b[14]
orr w8, w8, w9
umov w9, v0.b[15]
orr w8, w8, w9
tst w8, #0xff
cset w0, ne
ret
```
After this PR:
```asm
all_8x16:
ldr q0, [x0]
uminv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
any_8x16:
ldr q0, [x0]
umaxv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
```
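On `aarch64` the equivalent sketch uses the horizontal min/max NEON intrinsics that produce the `uminv`/`umaxv` sequences above (again assuming all-zeros/all-ones lanes; the `std::arch::aarch64` names are used here for illustration):
```rust
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::{uint8x16_t, vmaxvq_u8, vminvq_u8};

// SAFETY of the unsafe blocks below: NEON is mandatory on AArch64 targets.
#[cfg(target_arch = "aarch64")]
fn all_8x16(m: uint8x16_t) -> bool {
    // uminv: the horizontal minimum is non-zero only if every lane is set.
    unsafe { vminvq_u8(m) != 0 }
}

#[cfg(target_arch = "aarch64")]
fn any_8x16(m: uint8x16_t) -> bool {
    // umaxv: the horizontal maximum is non-zero if at least one lane is set.
    unsafe { vmaxvq_u8(m) != 0 }
}
```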
`ARMv7` + `neon`
Before this PR:
```asm
all_8x16:
vmov.i8 q0, #0x1
vld1.64 {d2, d3}, [r0]
vtst.8 q0, q1, q0
vext.8 q1, q0, q0, #8
vand q0, q0, q1
vext.8 q1, q0, q0, #4
vand q0, q0, q1
vext.8 q1, q0, q0, #2
vand q0, q0, q1
vdup.8 q1, d0[1]
vand q0, q0, q1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
any_8x16:
vmov.i8 q0, #0x1
vld1.64 {d2, d3}, [r0]
vtst.8 q0, q1, q0
vext.8 q1, q0, q0, #8
vorr q0, q0, q1
vext.8 q1, q0, q0, #4
vorr q0, q0, q1
vext.8 q1, q0, q0, #2
vorr q0, q0, q1
vdup.8 q1, d0[1]
vorr q0, q0, q1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
```
After this PR:
```asm
all_8x16:
vld1.64 {d0, d1}, [r0]
b <m8x16 as All>::all
<m8x16 as All>::all:
vpmin.u8 d0, d0, d1
b <m8x8 as All>::all
any_8x16:
vld1.64 {d0, d1}, [r0]
b <m8x16 as Any>::any
<m8x16 as Any>::any:
vpmax.u8 d0, d0, d1
b <m8x8 as Any>::any
```
The inlining problems are pretty bad on ARMv7 + NEON.
256-bit wide mask types (`m8x32`, `m16x16`, `m32x8`, `m64x4`)
With SSE2 enabled
Before this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI17_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
movdqa xmm2, xmmword, ptr, [rdi, +, 16]
pand xmm2, xmm0
pcmpeqb xmm2, xmm0
pcmpeqb xmm1, xmm0
pand xmm1, xmm2
pmovmskb eax, xmm1
xor ecx, ecx
cmp eax, 65535
mov eax, -1
cmovne eax, ecx
and al, 1
pop rbp
ret
any_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
por xmm0, xmmword, ptr, [rdi, +, 16]
movdqa xmm1, xmmword, ptr, [rip, +, LCPI16_0]
pand xmm0, xmm1
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
neg eax
sbb eax, eax
and al, 1
pop rbp
ret
```
After this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
cmp eax, 65535
jne LBB17_1
movdqa xmm0, xmmword, ptr, [rdi, +, 16]
pmovmskb ecx, xmm0
mov al, 1
cmp ecx, 65535
je LBB17_3
LBB17_1:
xor eax, eax
LBB17_3:
pop rbp
ret
any_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb ecx, xmm0
mov al, 1
test ecx, ecx
je LBB16_1
pop rbp
ret
LBB16_1:
movdqa xmm0, xmmword, ptr, [rdi, +, 16]
pmovmskb eax, xmm0
test eax, eax
setne al
pop rbp
ret
```
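Without AVX the 32-byte mask is handled as two 16-byte halves; the branchy lowering above corresponds roughly to this short-circuiting sketch (illustrative, not the crate's code):
```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m128i, _mm_movemask_epi8};

#[cfg(target_arch = "x86_64")]
fn all_8x32(lo: __m128i, hi: __m128i) -> bool {
    // Check the low half first; only look at the high half if it still matters.
    unsafe { _mm_movemask_epi8(lo) == 0xFFFF && _mm_movemask_epi8(hi) == 0xFFFF }
}

#[cfg(target_arch = "x86_64")]
fn any_8x32(lo: __m128i, hi: __m128i) -> bool {
    unsafe { _mm_movemask_epi8(lo) != 0 || _mm_movemask_epi8(hi) != 0 }
}
```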
With AVX enabled
Before this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
vmovaps ymm0, ymmword, ptr, [rdi]
vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
vextractf128 xmm1, ymm0, 1
vpxor xmm2, xmm2, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqd xmm3, xmm3, xmm3
vpxor xmm1, xmm1, xmm3
vpcmpeqb xmm0, xmm0, xmm2
vpxor xmm0, xmm0, xmm3
vinsertf128 ymm0, ymm0, xmm1, 1
vandps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 78
vandps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 229
vandps ymm0, ymm0, ymm1
vpsrld xmm1, xmm0, 16
vandps ymm0, ymm0, ymm1
vpsrlw xmm1, xmm0, 8
vandps ymm0, ymm0, ymm1
vpextrb eax, xmm0, 0
and al, 1
pop rbp
vzeroupper
ret
any_8x32:
push rbp
mov rbp, rsp
vmovaps ymm0, ymmword, ptr, [rdi]
vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
vextractf128 xmm1, ymm0, 1
vpxor xmm2, xmm2, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqd xmm3, xmm3, xmm3
vpxor xmm1, xmm1, xmm3
vpcmpeqb xmm0, xmm0, xmm2
vpxor xmm0, xmm0, xmm3
vinsertf128 ymm0, ymm0, xmm1, 1
vorps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 78
vorps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 229
vorps ymm0, ymm0, ymm1
vpsrld xmm1, xmm0, 16
vorps ymm0, ymm0, ymm1
vpsrlw xmm1, xmm0, 8
vorps ymm0, ymm0, ymm1
vpextrb eax, xmm0, 0
and al, 1
pop rbp
vzeroupper
ret
```
After this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
vmovdqa ymm0, ymmword, ptr, [rdi]
vxorps xmm1, xmm1, xmm1
vcmptrueps ymm1, ymm1, ymm1
vptest ymm0, ymm1
setb al
pop rbp
vzeroupper
ret
any_8x32:
push rbp
mov rbp, rsp
vmovdqa ymm0, ymmword, ptr, [rdi]
vptest ymm0, ymm0
setne al
pop rbp
vzeroupper
ret
```
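With AVX the whole 256-bit mask can be tested at once via `vptest`; a sketch with the corresponding intrinsics (the caller must ensure AVX is available; illustrative only):
```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{
    __m256i, _mm256_set1_epi32, _mm256_testc_si256, _mm256_testz_si256,
};

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn all_8x32(m: __m256i) -> bool {
    // vptest sets CF when (!m & ones) == 0, i.e. every bit of m is set.
    _mm256_testc_si256(m, _mm256_set1_epi32(-1)) != 0
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn any_8x32(m: __m256i) -> bool {
    // vptest sets ZF when (m & m) == 0, so any set lane clears ZF.
    _mm256_testz_si256(m, m) == 0
}
```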
---
Closes #362 .
* test avx on all x86 targets
* disable assert_instr on avx test
* enable all appropriate features
* disable assert_instr on x86+avx
* the fn_must_use is stable
* fix nbody example on armv7
* fixup
* fixup
* enable 64-bit wide mask MMX optimizations on x86_64 only
* remove coresimd dependency on cfg_if
* allow wasm to fail
* use an env variable to disable assert_instr tests
* disable m32x2 mask MMX optimization on macos
* move cfg_if to coresimd/macros.rs
2018-05-04 23:03:45 +02:00
|
|
|
esac
|
2019-01-21 16:59:10 +01:00
|
|
|
|
2025-01-31 11:17:21 +00:00
|
|
|
# Arm specific
|
|
|
|
|
case "${TARGET}" in
|
|
|
|
|
aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*)
|
|
|
|
|
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
|
|
|
|
|
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
|
|
|
|
|
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
|
|
|
|
|
--runner "${TEST_RUNNER}" \
|
|
|
|
|
--cppcompiler "${TEST_CXX_COMPILER}" \
|
|
|
|
|
--skip "${TEST_SKIP_INTRINSICS}" \
|
|
|
|
|
--target "${TARGET}"
|
|
|
|
|
;;
|
|
|
|
|
|
2025-02-17 15:23:08 +00:00
|
|
|
aarch64_be-unknown-linux-gnu*)
|
|
|
|
|
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
|
|
|
|
|
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
|
|
|
|
|
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
|
|
|
|
|
--runner "${TEST_RUNNER}" \
|
|
|
|
|
--cppcompiler "${TEST_CXX_COMPILER}" \
|
|
|
|
|
--skip "${TEST_SKIP_INTRINSICS}" \
|
2025-01-31 11:17:21 +00:00
|
|
|
--target "${TARGET}" \
|
2025-04-22 08:17:25 +01:00
|
|
|
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
|
2025-01-31 11:17:21 +00:00
|
|
|
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
|
|
|
|
|
;;
|
2025-02-17 15:23:08 +00:00
|
|
|
*)
|
2025-01-31 11:17:21 +00:00
|
|
|
;;
|
|
|
|
|
esac
|
2021-09-09 19:16:45 +01:00
|
|
|
|
Update and revamp wasm32 SIMD intrinsics (#874)
Lots of time and lots of things have happened since the simd128 support
was first added to this crate. Things are starting to settle down now so
this commit syncs the Rust intrinsic definitions with the current
specification (https://github.com/WebAssembly/simd). Unfortunately not
everything can be enabled just yet but everything is in the pipeline for
getting enabled soon.
This commit also applies a major revamp to how intrinsics are tested.
The intention is that the setup should be much more lightweight and/or
easy to work with after this commit.
At a high level, the changes here are:
* Testing with node.js and `#[wasm_bindgen]` has been removed. Instead
intrinsics are tested with Wasmtime which has a nearly complete
implementation of the SIMD spec (and soon fully complete!)
* Testing is switched to `wasm32-wasi` to make idiomatic Rust bits a bit
easier to work with (e.g. `panic!`)
* Testing of this crate's simd128 feature for wasm is re-enabled. This
will run on CI and both compile and execute intrinsics. This should
bring wasm intrinsics to the same level of parity as x86 intrinsics,
for example.
* New wasm intrinsics have been added (see the usage sketch after this list):
* `iNNxMM_loadAxA_{s,u}`
* `vNNxMM_load_splat`
* `v8x16_swizzle`
* `v128_andnot`
* `iNNxMM_abs`
* `iNNxMM_narrow_*_{u,s}`
* `iNNxMM_bitmask` - commented out until LLVM is updated to LLVM 11
* `iNNxMM_widen_*_{u,s}` - commented out until
bytecodealliance/wasmtime#1994 lands
* `iNNxMM_{max,min}_{u,s}`
* `iNNxMM_avgr_u`
* Some wasm intrinsics have been removed:
* `i64x2_trunc_*`
* `f64x2_convert_*`
* `i8x16_mul`
* The `v8x16.shuffle` instruction is exposed. This is done through a
`macro` (not `macro_rules!`, but `macro`). This is intended to be
somewhat experimental and unstable until we decide otherwise. This
instruction has 16 immediate-mode expressions and is as a result
unsuited to the existing `constify_*` logic of this crate. I'm hoping
that we can game out over time what a macro might look like and/or
look for better solutions. For now, though, what's implemented is the
first of its kind in this crate (an architecture-specific macro), so
some extra scrutiny looking at it would be appreciated.
* Lots of `assert_instr` annotations have been fixed for wasm.
* All wasm simd128 tests are uncommented and passing now.
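A minimal usage sketch for a couple of the intrinsics listed above, written with the spellings they eventually got in `std::arch::wasm32` (`v128_andnot`, `u8x16_max`), which may differ from the names used at the time of this commit; build with `-C target-feature=+simd128`:
```rust
#[cfg(target_arch = "wasm32")]
use std::arch::wasm32::{u8x16_max, v128, v128_andnot};

// Illustrative helper name: clears in `a` every bit that is set in `b`, then
// takes the lane-wise unsigned maximum of the result and `b`.
#[cfg(target_arch = "wasm32")]
fn andnot_then_max(a: v128, b: v128) -> v128 {
    u8x16_max(v128_andnot(a, b), b)
}
```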
This is still missing tests for new intrinsics and it's also missing
tests for various corner cases. I hope to get to those later as the
upstream spec itself gets closer to stabilization.
In the meantime, however, I went ahead and updated the `hex.rs` example
with a wasm implementation using intrinsics. With it I got some very
impressive speedups using Wasmtime:
test benches::large_default ... bench: 213,961 ns/iter (+/- 5,108) = 4900 MB/s
test benches::large_fallback ... bench: 3,108,434 ns/iter (+/- 75,730) = 337 MB/s
test benches::small_default ... bench: 52 ns/iter (+/- 0) = 2250 MB/s
test benches::small_fallback ... bench: 358 ns/iter (+/- 0) = 326 MB/s
In other words, with Wasmtime, hex encoding using SIMD is 15x faster on 1MB
chunks and 7x faster on small (<128 byte) chunks.
All of these intrinsics are still unstable and will continue to be so
presumably until the simd proposal in wasm itself progresses to a later
stage. Additionally we'll still want to sync with clang on intrinsic
names (or decide not to) at some point in the future.
* wasm: Unconditionally expose SIMD functions
This commit unconditionally exposes SIMD functions from the `wasm32`
module. This is done in such a way that the standard library does not
need to be recompiled to access SIMD intrinsics and use them. This,
hopefully, is the long-term story for SIMD in WebAssembly in Rust.
It's unlikely that all WebAssembly runtimes will end up implementing
SIMD so the standard library is unlikely to use SIMD any time soon, but
we want to make sure it's easily available to folks! This commit enables
all this by ensuring that SIMD is available to the standard library,
regardless of compilation flags.
This'll come with the same caveats as x86 support, where it doesn't make
sense to call these functions unless you're enabling simd support one
way or another locally. Additionally, as with x86, if you don't call
these functions then the instructions won't show up in your binary.
While I was here I went ahead and expanded the WebAssembly-specific
documentation for the wasm32 module as well, ensuring that the current
state of SIMD/Atomics are documented.
2020-07-18 07:32:52 -05:00
|
|
|
if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then
|
2019-02-13 17:43:11 +01:00
|
|
|
# Test examples
|
|
|
|
|
(
|
|
|
|
|
cd examples
|
2024-11-05 11:29:25 +00:00
|
|
|
cargo test --target "$TARGET" "${PROFILE}"
|
|
|
|
|
echo test | cargo run --target "$TARGET" "${PROFILE}" hex
|
2019-02-13 17:43:11 +01:00
|
|
|
)
|
|
|
|
|
fi
|