//! Implementation of the `#[assert_instr]` macro
//!
//! This macro is used when testing the `stdarch` crate to generate test cases
//! asserting that functions do indeed contain the instructions we expect them
//! to contain.
//!
//! The procedural macro here is relatively simple: it appends a `#[test]`
//! function to the original token stream which asserts that the function
//! itself contains the relevant instruction.
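//!
//! # Example
//!
//! A hypothetical intrinsic wrapper could be annotated like this (the
//! function, type, and instruction names below are made up for illustration):
//!
//! ```ignore
//! #[assert_instr(addps)]
//! unsafe fn demo_add(a: __m128, b: __m128) -> __m128 {
//!     _mm_add_ps(a, b)
//! }
//! ```
//!
//! In an optimized test build this appends a non-inlined shim around
//! `demo_add` plus a `#[test]` that disassembles the shim and fails unless an
//! `addps` instruction shows up in it.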

extern crate proc_macro;
extern crate proc_macro2;
#[macro_use]
extern crate quote;
extern crate syn;

use proc_macro2::TokenStream;
use quote::ToTokens;

#[proc_macro_attribute]
pub fn assert_instr(
    attr: proc_macro::TokenStream,
    item: proc_macro::TokenStream,
) -> proc_macro::TokenStream {
    let invoc = match syn::parse::<Invoc>(attr) {
        Ok(s) => s,
        Err(e) => return e.to_compile_error().into(),
    };
    let item = match syn::parse::<syn::Item>(item) {
        Ok(s) => s,
        Err(e) => return e.to_compile_error().into(),
    };
    let func = match item {
        syn::Item::Fn(ref f) => f,
        _ => panic!("must be attached to a function"),
    };

    let instr = &invoc.instr;
    let name = &func.ident;

    // Disable assert_instr for x86 targets compiled with avx enabled, which
    // causes LLVM to generate different intrinsics than the ones we are
    // testing for.
    let disable_assert_instr = std::env::var("STDARCH_DISABLE_ASSERT_INSTR").is_ok();
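    // (Any value works: e.g. running the test suite with
    // `STDARCH_DISABLE_ASSERT_INSTR=1` skips all instruction assertions.)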

    // If instruction tests are disabled, avoid emitting this shim at all; just
    // return the original item without our attribute.
    if !cfg!(optimized) || disable_assert_instr {
        return (quote! { #item }).into();
    }

    let instr_str = instr
        .replace('.', "_")
        .replace('/', "_")
        .replace(':', "_")
        .replace(char::is_whitespace, "");
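    // E.g. an expected instruction spelled `vld1.64` is sanitized to
    // `vld1_64` so it can be embedded in the identifiers generated below
    // (the concrete mnemonic here is only an illustration).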
    let assert_name = syn::Ident::new(&format!("assert_{}_{}", name, instr_str), name.span());
    // The name has to be unique enough for us to find it in the disassembly later on:
    let shim_name = syn::Ident::new(
        &format!("stdarch_test_shim_{}_{}", name, instr_str),
        name.span(),
    );
    let mut inputs = Vec::new();
    let mut input_vals = Vec::new();
    let ret = &func.decl.output;
    for arg in func.decl.inputs.iter() {
        let capture = match *arg {
            syn::FnArg::Captured(ref c) => c,
            ref v => panic!(
                "arguments must not have patterns: `{:?}`",
                v.clone().into_token_stream()
            ),
        };
        let ident = match capture.pat {
            syn::Pat::Ident(ref i) => &i.ident,
            _ => panic!("must have bare arguments"),
        };
        if let Some(&(_, ref tts)) = invoc.args.iter().find(|a| *ident == a.0) {
            input_vals.push(quote! { #tts });
        } else {
            inputs.push(capture);
            input_vals.push(quote! { #ident });
        }
    }
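
    // At this point `inputs` holds the parameters the shim still accepts and
    // `input_vals` the expressions forwarded to the real function. E.g. for a
    // hypothetical `#[assert_instr(foo, imm = 2)]` on `fn f(x: u32, imm: u32)`,
    // the shim would take only `x` and call `f(x, 2)`.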

    // Keep the original function's `target`-prefixed attributes (e.g.
    // `#[target_feature(enable = "...")]`) so the shim is compiled with the
    // same target features.
    let attrs = func
        .attrs
        .iter()
        .filter(|attr| {
            attr.path
                .segments
                .first()
                .expect("attr.path.segments.first() failed")
                .value()
                .ident
                .to_string()
                .starts_with("target")
        })
        .collect::<Vec<_>>();
    let attrs = Append(&attrs);

    // Use an ABI on Windows that passes SIMD values in registers, like what
    // happens on Unix (I think?) by default.
    let abi = if cfg!(windows) {
        syn::LitStr::new("vectorcall", proc_macro2::Span::call_site())
    } else {
        syn::LitStr::new("C", proc_macro2::Span::call_site())
    };
    let shim_name_str = format!("{}{}", shim_name, assert_name);
    let to_test = quote! {
        #attrs
        #[no_mangle]
        #[inline(never)]
        pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
            // The compiler in optimized mode by default runs a pass called
            // "mergefunc" where it'll merge functions that look identical.
            // Turns out some intrinsics produce identical code and they're
            // folded together, meaning that one just jumps to another. This
            // messes up our inspection of the disassembly of this function and
            // we're not a huge fan of that.
            //
            // To thwart this pass and prevent functions from being merged we
            // generate some code that's hopefully very tight in terms of
            // codegen but is otherwise unique to prevent code from being
            // folded.
            ::stdarch_test::_DONT_DEDUP.store(
                std::mem::transmute(#shim_name_str.as_bytes().as_ptr()),
                std::sync::atomic::Ordering::Relaxed,
            );
            #name(#(#input_vals),*)
        }
    };
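
    // For a hypothetical function `foo` expected to contain `addps`, the
    // quote above expands to roughly:
    //
    //     #[no_mangle]
    //     #[inline(never)]
    //     pub unsafe extern "C" fn stdarch_test_shim_foo_addps(/* args */) {
    //         ::stdarch_test::_DONT_DEDUP.store(/* unique pointer */, Relaxed);
    //         foo(/* args */)
    //     }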

    let tts: TokenStream = quote! {
        #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
        #[cfg_attr(not(target_arch = "wasm32"), test)]
        #[allow(non_snake_case)]
        fn #assert_name() {
            #to_test

            // Make sure that the shim is not removed by leaking it to unknown
            // code:
            unsafe { asm!("" : : "r"(#shim_name as usize) : "memory" : "volatile") };

            ::stdarch_test::assert(#shim_name as usize,
                                   stringify!(#shim_name),
                                   #instr);
        }
    };

    // why? necessary now to get tests to work?
    let tts: TokenStream = tts.to_string().parse().expect("cannot parse tokenstream");

    let tts: TokenStream = quote! {
        #item
        #tts
    };
    tts.into()
}

struct Invoc {
    instr: String,
    args: Vec<(syn::Ident, syn::Expr)>,
}
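
// The attribute grammar parsed into `Invoc` is roughly
//
//     #[assert_instr(<instr tokens>, <name> = <expr>, ...)]
//
// where `<instr tokens>` may mix identifiers, `.`, and string literals, so
// mnemonics like `vld1.64` (an illustrative example) parse as one instruction.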

impl syn::parse::Parse for Invoc {
    fn parse(input: syn::parse::ParseStream) -> syn::parse::Result<Self> {
        use syn::{ext::IdentExt, Token};

        let mut instr = String::new();
        while !input.is_empty() {
            if input.parse::<Token![,]>().is_ok() {
                break;
            }
            if let Ok(ident) = syn::Ident::parse_any(input) {
                instr.push_str(&ident.to_string());
                continue;
            }
            if input.parse::<Token![.]>().is_ok() {
                instr.push_str(".");
                continue;
            }
            if let Ok(s) = input.parse::<syn::LitStr>() {
                instr.push_str(&s.value());
                continue;
            }
            // Debugging aid: dump whatever tokens we couldn't make sense of
            // before bailing out.
            println!("{:?}", input.cursor().token_stream());
            return Err(input.error("expected an instruction"));
        }
        if instr.is_empty() {
            return Err(input.error("expected an instruction before comma"));
        }
        let mut args = Vec::new();
        while !input.is_empty() {
            let name = input.parse::<syn::Ident>()?;
            input.parse::<Token![=]>()?;
            let expr = input.parse::<syn::Expr>()?;
            args.push((name, expr));

            if input.parse::<Token![,]>().is_err() {
                if !input.is_empty() {
                    return Err(input.error("extra tokens at end"));
                }
                break;
            }
        }
        Ok(Self { instr, args })
    }
}

/// Helper that splices every element of an iterable (e.g. the filtered list
/// of `target`-prefixed attributes above) directly into a `quote!` invocation.
struct Append<T>(T);

impl<T> quote::ToTokens for Append<T>
where
    T: Clone + IntoIterator,
    T::Item: quote::ToTokens,
{
    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
        for item in self.0.clone() {
            item.to_tokens(tokens);
        }
    }
}