Commit Graph

2485 Commits

Author SHA1 Message Date
est31
5cb46aa089 Unstabilize some more #[doc(hidden)] items 2018-05-15 17:44:20 +02:00
est31
47c6ffa8c9 Make the Feature enum unstable (#445)
This is cleaner than just having a #[doc(hidden)] attribute
2018-05-15 09:49:07 -05:00
Steve Klabnik
755f78bce5 Remove notes about stability (#444)
This is already included with the regular stability attributes, and some of this is now stable, while some of it is not.
2018-05-14 11:10:31 -05:00
Alex Crichton
c781efd8d5 Pin wabt to a working revision (#440) 2018-05-06 12:01:01 -05:00
Pietro Lorefice
d8484f6bb5 Add support for x86 FMA extension (#439) 2018-05-06 11:57:37 -05:00
gnzlbg
c0bf5d9c42 Workarounds for all/any mask reductions on x86, armv7, and aarch64 (#425)
* Work arounds for LLVM6 code-gen bugs in all/any reductions

This commit adds workarounds for the mask reductions: `all` and `any`.

64-bit wide mask types (`m8x8`, `m16x4`, `m32x2`)

`x86_64` with `MMX` enabled

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
```

After this PR for `m8x8`, `m16x4`, `m32x2`:

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

x86` with `MMX` enabled

Before this PR:

```asm
all_8x8:
 call    L9$pb
L9$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
any_8x8:
 call    L8$pb
L8$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
```

After this PR:

```asm
all_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 ret
any_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 ret
```

`aarch64`

Before this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```

`ARMv7` + `neon`

Before this PR:

```asm
all_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vand    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vand    d0, d0, d1
 vdup.8  d1, d0[1]
 vand    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vorr    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vorr    d0, d0, d1
 vdup.8  d1, d0[1]
 vorr    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x8:
 vldr    d0, [r0]
 b       <m8x8 as All>::all

<m8x8 as All>::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b       m8x8::extract

any_8x8:
 vldr    d0, [r0]
 b       <m8x8 as Any>::any

<m8x8 as Any>::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b       m8x8::extract
```

(note: inlining does not work properly on ARMv7)

128-bit wide mask types (`m8x16`, `m16x8`, `m32x4`, `m64x2`)

`x86_64` with SSE2 enabled

Before this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 sete    al
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

`aarch64`

Before this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[8]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[9]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[10]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[11]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[12]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[13]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[14]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[15]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 umov    w9, v0.b[8]
 orr     w8, w8, w9
 umov    w9, v0.b[9]
 orr     w8, w8, w9
 umov    w9, v0.b[10]
 orr     w8, w8, w9
 umov    w9, v0.b[11]
 orr     w8, w8, w9
 umov    w9, v0.b[12]
 orr     w8, w8, w9
 umov    w9, v0.b[13]
 orr     w8, w8, w9
 umov    w9, v0.b[14]
 orr     w8, w8, w9
 umov    w9, v0.b[15]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x16:
 ldr     q0, [x0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```

 `ARMv7` + `neon`

Before this PR:

```asm
all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vand    q0, q0, q1
 vdup.8  q1, d0[1]
 vand    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vorr    q0, q0, q1
 vdup.8  q1, d0[1]
 vorr    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as All>::all

<m8x16 as All>::all:
 vpmin.u8 d0, d0, d
 b       <m8x8 as All>::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as Any>::any

<m8x16 as Any>::any:
 vpmax.u8 d0, d0, d1
 b       <m8x8 as Any>::any
```

The inlining problems are pretty bad on ARMv7 + NEON.

256-bit wide mask types (`m8x32`, `m16x16`, `m32x8`, `m64x4`)

With SSE2 enabled

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 movdqa  xmm2, xmmword, ptr, [rdi, +, 16]
 pand    xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand    xmm1, xmm2
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
 any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 por     xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa  xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand    xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 jne     LBB17_1
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov     al, 1
 cmp     ecx, 65535
 je      LBB17_3
LBB17_1:
 xor     eax, eax
LBB17_3:
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov     al, 1
 test    ecx, ecx
 je      LBB16_1
 pop     rbp
 ret
LBB16_1:
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

With AVX enabled

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps  ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vandps  ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vandps  ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps   ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vorps   ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vorps   ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps  xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest  ymm0, ymm1
 setb    al
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest  ymm0, ymm0
 setne   al
 pop     rbp
 vzeroupper
 ret
```

---

Closes #362 .

* test avx on all x86 targets

* disable assert_instr on avx test

* enable all appropriate features

* disable assert_instr on x86+avx

* the fn_must_use is stable

* fix nbody example on armv7

* fixup

* fixup

* enable 64-bit wide mask MMX optimizations on x86_64 only

* remove coresimd dependency on cfg_if

* allow wasm to fail

* use an env variable to disable assert_instr tests

* disable m32x2 mask MMX optimization on macos

* move cfg_if to coresimd/macros.rs
2018-05-04 16:03:45 -05:00
gnzlbg
9953f8e1ae Add missing From impls to {i,u}8x16. Closes #434 2018-04-28 13:51:19 +02:00
gnzlbg
30962e58e6 fix errors/warnings from the stabilization of cfg_target_feature and target_feature (#432)
* fix build after stabilization of cfg_target_feature and target_feature

* fix doc tests

* fix spurious unused_attributes warning

* fix more unused attribute warnings

* More unnecessary target features

* Remove no longer needed trait imports

* Remove fixed upstream workarounds

* Fix parsing the #[assert_instr] macro

Following upstream proc_macro changes

* Fix form and parsing of #[simd_test]

* Don't use Cargo features for testing modes

Instead use RUSTFLAGS with `--cfg`. This'll help us be compatible with the
latest Cargo where a tweak to workspaces and features made the previous
invocations we had invalid.

* Don't thread RUSTFLAGS through docker

* Re-gate on x86 verification

Closes #411
2018-04-26 21:54:15 -05:00
Guillaume Gomez
189283e76f remove rustdoc warnings 2018-04-19 23:29:05 +02:00
Alex Crichton
9b018d657b Yet more fixes for libstd... 2018-04-16 13:58:26 -07:00
Alex Crichton
6f8c6d9251 More libstd doc tweaks 2018-04-16 07:27:03 -07:00
Alex Crichton
e18fa0baf6 Shuffle around stdsimd::arch::detect bits (#428)
Compile more code on more platforms, tweak imports, try to catch mistakes
sooner.
2018-04-15 10:53:38 -05:00
Alex Crichton
1e7e606bd1 More fixes for stable libstd 2018-04-13 13:39:56 -07:00
Alex Crichton
ff62c6d638 Another fix for stable libstd 2018-04-13 08:58:12 -07:00
Alex Crichton
f650b93003 Stabilize x86/x86_64 intrinsics (#414)
This commit stabilizes all intrinsics in the `x86` and `x86_64` modules, namely
allowing stabilization of the `arch::x86` and `arch::x86_64` module in libstd.
Stabilizations here were applied in an automated fashion using [this
script][scr], and notably everything related to `__m64` was omitted from this
round of stabilization

[scr]: https://gist.github.com/alexcrichton/5b456d495d6fe1df46a158754565c7a5
2018-04-13 09:32:22 -05:00
gnzlbg
b89963711d add arm/aarch64 min/max intrinsics (#424) 2018-04-11 18:57:57 -05:00
gnzlbg
619a7ba19f fix vrsqrt on arm neon (#423)
Closes #383 .
2018-04-11 17:50:47 -05:00
gnzlbg
22b6bfc51c add missing from impls for u8x16 and i8x16 (#421) 2018-04-11 09:06:04 -05:00
gnzlbg
1218140901 implement vertical min/max ops (#418) 2018-04-06 13:50:58 -05:00
gnzlbg
bf11a67f0f remaining masks and select (#417) 2018-04-06 09:29:45 -05:00
gnzlbg
87ce896543 Documents arithmetic reduction semantics (#412)
* documents arithmetic reduction semantics
2018-04-05 19:36:04 +02:00
gnzlbg
80e6c726fb Enable fxsr (#415)
Re-enables fxsr
2018-04-05 16:56:58 +02:00
Alex Crichton
52f9198902 Fix compile errors in simd-test-macro 2018-04-03 07:34:02 -07:00
Alex Crichton
a3def97fc6 Bump dependencies on proc-macro2 2018-04-03 07:17:40 -07:00
gnzlbg
41abade2dc allow verify x86 to fail 2018-04-03 15:40:22 +02:00
gnzlbg
fa9a55105a upgrade formatting 2018-04-03 15:40:22 +02:00
gnzlbg
cae02b7fa0 update ubuntu version 2018-04-03 15:40:22 +02:00
gnzlbg
0239a1a0aa update intel SDE version 2018-04-03 15:40:22 +02:00
Jason Davies
4c3eed5d2d i128 is now stable. 2018-03-27 16:09:03 +02:00
Jason Davies
f5503198b8 rustfmt 2018-03-27 16:09:03 +02:00
Jason Davies
fd5fd85608 Add AArch64 SHA/AES. 2018-03-27 16:09:03 +02:00
gnzlbg
273fc1c344 endian-dependent conversions to/from tuples tests (#400) 2018-03-23 14:12:59 -05:00
gnzlbg
6ce3b9bbba add test for arrays/unions (#399) 2018-03-23 10:49:20 -05:00
Alex Crichton
aafe6ebb75 Fix default cargo test experience (#397)
Turns out Cargo doesn't automatically set `TARGET` for rustc invocations so
carry it forward manually from the build script over to the rustc invocation.
2018-03-22 17:40:44 -05:00
Alex Crichton
fa924e754d Fix doc builds 2018-03-22 11:43:05 -07:00
Alex Crichton
874829e4a9 rustfmt 2018-03-22 11:34:50 -07:00
Jason Davies
de82d9d26b Add support for Intel SHA extensions. (#395) 2018-03-22 13:32:44 -05:00
gnzlbg
56d9a42a2f add tests for endian-dependent behavior (#394)
* add tests for endian-dependent behavior

* format
2018-03-22 11:09:01 -05:00
gnzlbg
db819cb932 remove clone impls (#389)
Closes #386 .
2018-03-20 10:14:26 -05:00
gnzlbg
ff53ec6cb2 add arm neon vector types (#384) 2018-03-20 09:11:50 -05:00
gnzlbg
5f77210b34 add vector/scalar ops (#381) 2018-03-19 15:08:27 -05:00
gnzlbg
0dc39beed6 add llvm bug for reductions on aarch64 (#385) 2018-03-19 09:48:04 -05:00
gnzlbg
f107499c51 add missing fmt implementations (#380) 2018-03-19 09:44:57 -05:00
gnzlbg
68c53c1e55 Split protable vector types tests into multiple crates (#379)
* split the portable vector tests into separate crates

* use rustc reductions
2018-03-18 10:55:20 -05:00
Alex Crichton
44763c853d Fix tests on nightly (#378) 2018-03-16 13:06:07 -05:00
gnzlbg
2762e2ca9a [mips/mips64: msa] add add_a_b intrinsic (#365)
* [mips64/msa] add add_a_b intrinsic

* add make/file to mips64el's Dockerfile

* add run-time detection support for mips64

* add mips64 build bot

* generate docs for mips64

* fix linux test

* cleanup rt-detection

* support mips64/mips64el in stdsimd-test

* support asserting instructions with  in their name

* better error msgs for the auxv_crate test

* debug auxv on mips64

* override run-time detection on mips msa tests

* remove unused #[macro_use]

* try another MIPS cpu

* detect default TARGET in simd-test-macro

* use mips64r2-generic

* disable unused function in mips tests

* move msa to mips

* remove mips from ci

* split into mips and mips64 modules

* add rt-detection for 32-bit mips

* fmt

* remove merge error

* add norun build bots for mips

* add -p to avoid changing the cwd

* fixup

* refactor run-time detection module
2018-03-10 12:22:54 -06:00
QuietMisdreavus
63b540b07a rustfmt 2018-03-10 00:04:01 +01:00
QuietMisdreavus
ef0d02d04b document all arches when part of std
unfortunately, stdsimd's version of the documentation will be blanked
out in favor of coresimd's version, but coresimd (when re-exported in
libcore) will include all the arches
2018-03-10 00:04:01 +01:00
Alex Crichton
cb4a957efd Add initial wasm memory grow/current intrinsics (#361)
This exposes access to the `grow_memory` and `current_memory` instructions
provided by wasm in what will hopefully be a stable interface (the stable part
being x86 first in theory).
2018-03-09 09:21:08 -06:00
Jake Goulding
2b1ee5288f Fix unsigned typo (#359) 2018-03-08 10:29:09 -06:00