Auto merge of #107634 - scottmcm:array-drain, r=thomcc

Improve the `array::map` codegen The `map` method on arrays [is documented as sometimes performing poorly](https://doc.rust-lang.org/std/primitive.array.html#note-on-performance-and-stack-usage), and after [a question on URLO](https://users.rust-lang.org/t/try-trait-residual-o-trait-and-try-collect-into-array/88510?u=scottmcm) prompted me to take another look at the core [`try_collect_into_array`](7c46fb2111/library/core/src/array/mod.rs (L865-L912)) function, I had some ideas that ended up working better than I'd expected. There's three main ideas in here, split over three commits: 1. Don't use `array::IntoIter` when we can avoid it, since that seems to not get SRoA'd, meaning that every step writes things like loop counters into the stack unnecessarily 2. Don't return arrays in `Result`s unnecessarily, as that doesn't seem to optimize away even with `unwrap_unchecked` (perhaps because it needs to get moved into a new LLVM type to account for the discriminant) 3. Don't distract LLVM with all the `Option` dances when we know for sure we have enough items (like in `map` and `zip`). This one's a larger commit as to do it I ended up adding a new `pub(crate)` trait, but hopefully those changes are still straight-forward. (No libs-api changes; everything should be completely implementation-detail-internal.) It's still not completely fixed -- I think it needs pcwalton's `memcpy` optimizations still (#103830) to get further -- but this seems to go much better than before. And the remaining `memcpy`s are just `transmute`-equivalent (`[T; N] -> ManuallyDrop<[T; N]>` and `[MaybeUninit<T>; N] -> [T; N]`), so hopefully those will be easier to remove with LLVM16 than the previous subobject copies 🤞 r? `@thomcc` As a simple example, this test ```rust pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] { x.map(|x| 13 * x + 7) } ``` On nightly <https://rust.godbolt.org/z/xK7548TGj> takes `sub rsp, 808` ```llvm start: %array.i.i.i.i = alloca [64 x i32], align 4 %_3.sroa.5.i.i.i = alloca [65 x i32], align 4 %_5.i = alloca %"core::iter::adapters::map::Map<core::array::iter::IntoIter<u32, 64>, [closure@/app/example.rs:2:11: 2:14]>", align 8 ``` (and yes, that's a 6**5**-element array `alloca` despite 6**4**-element input and output) But with this PR it's only `sub rsp, 520` ```llvm start: %array.i.i.i.i.i.i = alloca [64 x i32], align 4 %array1.i.i.i = alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>", align 4 ``` Similarly, the loop it emits on nightly is scalar-only and horrifying ```nasm .LBB0_1: mov esi, 64 mov edi, 0 cmp rdx, 64 je .LBB0_3 lea rsi, [rdx + 1] mov qword ptr [rsp + 784], rsi mov r8d, dword ptr [rsp + 4*rdx + 528] mov edi, 1 lea edx, [r8 + 2*r8] lea r8d, [r8 + 4*rdx] add r8d, 7 .LBB0_3: test edi, edi je .LBB0_11 mov dword ptr [rsp + 4*rcx + 272], r8d cmp rsi, 64 jne .LBB0_6 xor r8d, r8d mov edx, 64 test r8d, r8d jne .LBB0_8 jmp .LBB0_11 .LBB0_6: lea rdx, [rsi + 1] mov qword ptr [rsp + 784], rdx mov edi, dword ptr [rsp + 4*rsi + 528] mov r8d, 1 lea esi, [rdi + 2*rdi] lea edi, [rdi + 4*rsi] add edi, 7 test r8d, r8d je .LBB0_11 .LBB0_8: mov dword ptr [rsp + 4*rcx + 276], edi add rcx, 2 cmp rcx, 64 jne .LBB0_1 ``` whereas with this PR it's unrolled and vectorized ```nasm vpmulld ymm1, ymm0, ymmword ptr [rsp + 64] vpaddd ymm1, ymm1, ymm2 vmovdqu ymmword ptr [rsp + 328], ymm1 vpmulld ymm1, ymm0, ymmword ptr [rsp + 96] vpaddd ymm1, ymm1, ymm2 vmovdqu ymmword ptr [rsp + 360], ymm1 ``` (though sadly still stack-to-stack)
2023-02-13 10:18:48 +00:00
parent 20081880ad bb77860d9c
commit 2d91939bb7
16 changed files with 404 additions and 151 deletions
--- a/library/core/src/array/drain.rs
+++ b/library/core/src/array/drain.rs
@@ -0,0 +1,76 @@
+use crate::iter::{TrustedLen, UncheckedIterator};
+use crate::mem::ManuallyDrop;
+use crate::ptr::drop_in_place;
+use crate::slice;
+
+/// A situationally-optimized version of `array.into_iter().for_each(func)`.
+///
+/// [`crate::array::IntoIter`]s are great when you need an owned iterator, but
+/// storing the entire array *inside* the iterator like that can sometimes
+/// pessimize code.  Notable, it can be more bytes than you really want to move
+/// around, and because the array accesses index into it SRoA has a harder time
+/// optimizing away the type than it does iterators that just hold a couple pointers.
+///
+/// Thus this function exists, which gives a way to get *moved* access to the
+/// elements of an array using a small iterator -- no bigger than a slice iterator.
+///
+/// The function-taking-a-closure structure makes it safe, as it keeps callers
+/// from looking at already-dropped elements.
+pub(crate) fn drain_array_with<T, R, const N: usize>(
+    array: [T; N],
+    func: impl for<'a> FnOnce(Drain<'a, T>) -> R,
+) -> R {
+    let mut array = ManuallyDrop::new(array);
+    // SAFETY: Now that the local won't drop it, it's ok to construct the `Drain` which will.
+    let drain = Drain(array.iter_mut());
+    func(drain)
+}
+
+/// See [`drain_array_with`] -- this is `pub(crate)` only so it's allowed to be
+/// mentioned in the signature of that method.  (Otherwise it hits `E0446`.)
+// INVARIANT: It's ok to drop the remainder of the inner iterator.
+pub(crate) struct Drain<'a, T>(slice::IterMut<'a, T>);
+
+impl<T> Drop for Drain<'_, T> {
+    fn drop(&mut self) {
+        // SAFETY: By the type invariant, we're allowed to drop all these.
+        unsafe { drop_in_place(self.0.as_mut_slice()) }
+    }
+}
+
+impl<T> Iterator for Drain<'_, T> {
+    type Item = T;
+
+    #[inline]
+    fn next(&mut self) -> Option<T> {
+        let p: *const T = self.0.next()?;
+        // SAFETY: The iterator was already advanced, so we won't drop this later.
+        Some(unsafe { p.read() })
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let n = self.len();
+        (n, Some(n))
+    }
+}
+
+impl<T> ExactSizeIterator for Drain<'_, T> {
+    #[inline]
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+}
+
+// SAFETY: This is a 1:1 wrapper for a slice iterator, which is also `TrustedLen`.
+unsafe impl<T> TrustedLen for Drain<'_, T> {}
+
+impl<T> UncheckedIterator for Drain<'_, T> {
+    unsafe fn next_unchecked(&mut self) -> T {
+        // SAFETY: `Drain` is 1:1 with the inner iterator, so if the caller promised
+        // that there's an element left, the inner iterator has one too.
+        let p: *const T = unsafe { self.0.next_unchecked() };
+        // SAFETY: The iterator was already advanced, so we won't drop this later.
+        unsafe { p.read() }
+    }
+}
--- a/library/core/src/array/mod.rs
+++ b/library/core/src/array/mod.rs
@@ -10,16 +10,19 @@ use crate::convert::{Infallible, TryFrom};
 use crate::error::Error;
 use crate::fmt;
 use crate::hash::{self, Hash};
-use crate::iter::TrustedLen;
+use crate::iter::UncheckedIterator;
 use crate::mem::{self, MaybeUninit};
 use crate::ops::{
    ChangeOutputType, ControlFlow, FromResidual, Index, IndexMut, NeverShortCircuit, Residual, Try,
 };
 use crate::slice::{Iter, IterMut};

+mod drain;
 mod equality;
 mod iter;

+pub(crate) use drain::drain_array_with;
+
 #[stable(feature = "array_value_iter", since = "1.51.0")]
 pub use iter::IntoIter;

@@ -52,16 +55,11 @@ pub use iter::IntoIter;
 /// ```
 #[inline]
 #[stable(feature = "array_from_fn", since = "1.63.0")]
-pub fn from_fn<T, const N: usize, F>(mut cb: F) -> [T; N]
+pub fn from_fn<T, const N: usize, F>(cb: F) -> [T; N]
 where
    F: FnMut(usize) -> T,
 {
-    let mut idx = 0;
-    [(); N].map(|_| {
-        let res = cb(idx);
-        idx += 1;
-        res
-    })
+    try_from_fn(NeverShortCircuit::wrap_mut_1(cb)).0
 }

 /// Creates an array `[T; N]` where each fallible array element `T` is returned by the `cb` call.
@@ -101,9 +99,14 @@ where
    R: Try,
    R::Residual: Residual<[R::Output; N]>,
 {
-    // SAFETY: we know for certain that this iterator will yield exactly `N`
-    // items.
-    unsafe { try_collect_into_array_unchecked(&mut (0..N).map(cb)) }
+    let mut array = MaybeUninit::uninit_array::<N>();
+    match try_from_fn_erased(&mut array, cb) {
+        ControlFlow::Break(r) => FromResidual::from_residual(r),
+        ControlFlow::Continue(()) => {
+            // SAFETY: All elements of the array were populated.
+            try { unsafe { MaybeUninit::array_assume_init(array) } }
+        }
+    }
 }

 /// Converts a reference to `T` into a reference to an array of length 1 (without copying).
@@ -414,9 +417,7 @@ trait SpecArrayClone: Clone {
 impl<T: Clone> SpecArrayClone for T {
    #[inline]
    default fn clone<const N: usize>(array: &[T; N]) -> [T; N] {
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { collect_into_array_unchecked(&mut array.iter().cloned()) }
+        from_trusted_iterator(array.iter().cloned())
    }
 }

@@ -500,9 +501,7 @@ impl<T, const N: usize> [T; N] {
    where
        F: FnMut(T) -> U,
    {
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { collect_into_array_unchecked(&mut IntoIterator::into_iter(self).map(f)) }
+        self.try_map(NeverShortCircuit::wrap_mut_1(f)).0
    }

    /// A fallible function `f` applied to each element on array `self` in order to
@@ -539,9 +538,7 @@ impl<T, const N: usize> [T; N] {
        R: Try,
        R::Residual: Residual<[R::Output; N]>,
    {
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { try_collect_into_array_unchecked(&mut IntoIterator::into_iter(self).map(f)) }
+        drain_array_with(self, |iter| try_from_trusted_iterator(iter.map(f)))
    }

    /// 'Zips up' two arrays into a single array of pairs.
@@ -562,11 +559,9 @@ impl<T, const N: usize> [T; N] {
    /// ```
    #[unstable(feature = "array_zip", issue = "80094")]
    pub fn zip<U>(self, rhs: [U; N]) -> [(T, U); N] {
-        let mut iter = IntoIterator::into_iter(self).zip(rhs);
-
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { collect_into_array_unchecked(&mut iter) }
+        drain_array_with(self, |lhs| {
+            drain_array_with(rhs, |rhs| from_trusted_iterator(crate::iter::zip(lhs, rhs)))
+        })
    }

    /// Returns a slice containing the entire array. Equivalent to `&s[..]`.
@@ -613,9 +608,7 @@ impl<T, const N: usize> [T; N] {
    /// ```
    #[unstable(feature = "array_methods", issue = "76118")]
    pub fn each_ref(&self) -> [&T; N] {
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { collect_into_array_unchecked(&mut self.iter()) }
+        from_trusted_iterator(self.iter())
    }

    /// Borrows each element mutably and returns an array of mutable references
@@ -635,9 +628,7 @@ impl<T, const N: usize> [T; N] {
    /// ```
    #[unstable(feature = "array_methods", issue = "76118")]
    pub fn each_mut(&mut self) -> [&mut T; N] {
-        // SAFETY: we know for certain that this iterator will yield exactly `N`
-        // items.
-        unsafe { collect_into_array_unchecked(&mut self.iter_mut()) }
+        from_trusted_iterator(self.iter_mut())
    }

    /// Divides one array reference into two at an index.
@@ -797,105 +788,71 @@ impl<T, const N: usize> [T; N] {
    }
 }

-/// Pulls `N` items from `iter` and returns them as an array. If the iterator
-/// yields fewer than `N` items, this function exhibits undefined behavior.
+/// Populate an array from the first `N` elements of `iter`
 ///
-/// See [`try_collect_into_array`] for more information.
+/// # Panics
 ///
+/// If the iterator doesn't actually have enough items.
 ///
-/// # Safety
-///
-/// It is up to the caller to guarantee that `iter` yields at least `N` items.
-/// Violating this condition causes undefined behavior.
-unsafe fn try_collect_into_array_unchecked<I, T, R, const N: usize>(iter: &mut I) -> R::TryType
-where
-    // Note: `TrustedLen` here is somewhat of an experiment. This is just an
-    // internal function, so feel free to remove if this bound turns out to be a
-    // bad idea. In that case, remember to also remove the lower bound
-    // `debug_assert!` below!
-    I: Iterator + TrustedLen,
-    I::Item: Try<Output = T, Residual = R>,
-    R: Residual<[T; N]>,
-{
-    debug_assert!(N <= iter.size_hint().1.unwrap_or(usize::MAX));
-    debug_assert!(N <= iter.size_hint().0);
-
-    // SAFETY: covered by the function contract.
-    unsafe { try_collect_into_array(iter).unwrap_unchecked() }
-}
-
-// Infallible version of `try_collect_into_array_unchecked`.
-unsafe fn collect_into_array_unchecked<I, const N: usize>(iter: &mut I) -> [I::Item; N]
-where
-    I: Iterator + TrustedLen,
-{
-    let mut map = iter.map(NeverShortCircuit);
-
-    // SAFETY: The same safety considerations w.r.t. the iterator length
-    // apply for `try_collect_into_array_unchecked` as for
-    // `collect_into_array_unchecked`
-    match unsafe { try_collect_into_array_unchecked(&mut map) } {
-        NeverShortCircuit(array) => array,
-    }
-}
-
-/// Pulls `N` items from `iter` and returns them as an array. If the iterator
-/// yields fewer than `N` items, `Err` is returned containing an iterator over
-/// the already yielded items.
-///
-/// Since the iterator is passed as a mutable reference and this function calls
-/// `next` at most `N` times, the iterator can still be used afterwards to
-/// retrieve the remaining items.
-///
-/// If `iter.next()` panicks, all items already yielded by the iterator are
-/// dropped.
+/// By depending on `TrustedLen`, however, we can do that check up-front (where
+/// it easily optimizes away) so it doesn't impact the loop that fills the array.
 #[inline]
-fn try_collect_into_array<I, T, R, const N: usize>(
-    iter: &mut I,
-) -> Result<R::TryType, IntoIter<T, N>>
+fn from_trusted_iterator<T, const N: usize>(iter: impl UncheckedIterator<Item = T>) -> [T; N] {
+    try_from_trusted_iterator(iter.map(NeverShortCircuit)).0
+}
+
+#[inline]
+fn try_from_trusted_iterator<T, R, const N: usize>(
+    iter: impl UncheckedIterator<Item = R>,
+) -> ChangeOutputType<R, [T; N]>
 where
-    I: Iterator,
-    I::Item: Try<Output = T, Residual = R>,
-    R: Residual<[T; N]>,
+    R: Try<Output = T>,
+    R::Residual: Residual<[T; N]>,
 {
-    if N == 0 {
-        // SAFETY: An empty array is always inhabited and has no validity invariants.
-        return Ok(Try::from_output(unsafe { mem::zeroed() }));
-    }
-
-    let mut array = MaybeUninit::uninit_array::<N>();
-    let mut guard = Guard { array_mut: &mut array, initialized: 0 };
-
-    for _ in 0..N {
-        match iter.next() {
-            Some(item_rslt) => {
-                let item = match item_rslt.branch() {
-                    ControlFlow::Break(r) => {
-                        return Ok(FromResidual::from_residual(r));
-                    }
-                    ControlFlow::Continue(elem) => elem,
-                };
-
-                // SAFETY: `guard.initialized` starts at 0, which means push can be called
-                // at most N times, which this loop does.
-                unsafe {
-                    guard.push_unchecked(item);
-                }
-            }
-            None => {
-                let alive = 0..guard.initialized;
-                mem::forget(guard);
-                // SAFETY: `array` was initialized with exactly `initialized`
-                // number of elements.
-                return Err(unsafe { IntoIter::new_unchecked(array, alive) });
-            }
+    assert!(iter.size_hint().0 >= N);
+    fn next<T>(mut iter: impl UncheckedIterator<Item = T>) -> impl FnMut(usize) -> T {
+        move |_| {
+            // SAFETY: We know that `from_fn` will call this at most N times,
+            // and we checked to ensure that we have at least that many items.
+            unsafe { iter.next_unchecked() }
        }
    }

+    try_from_fn(next(iter))
+}
+
+/// Version of [`try_from_fn`] using a passed-in slice in order to avoid
+/// needing to monomorphize for every array length.
+///
+/// This takes a generator rather than an iterator so that *at the type level*
+/// it never needs to worry about running out of items.  When combined with
+/// an infallible `Try` type, that means the loop canonicalizes easily, allowing
+/// it to optimize well.
+///
+/// It would be *possible* to unify this and [`iter_next_chunk_erased`] into one
+/// function that does the union of both things, but last time it was that way
+/// it resulted in poor codegen from the "are there enough source items?" checks
+/// not optimizing away.  So if you give it a shot, make sure to watch what
+/// happens in the codegen tests.
+#[inline]
+fn try_from_fn_erased<T, R>(
+    buffer: &mut [MaybeUninit<T>],
+    mut generator: impl FnMut(usize) -> R,
+) -> ControlFlow<R::Residual>
+where
+    R: Try<Output = T>,
+{
+    let mut guard = Guard { array_mut: buffer, initialized: 0 };
+
+    while guard.initialized < guard.array_mut.len() {
+        let item = generator(guard.initialized).branch()?;
+
+        // SAFETY: The loop condition ensures we have space to push the item
+        unsafe { guard.push_unchecked(item) };
+    }
+
    mem::forget(guard);
-    // SAFETY: All elements of the array were populated in the loop above.
-    let output = unsafe { array.transpose().assume_init() };
-    Ok(Try::from_output(output))
+    ControlFlow::Continue(())
 }

 /// Panic guard for incremental initialization of arrays.
@@ -909,14 +866,14 @@ where
 ///
 /// To minimize indirection fields are still pub but callers should at least use
 /// `push_unchecked` to signal that something unsafe is going on.
-pub(crate) struct Guard<'a, T, const N: usize> {
+struct Guard<'a, T> {
    /// The array to be initialized.
-    pub array_mut: &'a mut [MaybeUninit<T>; N],
+    pub array_mut: &'a mut [MaybeUninit<T>],
    /// The number of items that have been initialized so far.
    pub initialized: usize,
 }

-impl<T, const N: usize> Guard<'_, T, N> {
+impl<T> Guard<'_, T> {
    /// Adds an item to the array and updates the initialized item counter.
    ///
    /// # Safety
@@ -934,28 +891,73 @@ impl<T, const N: usize> Guard<'_, T, N> {
    }
 }

-impl<T, const N: usize> Drop for Guard<'_, T, N> {
+impl<T> Drop for Guard<'_, T> {
    fn drop(&mut self) {
-        debug_assert!(self.initialized <= N);
+        debug_assert!(self.initialized <= self.array_mut.len());

        // SAFETY: this slice will contain only initialized objects.
        unsafe {
            crate::ptr::drop_in_place(MaybeUninit::slice_assume_init_mut(
-                &mut self.array_mut.get_unchecked_mut(..self.initialized),
+                self.array_mut.get_unchecked_mut(..self.initialized),
            ));
        }
    }
 }

-/// Returns the next chunk of `N` items from the iterator or errors with an
-/// iterator over the remainder. Used for `Iterator::next_chunk`.
+/// Pulls `N` items from `iter` and returns them as an array. If the iterator
+/// yields fewer than `N` items, `Err` is returned containing an iterator over
+/// the already yielded items.
+///
+/// Since the iterator is passed as a mutable reference and this function calls
+/// `next` at most `N` times, the iterator can still be used afterwards to
+/// retrieve the remaining items.
+///
+/// If `iter.next()` panicks, all items already yielded by the iterator are
+/// dropped.
+///
+/// Used for [`Iterator::next_chunk`].
 #[inline]
-pub(crate) fn iter_next_chunk<I, const N: usize>(
-    iter: &mut I,
-) -> Result<[I::Item; N], IntoIter<I::Item, N>>
-where
-    I: Iterator,
-{
-    let mut map = iter.map(NeverShortCircuit);
-    try_collect_into_array(&mut map).map(|NeverShortCircuit(arr)| arr)
+pub(crate) fn iter_next_chunk<T, const N: usize>(
+    iter: &mut impl Iterator<Item = T>,
+) -> Result<[T; N], IntoIter<T, N>> {
+    let mut array = MaybeUninit::uninit_array::<N>();
+    let r = iter_next_chunk_erased(&mut array, iter);
+    match r {
+        Ok(()) => {
+            // SAFETY: All elements of `array` were populated.
+            Ok(unsafe { MaybeUninit::array_assume_init(array) })
+        }
+        Err(initialized) => {
+            // SAFETY: Only the first `initialized` elements were populated
+            Err(unsafe { IntoIter::new_unchecked(array, 0..initialized) })
+        }
+    }
+}
+
+/// Version of [`iter_next_chunk`] using a passed-in slice in order to avoid
+/// needing to monomorphize for every array length.
+///
+/// Unfortunately this loop has two exit conditions, the buffer filling up
+/// or the iterator running out of items, making it tend to optimize poorly.
+#[inline]
+fn iter_next_chunk_erased<T>(
+    buffer: &mut [MaybeUninit<T>],
+    iter: &mut impl Iterator<Item = T>,
+) -> Result<(), usize> {
+    let mut guard = Guard { array_mut: buffer, initialized: 0 };
+    while guard.initialized < guard.array_mut.len() {
+        let Some(item) = iter.next() else {
+            // Unlike `try_from_fn_erased`, we want to keep the partial results,
+            // so we need to defuse the guard instead of using `?`.
+            let initialized = guard.initialized;
+            mem::forget(guard);
+            return Err(initialized)
+        };
+
+        // SAFETY: The loop condition ensures we have space to push the item
+        unsafe { guard.push_unchecked(item) };
+    }
+
+    mem::forget(guard);
+    Ok(())
 }