Auto merge of #107833 - Zoxc:arena-query-clean, r=cjgillot
Factor query arena allocation out from query caches

This moves the logic for arena allocation out from the query caches into conditional code in the query system. The specialized arena caches are removed. A new `QuerySystem` type is added in `rustc_middle` which contains the arenas, providers and query caches.

Performance seems to be slightly regressed:

| Benchmark | Before | After | % |
| --- | ---: | ---: | ---: |
| 🟣 **clap**:check | 1.8053s | 1.8109s | 0.31% |
| 🟣 **hyper**:check | 0.2600s | 0.2597s | -0.10% |
| 🟣 **regex**:check | 0.9973s | 1.0006s | 0.34% |
| 🟣 **syn**:check | 1.6048s | 1.6051s | 0.02% |
| 🟣 **syntex_syntax**:check | 6.2992s | 6.3159s | 0.26% |
| Total | 10.9664s | 10.9922s | 0.23% |
| Summary | 1.0000s | 1.0017s | 0.17% |

Incremental performance is a bit worse:

| Benchmark | Before | After | % |
| --- | ---: | ---: | ---: |
| 🟣 **clap**:check:initial | 2.2103s | 2.2247s | 0.65% |
| 🟣 **hyper**:check:initial | 0.3335s | 0.3349s | 0.41% |
| 🟣 **regex**:check:initial | 1.2597s | 1.2650s | 0.42% |
| 🟣 **syn**:check:initial | 2.0521s | 2.0613s | 0.45% |
| 🟣 **syntex_syntax**:check:initial | 7.8275s | 7.8583s | 0.39% |
| Total | 13.6832s | 13.7442s | 0.45% |
| Summary | 1.0000s | 1.0046s | 0.46% |

It does seem like the LLVM optimizers struggle a bit with the current state of the query system.

Based on top of https://github.com/rust-lang/rust/pull/107782 and https://github.com/rust-lang/rust/pull/107802.

r? `@cjgillot`
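To make the shape of the change concrete, here is a minimal sketch under stated assumptions, not the rustc implementation: after this change the cache trait only traffics in `Copy` values and `complete` no longer hands back a "stored" form of the value; the caller (standing in for the query system) decides whether to arena-allocate a result before caching the reference. `PlainCache`, `run_query`, and `Box::leak` (used here as a stand-in for rustc's `WorkerLocal<TypedArena<_>>`) are illustrative names, not code from this PR.

```rust
use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Mutex;

// A cache stores `Copy` values; `complete` just records the value instead of
// returning a separate "stored" form of it.
trait QueryCache {
    type Key: Eq + Hash;
    type Value: Copy;
    fn lookup(&self, key: &Self::Key) -> Option<Self::Value>;
    fn complete(&self, key: Self::Key, value: Self::Value);
}

// One plain map-backed cache; no specialized arena variant is needed.
struct PlainCache<K, V>(Mutex<HashMap<K, V>>);

impl<K: Eq + Hash, V: Copy> QueryCache for PlainCache<K, V> {
    type Key = K;
    type Value = V;

    fn lookup(&self, key: &K) -> Option<V> {
        self.0.lock().unwrap().get(key).copied()
    }

    fn complete(&self, key: K, value: V) {
        self.0.lock().unwrap().insert(key, value);
    }
}

// The caller decides to arena-allocate the provider's result and caches the
// reference, which is itself a `Copy` value. `Box::leak` stands in for a real
// arena allocator here.
fn run_query<K: Eq + Hash, R>(
    cache: &PlainCache<K, &'static R>,
    key: K,
    provider: impl FnOnce(&K) -> R,
) -> &'static R {
    if let Some(cached) = cache.lookup(&key) {
        return cached;
    }
    let value: &'static R = Box::leak(Box::new(provider(&key)));
    cache.complete(key, value);
    value
}

fn main() {
    let cache: PlainCache<u32, &'static String> = PlainCache(Mutex::new(HashMap::new()));
    let first = run_query(&cache, 1, |k| format!("result for {k}"));
    // The second call hits the cache, so the provider is never invoked.
    let second = run_query(&cache, 1, |_| unreachable!("already cached"));
    assert!(std::ptr::eq(first, second));
    println!("{first}");
}
```

With the `Stored` associated type gone, arena-cached and plain queries can share the same cache types; the only difference is whether the value that ends up in the cache is the result itself or a reference to an arena-allocated copy.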
@@ -1,12 +1,10 @@
 use crate::dep_graph::DepNodeIndex;

-use rustc_arena::TypedArena;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sharded;
 #[cfg(parallel_compiler)]
 use rustc_data_structures::sharded::Sharded;
 use rustc_data_structures::sync::Lock;
-use rustc_data_structures::sync::WorkerLocal;
 use rustc_index::vec::{Idx, IndexVec};
 use std::fmt::Debug;
 use std::hash::Hash;
@@ -16,12 +14,10 @@ pub trait CacheSelector<'tcx, V> {
     type Cache
     where
        V: Copy;
-    type ArenaCache;
 }

 pub trait QueryStorage {
-    type Value: Debug;
-    type Stored: Copy;
+    type Value: Copy;
 }

 pub trait QueryCache: QueryStorage + Sized {
@@ -31,9 +27,9 @@ pub trait QueryCache: QueryStorage + Sized {
     /// It returns the shard index and a lock guard to the shard,
     /// which will be used if the query is not in the cache and we need
     /// to compute it.
-    fn lookup(&self, key: &Self::Key) -> Option<(Self::Stored, DepNodeIndex)>;
+    fn lookup(&self, key: &Self::Key) -> Option<(Self::Value, DepNodeIndex)>;

-    fn complete(&self, key: Self::Key, value: Self::Value, index: DepNodeIndex) -> Self::Stored;
+    fn complete(&self, key: Self::Key, value: Self::Value, index: DepNodeIndex);

     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex));
 }
@@ -44,7 +40,6 @@ impl<'tcx, K: Eq + Hash, V: 'tcx> CacheSelector<'tcx, V> for DefaultCacheSelecto
     type Cache = DefaultCache<K, V>
     where
        V: Copy;
-    type ArenaCache = ArenaCache<'tcx, K, V>;
 }

 pub struct DefaultCache<K, V> {
@@ -62,7 +57,6 @@ impl<K, V> Default for DefaultCache<K, V> {

 impl<K: Eq + Hash, V: Copy + Debug> QueryStorage for DefaultCache<K, V> {
     type Value = V;
-    type Stored = V;
 }

 impl<K, V> QueryCache for DefaultCache<K, V>
@@ -85,7 +79,7 @@ where
     }

     #[inline]
-    fn complete(&self, key: K, value: V, index: DepNodeIndex) -> Self::Stored {
+    fn complete(&self, key: K, value: V, index: DepNodeIndex) {
         #[cfg(parallel_compiler)]
         let mut lock = self.cache.get_shard_by_value(&key).lock();
         #[cfg(not(parallel_compiler))]
@@ -93,7 +87,6 @@ where
         // We may be overwriting another value. This is all right, since the dep-graph
         // will check that the fingerprint matches.
         lock.insert(key, (value, index));
-        value
     }

     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
@@ -122,7 +115,6 @@ impl<'tcx, V: 'tcx> CacheSelector<'tcx, V> for SingleCacheSelector {
     type Cache = SingleCache<V>
     where
        V: Copy;
-    type ArenaCache = ArenaCache<'tcx, (), V>;
 }

 pub struct SingleCache<V> {
@@ -137,7 +129,6 @@ impl<V> Default for SingleCache<V> {

 impl<V: Copy + Debug> QueryStorage for SingleCache<V> {
     type Value = V;
-    type Stored = V;
 }

 impl<V> QueryCache for SingleCache<V>
@@ -152,9 +143,8 @@ where
     }

     #[inline]
-    fn complete(&self, _key: (), value: V, index: DepNodeIndex) -> Self::Stored {
+    fn complete(&self, _key: (), value: V, index: DepNodeIndex) {
         *self.cache.lock() = Some((value, index));
-        value
     }

     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
@@ -162,85 +152,12 @@ where
     }
 }

-pub struct ArenaCache<'tcx, K, V> {
-    arena: WorkerLocal<TypedArena<(V, DepNodeIndex)>>,
-    #[cfg(parallel_compiler)]
-    cache: Sharded<FxHashMap<K, &'tcx (V, DepNodeIndex)>>,
-    #[cfg(not(parallel_compiler))]
-    cache: Lock<FxHashMap<K, &'tcx (V, DepNodeIndex)>>,
-}
-
-impl<'tcx, K, V> Default for ArenaCache<'tcx, K, V> {
-    fn default() -> Self {
-        ArenaCache { arena: WorkerLocal::new(|_| TypedArena::default()), cache: Default::default() }
-    }
-}
-
-impl<'tcx, K: Eq + Hash, V: Debug + 'tcx> QueryStorage for ArenaCache<'tcx, K, V> {
-    type Value = V;
-    type Stored = &'tcx V;
-}
-
-impl<'tcx, K, V: 'tcx> QueryCache for ArenaCache<'tcx, K, V>
-where
-    K: Eq + Hash + Clone + Debug,
-    V: Debug,
-{
-    type Key = K;
-
-    #[inline(always)]
-    fn lookup(&self, key: &K) -> Option<(&'tcx V, DepNodeIndex)> {
-        let key_hash = sharded::make_hash(key);
-        #[cfg(parallel_compiler)]
-        let lock = self.cache.get_shard_by_hash(key_hash).lock();
-        #[cfg(not(parallel_compiler))]
-        let lock = self.cache.lock();
-        let result = lock.raw_entry().from_key_hashed_nocheck(key_hash, key);
-
-        if let Some((_, value)) = result { Some((&value.0, value.1)) } else { None }
-    }
-
-    #[inline]
-    fn complete(&self, key: K, value: V, index: DepNodeIndex) -> Self::Stored {
-        let value = self.arena.alloc((value, index));
-        let value = unsafe { &*(value as *const _) };
-        #[cfg(parallel_compiler)]
-        let mut lock = self.cache.get_shard_by_value(&key).lock();
-        #[cfg(not(parallel_compiler))]
-        let mut lock = self.cache.lock();
-        // We may be overwriting another value. This is all right, since the dep-graph
-        // will check that the fingerprint matches.
-        lock.insert(key, value);
-        &value.0
-    }
-
-    fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        #[cfg(parallel_compiler)]
-        {
-            let shards = self.cache.lock_shards();
-            for shard in shards.iter() {
-                for (k, v) in shard.iter() {
-                    f(k, &v.0, v.1);
-                }
-            }
-        }
-        #[cfg(not(parallel_compiler))]
-        {
-            let map = self.cache.lock();
-            for (k, v) in map.iter() {
-                f(k, &v.0, v.1);
-            }
-        }
-    }
-}
-
 pub struct VecCacheSelector<K>(PhantomData<K>);

 impl<'tcx, K: Idx, V: 'tcx> CacheSelector<'tcx, V> for VecCacheSelector<K> {
     type Cache = VecCache<K, V>
     where
        V: Copy;
-    type ArenaCache = VecArenaCache<'tcx, K, V>;
 }

 pub struct VecCache<K: Idx, V> {
@@ -258,7 +175,6 @@ impl<K: Idx, V> Default for VecCache<K, V> {

 impl<K: Eq + Idx, V: Copy + Debug> QueryStorage for VecCache<K, V> {
     type Value = V;
-    type Stored = V;
 }

 impl<K, V> QueryCache for VecCache<K, V>
@@ -278,87 +194,12 @@ where
     }

     #[inline]
-    fn complete(&self, key: K, value: V, index: DepNodeIndex) -> Self::Stored {
+    fn complete(&self, key: K, value: V, index: DepNodeIndex) {
         #[cfg(parallel_compiler)]
         let mut lock = self.cache.get_shard_by_hash(key.index() as u64).lock();
         #[cfg(not(parallel_compiler))]
         let mut lock = self.cache.lock();
         lock.insert(key, (value, index));
-        value
     }

-    fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        #[cfg(parallel_compiler)]
-        {
-            let shards = self.cache.lock_shards();
-            for shard in shards.iter() {
-                for (k, v) in shard.iter_enumerated() {
-                    if let Some(v) = v {
-                        f(&k, &v.0, v.1);
-                    }
-                }
-            }
-        }
-        #[cfg(not(parallel_compiler))]
-        {
-            let map = self.cache.lock();
-            for (k, v) in map.iter_enumerated() {
-                if let Some(v) = v {
-                    f(&k, &v.0, v.1);
-                }
-            }
-        }
-    }
-}
-
-pub struct VecArenaCache<'tcx, K: Idx, V> {
-    arena: WorkerLocal<TypedArena<(V, DepNodeIndex)>>,
-    #[cfg(parallel_compiler)]
-    cache: Sharded<IndexVec<K, Option<&'tcx (V, DepNodeIndex)>>>,
-    #[cfg(not(parallel_compiler))]
-    cache: Lock<IndexVec<K, Option<&'tcx (V, DepNodeIndex)>>>,
-}
-
-impl<'tcx, K: Idx, V> Default for VecArenaCache<'tcx, K, V> {
-    fn default() -> Self {
-        VecArenaCache {
-            arena: WorkerLocal::new(|_| TypedArena::default()),
-            cache: Default::default(),
-        }
-    }
-}
-
-impl<'tcx, K: Eq + Idx, V: Debug + 'tcx> QueryStorage for VecArenaCache<'tcx, K, V> {
-    type Value = V;
-    type Stored = &'tcx V;
-}
-
-impl<'tcx, K, V: 'tcx> QueryCache for VecArenaCache<'tcx, K, V>
-where
-    K: Eq + Idx + Clone + Debug,
-    V: Debug,
-{
-    type Key = K;
-
-    #[inline(always)]
-    fn lookup(&self, key: &K) -> Option<(&'tcx V, DepNodeIndex)> {
-        #[cfg(parallel_compiler)]
-        let lock = self.cache.get_shard_by_hash(key.index() as u64).lock();
-        #[cfg(not(parallel_compiler))]
-        let lock = self.cache.lock();
-        if let Some(Some(value)) = lock.get(*key) { Some((&value.0, value.1)) } else { None }
-    }
-
-    #[inline]
-    fn complete(&self, key: K, value: V, index: DepNodeIndex) -> Self::Stored {
-        let value = self.arena.alloc((value, index));
-        let value = unsafe { &*(value as *const _) };
-        #[cfg(parallel_compiler)]
-        let mut lock = self.cache.get_shard_by_hash(key.index() as u64).lock();
-        #[cfg(not(parallel_compiler))]
-        let mut lock = self.cache.lock();
-        lock.insert(key, value);
-        &value.0
-    }
-
     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
@@ -20,10 +20,9 @@ pub trait QueryConfig<Qcx: QueryContext> {
     const NAME: &'static str;

     type Key: DepNodeParams<Qcx::DepContext> + Eq + Hash + Clone + Debug;
-    type Value: Debug;
-    type Stored: Debug + Copy + std::borrow::Borrow<Self::Value>;
+    type Value: Debug + Copy;

-    type Cache: QueryCache<Key = Self::Key, Stored = Self::Stored, Value = Self::Value>;
+    type Cache: QueryCache<Key = Self::Key, Value = Self::Value>;

     // Don't use this method to access query results, instead use the methods on TyCtxt
     fn query_state<'a>(tcx: Qcx) -> &'a QueryState<Self::Key, Qcx::DepKind>
@@ -38,9 +37,9 @@ pub trait QueryConfig<Qcx: QueryContext> {
     fn cache_on_disk(tcx: Qcx::DepContext, key: &Self::Key) -> bool;

     // Don't use this method to compute query results, instead use the methods on TyCtxt
-    fn execute_query(tcx: Qcx::DepContext, k: Self::Key) -> Self::Stored;
+    fn execute_query(tcx: Qcx::DepContext, k: Self::Key) -> Self::Value;

-    fn compute(tcx: Qcx, key: &Self::Key) -> fn(Qcx::DepContext, Self::Key) -> Self::Value;
+    fn compute(tcx: Qcx, key: Self::Key) -> Self::Value;

     fn try_load_from_disk(qcx: Qcx, idx: &Self::Key) -> TryLoadFromDisk<Qcx, Self>;

@@ -246,7 +246,7 @@ where

     /// Completes the query by updating the query cache with the `result`,
     /// signals the waiter and forgets the JobOwner, so it won't poison the query
-    fn complete<C>(self, cache: &C, result: C::Value, dep_node_index: DepNodeIndex) -> C::Stored
+    fn complete<C>(self, cache: &C, result: C::Value, dep_node_index: DepNodeIndex)
     where
         C: QueryCache<Key = K>,
     {
@@ -257,23 +257,19 @@ where
         // Forget ourself so our destructor won't poison the query
         mem::forget(self);

-        let (job, result) = {
-            let job = {
-                #[cfg(parallel_compiler)]
-                let mut lock = state.active.get_shard_by_value(&key).lock();
-                #[cfg(not(parallel_compiler))]
-                let mut lock = state.active.lock();
-                match lock.remove(&key).unwrap() {
-                    QueryResult::Started(job) => job,
-                    QueryResult::Poisoned => panic!(),
-                }
-            };
-            let result = cache.complete(key, result, dep_node_index);
-            (job, result)
+        let job = {
+            #[cfg(parallel_compiler)]
+            let mut lock = state.active.get_shard_by_value(&key).lock();
+            #[cfg(not(parallel_compiler))]
+            let mut lock = state.active.lock();
+            match lock.remove(&key).unwrap() {
+                QueryResult::Started(job) => job,
+                QueryResult::Poisoned => panic!(),
+            }
         };
+        cache.complete(key, result, dep_node_index);

         job.signal_complete();
-        result
     }
 }

@@ -336,7 +332,7 @@ where
 /// which will be used if the query is not in the cache and we need
 /// to compute it.
 #[inline]
-pub fn try_get_cached<Tcx, C>(tcx: Tcx, cache: &C, key: &C::Key) -> Option<C::Stored>
+pub fn try_get_cached<Tcx, C>(tcx: Tcx, cache: &C, key: &C::Key) -> Option<C::Value>
 where
     C: QueryCache,
     Tcx: DepContext,
@@ -358,7 +354,7 @@ fn try_execute_query<Q, Qcx>(
     span: Span,
     key: Q::Key,
     dep_node: Option<DepNode<Qcx::DepKind>>,
-) -> (Q::Stored, Option<DepNodeIndex>)
+) -> (Q::Value, Option<DepNodeIndex>)
 where
     Q: QueryConfig<Qcx>,
     Qcx: QueryContext,
@@ -390,7 +386,7 @@ where
                     );
                 }
             }
-            let result = job.complete(cache, result, dep_node_index);
+            job.complete(cache, result, dep_node_index);
             (result, Some(dep_node_index))
         }
         TryGetJob::Cycle(error) => {
@@ -426,9 +422,7 @@ where
     // Fast path for when incr. comp. is off.
     if !dep_graph.is_fully_enabled() {
         let prof_timer = qcx.dep_context().profiler().query_provider();
-        let result = qcx.start_query(job_id, Q::DEPTH_LIMIT, None, || {
-            Q::compute(qcx, &key)(*qcx.dep_context(), key)
-        });
+        let result = qcx.start_query(job_id, Q::DEPTH_LIMIT, None, || Q::compute(qcx, key));
         let dep_node_index = dep_graph.next_virtual_depnode_index();
         prof_timer.finish_with_query_invocation_id(dep_node_index.into());
         return (result, dep_node_index);
@@ -454,17 +448,15 @@ where
     let (result, dep_node_index) =
         qcx.start_query(job_id, Q::DEPTH_LIMIT, Some(&diagnostics), || {
             if Q::ANON {
-                return dep_graph.with_anon_task(*qcx.dep_context(), Q::DEP_KIND, || {
-                    Q::compute(qcx, &key)(*qcx.dep_context(), key)
-                });
+                return dep_graph
+                    .with_anon_task(*qcx.dep_context(), Q::DEP_KIND, || Q::compute(qcx, key));
             }

             // `to_dep_node` is expensive for some `DepKind`s.
             let dep_node =
                 dep_node_opt.unwrap_or_else(|| Q::construct_dep_node(*qcx.dep_context(), &key));

-            let task = Q::compute(qcx, &key);
-            dep_graph.with_task(dep_node, *qcx.dep_context(), key, task, Q::HASH_RESULT)
+            dep_graph.with_task(dep_node, qcx, key, Q::compute, Q::HASH_RESULT)
         });

     prof_timer.finish_with_query_invocation_id(dep_node_index.into());
@@ -555,7 +547,7 @@ where
     let prof_timer = qcx.dep_context().profiler().query_provider();

     // The dep-graph for this computation is already in-place.
-    let result = dep_graph.with_ignore(|| Q::compute(qcx, key)(*qcx.dep_context(), key.clone()));
+    let result = dep_graph.with_ignore(|| Q::compute(qcx, key.clone()));

     prof_timer.finish_with_query_invocation_id(dep_node_index.into());

@@ -727,7 +719,7 @@ pub enum QueryMode {
     Ensure,
 }

-pub fn get_query<Q, Qcx, D>(qcx: Qcx, span: Span, key: Q::Key, mode: QueryMode) -> Option<Q::Stored>
+pub fn get_query<Q, Qcx, D>(qcx: Qcx, span: Span, key: Q::Key, mode: QueryMode) -> Option<Q::Value>
 where
     D: DepKind,
     Q: QueryConfig<Qcx>,