Move various token stream things from rustc_parse to rustc_ast.

Specifically: `TokenCursor`, `TokenTreeCursor`,
`LazyAttrTokenStreamImpl`, `FlatToken`, `make_attr_token_stream`,
`ParserRange`, `NodeRange`, `ParserReplacement`, and `NodeReplacement`.
These are all related to token streams, rather than actual parsing.

This will facilitate the simplifications in the next commit.
Nicholas Nethercote
2025-04-29 11:18:08 +10:00
parent 25cdf1f674
commit 28236ab703
6 changed files with 338 additions and 339 deletions

View File

@@ -1,5 +1,6 @@
 use rustc_ast as ast;
 use rustc_ast::token::{self, MetaVarKind};
+use rustc_ast::tokenstream::ParserRange;
 use rustc_ast::{Attribute, attr};
 use rustc_errors::codes::*;
 use rustc_errors::{Diag, PResult};
@@ -8,8 +9,7 @@ use thin_vec::ThinVec;
 use tracing::debug;
 
 use super::{
-    AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, ParserRange, PathStyle, Trailing,
-    UsePreAttrPos,
+    AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, PathStyle, Trailing, UsePreAttrPos,
 };
 use crate::{errors, exp, fluent_generated as fluent};
View File

@@ -1,21 +1,18 @@
 use std::borrow::Cow;
-use std::{iter, mem};
+use std::mem;
 
-use rustc_ast::token::{Delimiter, Token};
+use rustc_ast::token::Token;
 use rustc_ast::tokenstream::{
-    AttrTokenStream, AttrTokenTree, AttrsTarget, DelimSpacing, DelimSpan, LazyAttrTokenStream,
-    Spacing, ToAttrTokenStream,
+    AttrsTarget, LazyAttrTokenStream, LazyAttrTokenStreamImpl, NodeRange, ParserRange, Spacing,
+    TokenCursor,
 };
 use rustc_ast::{self as ast, AttrVec, Attribute, HasAttrs, HasTokens};
 use rustc_data_structures::fx::FxHashSet;
 use rustc_errors::PResult;
 use rustc_session::parse::ParseSess;
-use rustc_span::{DUMMY_SP, Span, sym};
+use rustc_span::{DUMMY_SP, sym};
 
-use super::{
-    Capturing, FlatToken, ForceCollect, NodeRange, NodeReplacement, Parser, ParserRange,
-    TokenCursor, Trailing,
-};
+use super::{Capturing, ForceCollect, Parser, Trailing};
 
 // When collecting tokens, this fully captures the start point. Usually it's
 // just after outer attributes, but occasionally it's before.
@@ -94,95 +91,6 @@ fn has_cfg_or_cfg_attr(attrs: &[Attribute]) -> bool {
     })
 }
 
-// From a value of this type we can reconstruct the `TokenStream` seen by the
-// `f` callback passed to a call to `Parser::collect_tokens`, by
-// replaying the getting of the tokens. This saves us producing a `TokenStream`
-// if it is never needed, e.g. a captured `macro_rules!` argument that is never
-// passed to a proc macro. In practice, token stream creation happens rarely
-// compared to calls to `collect_tokens` (see some statistics in #78736) so we
-// are doing as little up-front work as possible.
-//
-// This also makes `Parser` very cheap to clone, since
-// there is no intermediate collection buffer to clone.
-struct LazyAttrTokenStreamImpl {
-    start_token: (Token, Spacing),
-    cursor_snapshot: TokenCursor,
-    num_calls: u32,
-    break_last_token: u32,
-    node_replacements: Box<[NodeReplacement]>,
-}
-
-impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
-    fn to_attr_token_stream(&self) -> AttrTokenStream {
-        // The token produced by the final call to `{,inlined_}next` was not
-        // actually consumed by the callback. The combination of chaining the
-        // initial token and using `take` produces the desired result - we
-        // produce an empty `TokenStream` if no calls were made, and omit the
-        // final token otherwise.
-        let mut cursor_snapshot = self.cursor_snapshot.clone();
-        let tokens = iter::once(FlatToken::Token(self.start_token))
-            .chain(iter::repeat_with(|| FlatToken::Token(cursor_snapshot.next())))
-            .take(self.num_calls as usize);
-
-        if self.node_replacements.is_empty() {
-            make_attr_token_stream(tokens, self.break_last_token)
-        } else {
-            let mut tokens: Vec<_> = tokens.collect();
-            let mut node_replacements = self.node_replacements.to_vec();
-            node_replacements.sort_by_key(|(range, _)| range.0.start);
-
-            #[cfg(debug_assertions)]
-            for [(node_range, tokens), (next_node_range, next_tokens)] in
-                node_replacements.array_windows()
-            {
-                assert!(
-                    node_range.0.end <= next_node_range.0.start
-                        || node_range.0.end >= next_node_range.0.end,
-                    "Node ranges should be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
-                    node_range,
-                    tokens,
-                    next_node_range,
-                    next_tokens,
-                );
-            }
-
-            // Process the replace ranges, starting from the highest start
-            // position and working our way back. If we have tokens like:
-            //
-            // `#[cfg(FALSE)] struct Foo { #[cfg(FALSE)] field: bool }`
-            //
-            // Then we will generate replace ranges for both
-            // the `#[cfg(FALSE)] field: bool` and the entire
-            // `#[cfg(FALSE)] struct Foo { #[cfg(FALSE)] field: bool }`
-            //
-            // By starting processing from the replace range with the greatest
-            // start position, we ensure that any (outer) replace range which
-            // encloses another (inner) replace range will fully overwrite the
-            // inner range's replacement.
-            for (node_range, target) in node_replacements.into_iter().rev() {
-                assert!(
-                    !node_range.0.is_empty(),
-                    "Cannot replace an empty node range: {:?}",
-                    node_range.0
-                );
-
-                // Replace the tokens in range with zero or one `FlatToken::AttrsTarget`s, plus
-                // enough `FlatToken::Empty`s to fill up the rest of the range. This keeps the
-                // total length of `tokens` constant throughout the replacement process, allowing
-                // us to do all replacements without adjusting indices.
-                let target_len = target.is_some() as usize;
-                tokens.splice(
-                    (node_range.0.start as usize)..(node_range.0.end as usize),
-                    target.into_iter().map(|target| FlatToken::AttrsTarget(target)).chain(
-                        iter::repeat(FlatToken::Empty).take(node_range.0.len() - target_len),
-                    ),
-                );
-            }
-
-            make_attr_token_stream(tokens.into_iter(), self.break_last_token)
-        }
-    }
-}
-
 impl<'a> Parser<'a> {
     pub(super) fn collect_pos(&self) -> CollectPos {
         CollectPos {
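
Editor's note: the comment on `LazyAttrTokenStreamImpl` above is the heart of the design: record how to regenerate the captured tokens rather than buffering the tokens themselves. A minimal standalone sketch of that idea, not part of the commit; `Tok` and `LazyTokens` are invented names, and a shared `Rc` stands in for the cursor snapshot:

```rust
#[derive(Clone, Debug, PartialEq)]
struct Tok(char);

// Instead of buffering captured tokens, record where capture started and how
// many tokens were consumed; materialize the slice only on demand.
#[derive(Clone)]
struct LazyTokens {
    source: std::rc::Rc<Vec<Tok>>, // shared, so cloning is cheap
    start: usize,                  // snapshot of the cursor at capture start
    num_calls: usize,              // how many tokens the callback consumed
}

impl LazyTokens {
    fn materialize(&self) -> Vec<Tok> {
        // Replay from the snapshot; no work was done up front.
        self.source[self.start..self.start + self.num_calls].to_vec()
    }
}

fn main() {
    let source = std::rc::Rc::new("fn f(){}".chars().map(Tok).collect::<Vec<_>>());
    let lazy = LazyTokens { source, start: 0, num_calls: 4 };
    let cheap_copy = lazy.clone(); // no token buffer to copy, just an Rc bump
    // Materialization happens only if the tokens are actually needed, e.g.
    // when a captured fragment reaches a proc macro.
    assert_eq!(cheap_copy.materialize(), "fn f".chars().map(Tok).collect::<Vec<_>>());
}
```
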
@@ -483,71 +391,6 @@ impl<'a> Parser<'a> {
     }
 }
 
-/// Converts a flattened iterator of tokens (including open and close delimiter tokens) into an
-/// `AttrTokenStream`, creating an `AttrTokenTree::Delimited` for each matching pair of open and
-/// close delims.
-fn make_attr_token_stream(
-    iter: impl Iterator<Item = FlatToken>,
-    break_last_token: u32,
-) -> AttrTokenStream {
-    #[derive(Debug)]
-    struct FrameData {
-        // This is `None` for the first frame, `Some` for all others.
-        open_delim_sp: Option<(Delimiter, Span, Spacing)>,
-        inner: Vec<AttrTokenTree>,
-    }
-
-    // The stack always has at least one element. Storing it separately makes for shorter code.
-    let mut stack_top = FrameData { open_delim_sp: None, inner: vec![] };
-    let mut stack_rest = vec![];
-    for flat_token in iter {
-        match flat_token {
-            FlatToken::Token((token @ Token { kind, span }, spacing)) => {
-                if let Some(delim) = kind.open_delim() {
-                    stack_rest.push(mem::replace(
-                        &mut stack_top,
-                        FrameData { open_delim_sp: Some((delim, span, spacing)), inner: vec![] },
-                    ));
-                } else if let Some(delim) = kind.close_delim() {
-                    let frame_data = mem::replace(&mut stack_top, stack_rest.pop().unwrap());
-                    let (open_delim, open_sp, open_spacing) = frame_data.open_delim_sp.unwrap();
-                    assert!(
-                        open_delim.eq_ignoring_invisible_origin(&delim),
-                        "Mismatched open/close delims: open={open_delim:?} close={span:?}"
-                    );
-                    let dspan = DelimSpan::from_pair(open_sp, span);
-                    let dspacing = DelimSpacing::new(open_spacing, spacing);
-                    let stream = AttrTokenStream::new(frame_data.inner);
-                    let delimited = AttrTokenTree::Delimited(dspan, dspacing, delim, stream);
-                    stack_top.inner.push(delimited);
-                } else {
-                    stack_top.inner.push(AttrTokenTree::Token(token, spacing))
-                }
-            }
-            FlatToken::AttrsTarget(target) => {
-                stack_top.inner.push(AttrTokenTree::AttrsTarget(target))
-            }
-            FlatToken::Empty => {}
-        }
-    }
-
-    if break_last_token > 0 {
-        let last_token = stack_top.inner.pop().unwrap();
-        if let AttrTokenTree::Token(last_token, spacing) = last_token {
-            let (unglued, _) = last_token.kind.break_two_token_op(break_last_token).unwrap();
-
-            // Tokens are always ASCII chars, so we can use byte arithmetic here.
-            let mut first_span = last_token.span.shrink_to_lo();
-            first_span =
-                first_span.with_hi(first_span.lo() + rustc_span::BytePos(break_last_token));
-
-            stack_top.inner.push(AttrTokenTree::Token(Token::new(unglued, first_span), spacing));
-        } else {
-            panic!("Unexpected last token {last_token:?}")
-        }
-    }
-    AttrTokenStream::new(stack_top.inner)
-}
-
 /// Tokens are needed if:
 /// - any non-single-segment attributes (other than doc comments) are present,
 ///   e.g. `rustfmt::skip`; or
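
Editor's note: for readers unfamiliar with the stack discipline in `make_attr_token_stream`, here is a minimal sketch of the same delimiter matching, reduced to `char` tokens. This is an illustration, not part of the commit; the `top`/`rest` split mirrors `stack_top`/`stack_rest` in the real code:

```rust
#[derive(Debug, PartialEq)]
enum Tree {
    Leaf(char),
    Delimited(Vec<Tree>),
}

fn build(tokens: impl Iterator<Item = char>) -> Vec<Tree> {
    // Keep the innermost frame separate from the rest, as the real code does.
    let mut top: Vec<Tree> = vec![];
    let mut rest: Vec<Vec<Tree>> = vec![];
    for t in tokens {
        match t {
            '(' => rest.push(std::mem::take(&mut top)), // open: start a new frame
            ')' => {
                // close: pop the parent frame and attach the finished subtree
                let inner = std::mem::replace(&mut top, rest.pop().unwrap());
                top.push(Tree::Delimited(inner));
            }
            c => top.push(Tree::Leaf(c)),
        }
    }
    assert!(rest.is_empty(), "unbalanced delimiters");
    top
}

fn main() {
    let trees = build("a(bc)".chars());
    assert_eq!(trees.len(), 2); // `a` plus one delimited group
    assert_eq!(trees[1], Tree::Delimited(vec![Tree::Leaf('b'), Tree::Leaf('c')]));
}
```

The `break_last_token` fixup at the end of the real function (re-splitting a glued token such as `>>` by byte arithmetic) is omitted from this sketch.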

View File

@@ -12,7 +12,6 @@ pub mod token_type;
 mod ty;
 
 use std::assert_matches::debug_assert_matches;
-use std::ops::Range;
 use std::{fmt, mem, slice};
 
 use attr_wrapper::{AttrWrapper, UsePreAttrPos};
@@ -25,7 +24,9 @@ use rustc_ast::ptr::P;
 use rustc_ast::token::{
     self, IdentIsRaw, InvisibleOrigin, MetaVarKind, NtExprKind, NtPatKind, Token, TokenKind,
 };
-use rustc_ast::tokenstream::{AttrsTarget, Spacing, TokenStream, TokenTree};
+use rustc_ast::tokenstream::{
+    ParserRange, ParserReplacement, Spacing, TokenCursor, TokenStream, TokenTree, TokenTreeCursor,
+};
 use rustc_ast::util::case::Case;
 use rustc_ast::{
     self as ast, AnonConst, AttrArgs, AttrId, ByRef, Const, CoroutineKind, DUMMY_NODE_ID,
@@ -37,7 +38,7 @@ use rustc_data_structures::fx::FxHashMap;
 use rustc_errors::{Applicability, Diag, FatalError, MultiSpan, PResult};
 use rustc_index::interval::IntervalSet;
 use rustc_session::parse::ParseSess;
-use rustc_span::{DUMMY_SP, Ident, Span, Symbol, kw, sym};
+use rustc_span::{Ident, Span, Symbol, kw, sym};
 use thin_vec::ThinVec;
 use token_type::TokenTypeSet;
 pub use token_type::{ExpKeywordPair, ExpTokenPair, TokenType};
@@ -187,57 +188,6 @@ struct ClosureSpans {
     body: Span,
 }
 
-/// A token range within a `Parser`'s full token stream.
-#[derive(Clone, Debug)]
-struct ParserRange(Range<u32>);
-
-/// A token range within an individual AST node's (lazy) token stream, i.e.
-/// relative to that node's first token. Distinct from `ParserRange` so the two
-/// kinds of range can't be mixed up.
-#[derive(Clone, Debug)]
-struct NodeRange(Range<u32>);
-
-/// Indicates a range of tokens that should be replaced by an `AttrsTarget`
-/// (replacement) or be replaced by nothing (deletion). This is used in two
-/// places during token collection.
-///
-/// 1. Replacement. During the parsing of an AST node that may have a
-///    `#[derive]` attribute, when we parse a nested AST node that has `#[cfg]`
-///    or `#[cfg_attr]`, we replace the entire inner AST node with
-///    `FlatToken::AttrsTarget`. This lets us perform eager cfg-expansion on an
-///    `AttrTokenStream`.
-///
-/// 2. Deletion. We delete inner attributes from all collected token streams,
-///    and instead track them through the `attrs` field on the AST node. This
-///    lets us manipulate them similarly to outer attributes. When we create a
-///    `TokenStream`, the inner attributes are inserted into the proper place
-///    in the token stream.
-///
-/// Each replacement starts off in `ParserReplacement` form but is converted to
-/// `NodeReplacement` form when it is attached to a single AST node, via
-/// `LazyAttrTokenStreamImpl`.
-type ParserReplacement = (ParserRange, Option<AttrsTarget>);
-
-/// See the comment on `ParserReplacement`.
-type NodeReplacement = (NodeRange, Option<AttrsTarget>);
-
-impl NodeRange {
-    // Converts a range within a parser's tokens to a range within a
-    // node's tokens beginning at `start_pos`.
-    //
-    // For example, imagine a parser with 50 tokens in its token stream, a
-    // function that spans `ParserRange(20..40)` and an inner attribute within
-    // that function that spans `ParserRange(30..35)`. We would find the inner
-    // attribute's range within the function's tokens by subtracting 20, which
-    // is the position of the function's start token. This gives
-    // `NodeRange(10..15)`.
-    fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange {
-        assert!(!parser_range.is_empty());
-        assert!(parser_range.start >= start_pos);
-        NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos))
-    }
-}
-
 /// Controls how we capture tokens. Capturing can be expensive,
 /// so we try to avoid performing capturing in cases where
 /// we will never need an `AttrTokenStream`.
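
Editor's note: the worked example in the `NodeRange::new` comment (parser tokens 30..35 inside a node starting at token 20 become node-relative 10..15) can be checked directly. A standalone sketch reusing the same newtype shapes, added for illustration and not part of the commit:

```rust
use std::ops::Range;

#[derive(Debug, PartialEq)]
struct ParserRange(Range<u32>);
#[derive(Debug, PartialEq)]
struct NodeRange(Range<u32>);

impl NodeRange {
    fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange {
        assert!(!parser_range.is_empty());
        assert!(parser_range.start >= start_pos);
        // Shift the parser-relative range so it is relative to the node's
        // first token instead.
        NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos))
    }
}

fn main() {
    // An inner attribute at parser tokens 30..35, inside a function whose
    // tokens span 20..40, sits at 10..15 relative to the function.
    assert_eq!(NodeRange::new(ParserRange(30..35), 20), NodeRange(10..15));
}
```
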
@@ -260,104 +210,6 @@ struct CaptureState {
     seen_attrs: IntervalSet<AttrId>,
 }
 
-#[derive(Clone, Debug)]
-struct TokenTreeCursor {
-    stream: TokenStream,
-    /// Points to the current token tree in the stream. In `TokenCursor::curr`,
-    /// this can be any token tree. In `TokenCursor::stack`, this is always a
-    /// `TokenTree::Delimited`.
-    index: usize,
-}
-
-impl TokenTreeCursor {
-    #[inline]
-    fn new(stream: TokenStream) -> Self {
-        TokenTreeCursor { stream, index: 0 }
-    }
-
-    #[inline]
-    fn curr(&self) -> Option<&TokenTree> {
-        self.stream.get(self.index)
-    }
-
-    fn look_ahead(&self, n: usize) -> Option<&TokenTree> {
-        self.stream.get(self.index + n)
-    }
-
-    #[inline]
-    fn bump(&mut self) {
-        self.index += 1;
-    }
-}
-
-/// A `TokenStream` cursor that produces `Token`s. It's a bit odd that
-/// we (a) lex tokens into a nice tree structure (`TokenStream`), and then (b)
-/// use this type to emit them as a linear sequence. But a linear sequence is
-/// what the parser expects, for the most part.
-#[derive(Clone, Debug)]
-struct TokenCursor {
-    // Cursor for the current (innermost) token stream. The index within the
-    // cursor can point to any token tree in the stream (or one past the end).
-    // The delimiters for this token stream are found in `self.stack.last()`;
-    // if that is `None` we are in the outermost token stream which never has
-    // delimiters.
-    curr: TokenTreeCursor,
-
-    // Token streams surrounding the current one. The index within each cursor
-    // always points to a `TokenTree::Delimited`.
-    stack: Vec<TokenTreeCursor>,
-}
-
-impl TokenCursor {
-    fn next(&mut self) -> (Token, Spacing) {
-        self.inlined_next()
-    }
-
-    /// This always-inlined version should only be used on hot code paths.
-    #[inline(always)]
-    fn inlined_next(&mut self) -> (Token, Spacing) {
-        loop {
-            // FIXME: we currently don't return `Delimiter::Invisible` open/close delims. To fix
-            // #67062 we will need to, whereupon the `delim != Delimiter::Invisible` conditions
-            // below can be removed.
-            if let Some(tree) = self.curr.curr() {
-                match tree {
-                    &TokenTree::Token(token, spacing) => {
-                        debug_assert!(!token.kind.is_delim());
-                        let res = (token, spacing);
-                        self.curr.bump();
-                        return res;
-                    }
-                    &TokenTree::Delimited(sp, spacing, delim, ref tts) => {
-                        let trees = TokenTreeCursor::new(tts.clone());
-                        self.stack.push(mem::replace(&mut self.curr, trees));
-                        if !delim.skip() {
-                            return (Token::new(delim.as_open_token_kind(), sp.open), spacing.open);
-                        }
-                        // No open delimiter to return; continue on to the next iteration.
-                    }
-                };
-            } else if let Some(parent) = self.stack.pop() {
-                // We have exhausted this token stream. Move back to its parent token stream.
-                let Some(&TokenTree::Delimited(span, spacing, delim, _)) = parent.curr() else {
-                    panic!("parent should be Delimited")
-                };
-                self.curr = parent;
-                self.curr.bump(); // move past the `Delimited`
-                if !delim.skip() {
-                    return (Token::new(delim.as_close_token_kind(), span.close), spacing.close);
-                }
-                // No close delimiter to return; continue on to the next iteration.
-            } else {
-                // We have exhausted the outermost token stream. The use of
-                // `Spacing::Alone` is arbitrary and immaterial, because the
-                // `Eof` token's spacing is never used.
-                return (Token::new(token::Eof, DUMMY_SP), Spacing::Alone);
-            }
-        }
-    }
-}
-
 /// A sequence separator.
 #[derive(Debug)]
 struct SeqSep<'a> {
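
Editor's note: a minimal sketch of the `TokenCursor` traversal above, added for illustration and not part of the commit. A cursor into the current stream plus a stack of enclosing streams turns a tree back into the flat token sequence the parser expects, emitting synthetic open/close delimiters on the way down and back up. The types are invented stand-ins for `TokenStream`/`TokenTree`, and the invisible-delimiter `skip()` handling is omitted:

```rust
#[derive(Clone, Debug)]
enum Tree {
    Token(char),
    Delimited(Vec<Tree>),
}

struct Cursor {
    curr: (Vec<Tree>, usize),       // innermost stream + index into it
    stack: Vec<(Vec<Tree>, usize)>, // enclosing streams, index at a `Delimited`
}

impl Cursor {
    fn next(&mut self) -> Option<char> {
        match self.curr.0.get(self.curr.1).cloned() {
            Some(Tree::Token(c)) => {
                self.curr.1 += 1;
                Some(c)
            }
            Some(Tree::Delimited(inner)) => {
                // Descend: make the inner stream current, emit an open delim.
                let parent = std::mem::replace(&mut self.curr, (inner, 0));
                self.stack.push(parent);
                Some('(')
            }
            None => {
                // Stream exhausted: return to the parent, move past the
                // `Delimited`, emit a close delim. `None` at the outermost
                // level means end of input (where the real code returns Eof).
                self.curr = self.stack.pop()?;
                self.curr.1 += 1;
                Some(')')
            }
        }
    }
}

fn main() {
    let stream = vec![Tree::Token('a'), Tree::Delimited(vec![Tree::Token('b')])];
    let mut cursor = Cursor { curr: (stream, 0), stack: vec![] };
    let mut out = String::new();
    while let Some(c) = cursor.next() {
        out.push(c);
    }
    assert_eq!(out, "a(b)"); // the tree replayed as a flat sequence
}
```
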
@@ -1742,26 +1594,6 @@
     }
 }
 
-/// A helper struct used when building an `AttrTokenStream` from
-/// a `LazyAttrTokenStream`. Both delimiter and non-delimited tokens
-/// are stored as `FlatToken::Token`. A vector of `FlatToken`s
-/// is then 'parsed' to build up an `AttrTokenStream` with nested
-/// `AttrTokenTree::Delimited` tokens.
-#[derive(Debug, Clone)]
-enum FlatToken {
-    /// A token - this holds both delimiter (e.g. '{' and '}')
-    /// and non-delimiter tokens
-    Token((Token, Spacing)),
-    /// Holds the `AttrsTarget` for an AST node. The `AttrsTarget` is inserted
-    /// directly into the constructed `AttrTokenStream` as an
-    /// `AttrTokenTree::AttrsTarget`.
-    AttrsTarget(AttrsTarget),
-    /// A special 'empty' token that is ignored during the conversion
-    /// to an `AttrTokenStream`. This is used to simplify the
-    /// handling of replace ranges.
-    Empty,
-}
-
 // Metavar captures of various kinds.
 #[derive(Clone, Debug)]
 pub enum ParseNtResult {
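
Editor's note: `FlatToken::Empty` exists so that a replacement never changes the length of the flat token buffer, which keeps every previously computed range valid during `to_attr_token_stream`'s splicing pass. A standalone sketch of that splice-with-padding trick, added for illustration and not part of the commit; `Flat` is an invented stand-in for `FlatToken`:

```rust
#[derive(Clone, Debug, PartialEq)]
enum Flat {
    Token(char),
    Target(&'static str), // stands in for `FlatToken::AttrsTarget`
    Empty,                // ignored when the tree is rebuilt
}

fn main() {
    let mut tokens: Vec<Flat> = "abcdef".chars().map(Flat::Token).collect();

    // Replace tokens 2..5 with one target plus two `Empty` placeholders,
    // exactly as the splice in `to_attr_token_stream` does.
    let range = 2..5;
    let target = Some("node");
    let target_len = target.is_some() as usize;
    tokens.splice(
        range.clone(),
        target
            .into_iter()
            .map(Flat::Target)
            .chain(std::iter::repeat(Flat::Empty).take(range.len() - target_len)),
    );

    // The buffer's length is unchanged, so an earlier-computed range such as
    // 5..6 still points at `f`; no index fixups are needed.
    assert_eq!(tokens.len(), 6);
    assert_eq!(tokens[5], Flat::Token('f'));
}
```
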