Implement RFC 3503: frontmatters

Supersedes #137193
This commit is contained in:
Deadbeef
2025-04-12 15:53:46 +00:00
parent 0c33fe2c3d
commit 662182637e
61 changed files with 970 additions and 22 deletions

View File

@@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
use self::LiteralKind::*;
use self::TokenKind::*;
pub use crate::cursor::Cursor;
use crate::cursor::EOF_CHAR;
pub use crate::cursor::{Cursor, FrontmatterAllowed};
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@@ -57,17 +57,27 @@ impl Token {
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
/// A line comment, e.g. `// comment`.
LineComment { doc_style: Option<DocStyle> },
LineComment {
doc_style: Option<DocStyle>,
},
/// A block comment, e.g. `/* block comment */`.
///
/// Block comments can be recursive, so a sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error.
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
BlockComment {
doc_style: Option<DocStyle>,
terminated: bool,
},
/// Any whitespace character sequence.
Whitespace,
Frontmatter {
has_invalid_preceding_whitespace: bool,
invalid_infostring: bool,
},
/// An identifier or keyword, e.g. `ident` or `continue`.
Ident,
@@ -109,10 +119,15 @@ pub enum TokenKind {
/// this type will need to check for and reject that case.
///
/// See [LiteralKind] for more details.
Literal { kind: LiteralKind, suffix_start: u32 },
Literal {
kind: LiteralKind,
suffix_start: u32,
},
/// A lifetime, e.g. `'a`.
Lifetime { starts_with_number: bool },
Lifetime {
starts_with_number: bool,
},
/// `;`
Semi,
@@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
#[inline]
pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
debug_assert!(!input.is_empty());
let mut cursor = Cursor::new(input);
let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
// Move past the leading `r` or `br`.
for _ in 0..prefix_len {
cursor.bump().unwrap();
@@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
/// Creates an iterator that produces tokens from the input string.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
let mut cursor = Cursor::new(input);
let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
std::iter::from_fn(move || {
let token = cursor.advance_token();
if token.kind != TokenKind::Eof { Some(token) } else { None }
@@ -361,7 +376,34 @@ impl Cursor<'_> {
Some(c) => c,
None => return Token::new(TokenKind::Eof, 0),
};
let token_kind = match first_char {
c if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
&& is_whitespace(c) =>
{
let mut last = first_char;
while is_whitespace(self.first()) {
let Some(c) = self.bump() else {
break;
};
last = c;
}
// invalid frontmatter opening as whitespace preceding it isn't newline.
// combine the whitespace and the frontmatter to a single token as we shall
// error later.
if last != '\n' && self.as_str().starts_with("---") {
self.bump();
self.frontmatter(true)
} else {
Whitespace
}
}
'-' if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
&& self.as_str().starts_with("--") =>
{
// happy path
self.frontmatter(false)
}
// Slash, comment or block comment.
'/' => match self.first() {
'/' => self.line_comment(),
@@ -464,11 +506,110 @@ impl Cursor<'_> {
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Unknown,
};
if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
&& !matches!(token_kind, Whitespace)
{
// stop allowing frontmatters after first non-whitespace token
self.frontmatter_allowed = FrontmatterAllowed::No;
}
let res = Token::new(token_kind, self.pos_within_token());
self.reset_pos_within_token();
res
}
/// Lexes a frontmatter block (RFC 3503), given that one `-` of the opening
/// fence was already eaten by the caller.
///
/// Consumes the opening dashes, an optional infostring (identifier-like, with
/// `.` allowed, e.g. `---Cargo.toml`), and everything up to and including the
/// closing dash fence, then returns a single `Frontmatter` token. Malformed
/// input is deliberately swallowed into the token so the parser can report
/// precise errors later; the returned flags record what went wrong.
///
/// `has_invalid_preceding_whitespace`: the caller saw whitespace before the
/// opening fence that did not end in a newline (reported later as an error).
fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind {
debug_assert_eq!('-', self.prev());
let pos = self.pos_within_token();
self.eat_while(|c| c == '-');
// one `-` is eaten by the caller.
let length_opening = self.pos_within_token() - pos + 1;
// must be ensured by the caller
debug_assert!(length_opening >= 3);
// whitespace between the opening and the infostring.
self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
// copied from `eat_identifier`, but allows `.` in infostring to allow something like
// `---Cargo.toml` as a valid opener
if is_id_start(self.first()) {
self.bump();
self.eat_while(|c| is_id_continue(c) || c == '.');
}
// trailing whitespace after the infostring, still on the opening line.
self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
// anything other than a newline here means the infostring is malformed.
let invalid_infostring = self.first() != '\n';
let mut s = self.as_str();
let mut found = false;
// Look for a closing fence: a run of at least `length_opening` dashes that is
// preceded on its line only by whitespace.
while let Some(closing) = s.find(&"-".repeat(length_opening as usize)) {
let preceding_chars_start = s[..closing].rfind("\n").map_or(0, |i| i + 1);
if s[preceding_chars_start..closing].chars().all(is_whitespace) {
// candidate found
self.bump_bytes(closing);
// in case like
// ---cargo
// --- blahblah
// or
// ---cargo
// ----
// combine that trailing content into this frontmatter token so that the
// mismatch gets detected and reported later.
self.eat_until(b'\n');
found = true;
break;
} else {
// dashes were not alone on their line; keep scanning past them.
s = &s[closing + length_opening as usize..];
}
}
if !found {
// recovery strategy: a closing statement might have preceding whitespace/newline
// but not have enough dashes to properly close. In this case, we eat until there,
// and report a mismatch in the parser.
let mut rest = self.as_str();
// We can look for a shorter closing (starting with four dashes but closing with three)
// and other indications that Rust has started and the infostring has ended.
let mut potential_closing = rest
.find("\n---")
// n.b. only in the case where there are dashes, we move the index to the line where
// the dashes start as we eat to include that line. For other cases those are Rust code
// and not included in the frontmatter.
.map(|x| x + 1)
.or_else(|| rest.find("\nuse "))
.or_else(|| rest.find("\n//!"))
.or_else(|| rest.find("\n#!["));
if potential_closing.is_none() {
// a less fortunate recovery if all else fails which finds any dashes preceded by whitespace
// on a standalone line. Might be wrong.
while let Some(closing) = rest.find("---") {
let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
if rest[preceding_chars_start..closing].chars().all(is_whitespace) {
// candidate found
potential_closing = Some(closing);
break;
} else {
rest = &rest[closing + 3..];
}
}
}
if let Some(potential_closing) = potential_closing {
// bump to the potential closing, and eat everything on that line.
self.bump_bytes(potential_closing);
self.eat_until(b'\n');
} else {
// eat everything. this will get reported as an unclosed frontmatter.
self.eat_while(|_| true);
}
}
Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
}
fn line_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.first() == '/');
self.bump();