Auto merge of #140999 - hkBst:update-escaper, r=nnethercote
Update to literal-escaper 0.0.4 for a better API without `unreachable` and faster string parsing. This is the replacement for just the part of https://github.com/rust-lang/rust/pull/138163 that deals with the changed API of the unescape functionality, since that functionality was moved into its own crate. <del>This uses an unpublished version of literal-escaper (https://github.com/rust-lang/literal-escaper/pull/8).</del> r? `@nnethercote`
This commit is contained in:
@@ -3175,9 +3175,9 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-literal-escaper"
|
||||
version = "0.0.2"
|
||||
version = "0.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0041b6238913c41fe704213a4a9329e2f685a156d1781998128b4149c230ad04"
|
||||
checksum = "ab03008eb631b703dd16978282ae36c73282e7922fe101a4bd072a40ecea7b8b"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-main"
|
||||
|
||||
@@ -89,3 +89,8 @@ codegen-units = 1
|
||||
# FIXME: LTO cannot be enabled for binaries in a workspace
|
||||
# <https://github.com/rust-lang/cargo/issues/9330>
|
||||
# lto = true
|
||||
|
||||
# If you want to use a crate with local modifications, you can set a path or git dependency here.
|
||||
# For git dependencies, also add your source to ALLOWED_SOURCES in src/tools/tidy/src/extdeps.rs.
|
||||
#[patch.crates-io]
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ edition = "2024"
|
||||
# tidy-alphabetical-start
|
||||
bitflags = "2.4.1"
|
||||
memchr = "2.7.4"
|
||||
rustc-literal-escaper = "0.0.2"
|
||||
rustc-literal-escaper = "0.0.4"
|
||||
rustc_ast_ir = { path = "../rustc_ast_ir" }
|
||||
rustc_data_structures = { path = "../rustc_data_structures" }
|
||||
rustc_index = { path = "../rustc_index" }
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use std::{ascii, fmt, str};
|
||||
|
||||
use rustc_literal_escaper::{
|
||||
MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode,
|
||||
MixedUnit, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str,
|
||||
};
|
||||
use rustc_span::{Span, Symbol, kw, sym};
|
||||
use tracing::debug;
|
||||
@@ -87,11 +87,10 @@ impl LitKind {
|
||||
// Force-inlining here is aggressive but the closure is
|
||||
// called on every char in the string, so it can be hot in
|
||||
// programs with many long strings containing escapes.
|
||||
unescape_unicode(
|
||||
unescape_str(
|
||||
s,
|
||||
Mode::Str,
|
||||
&mut #[inline(always)]
|
||||
|_, c| match c {
|
||||
#[inline(always)]
|
||||
|_, res| match res {
|
||||
Ok(c) => buf.push(c),
|
||||
Err(err) => {
|
||||
assert!(!err.is_fatal(), "failed to unescape string literal")
|
||||
@@ -111,8 +110,8 @@ impl LitKind {
|
||||
token::ByteStr => {
|
||||
let s = symbol.as_str();
|
||||
let mut buf = Vec::with_capacity(s.len());
|
||||
unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
|
||||
Ok(c) => buf.push(byte_from_char(c)),
|
||||
unescape_byte_str(s, |_, res| match res {
|
||||
Ok(b) => buf.push(b),
|
||||
Err(err) => {
|
||||
assert!(!err.is_fatal(), "failed to unescape string literal")
|
||||
}
|
||||
@@ -128,7 +127,7 @@ impl LitKind {
|
||||
token::CStr => {
|
||||
let s = symbol.as_str();
|
||||
let mut buf = Vec::with_capacity(s.len());
|
||||
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
|
||||
unescape_c_str(s, |_span, c| match c {
|
||||
Ok(MixedUnit::Char(c)) => {
|
||||
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ edition = "2024"
|
||||
[dependencies]
|
||||
# tidy-alphabetical-start
|
||||
bitflags = "2.4.1"
|
||||
rustc-literal-escaper = "0.0.2"
|
||||
rustc-literal-escaper = "0.0.4"
|
||||
rustc_ast = { path = "../rustc_ast" }
|
||||
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
|
||||
rustc_data_structures = { path = "../rustc_data_structures" }
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use diagnostics::make_unclosed_delims_error;
|
||||
use rustc_ast::ast::{self, AttrStyle};
|
||||
use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
|
||||
@@ -10,7 +8,7 @@ use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
|
||||
use rustc_lexer::{
|
||||
Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
|
||||
};
|
||||
use rustc_literal_escaper::{EscapeError, Mode, unescape_mixed, unescape_unicode};
|
||||
use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
|
||||
use rustc_session::lint::BuiltinLintDiag;
|
||||
use rustc_session::lint::builtin::{
|
||||
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
|
||||
@@ -702,7 +700,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
}
|
||||
err.emit()
|
||||
}
|
||||
self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
|
||||
self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
|
||||
}
|
||||
rustc_lexer::LiteralKind::Byte { terminated } => {
|
||||
if !terminated {
|
||||
@@ -714,7 +712,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
.with_code(E0763)
|
||||
.emit()
|
||||
}
|
||||
self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
|
||||
self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
|
||||
}
|
||||
rustc_lexer::LiteralKind::Str { terminated } => {
|
||||
if !terminated {
|
||||
@@ -726,7 +724,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
.with_code(E0765)
|
||||
.emit()
|
||||
}
|
||||
self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
|
||||
self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
|
||||
}
|
||||
rustc_lexer::LiteralKind::ByteStr { terminated } => {
|
||||
if !terminated {
|
||||
@@ -738,7 +736,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
.with_code(E0766)
|
||||
.emit()
|
||||
}
|
||||
self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
|
||||
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
|
||||
// b" "
|
||||
}
|
||||
rustc_lexer::LiteralKind::CStr { terminated } => {
|
||||
if !terminated {
|
||||
@@ -750,13 +749,14 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
.with_code(E0767)
|
||||
.emit()
|
||||
}
|
||||
self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
|
||||
self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
|
||||
}
|
||||
rustc_lexer::LiteralKind::RawStr { n_hashes } => {
|
||||
if let Some(n_hashes) = n_hashes {
|
||||
let n = u32::from(n_hashes);
|
||||
let kind = token::StrRaw(n_hashes);
|
||||
self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
|
||||
self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
|
||||
// r##" "##
|
||||
} else {
|
||||
self.report_raw_str_error(start, 1);
|
||||
}
|
||||
@@ -765,7 +765,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
if let Some(n_hashes) = n_hashes {
|
||||
let n = u32::from(n_hashes);
|
||||
let kind = token::ByteStrRaw(n_hashes);
|
||||
self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
|
||||
self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
|
||||
// br##" "##
|
||||
} else {
|
||||
self.report_raw_str_error(start, 2);
|
||||
}
|
||||
@@ -774,7 +775,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
if let Some(n_hashes) = n_hashes {
|
||||
let n = u32::from(n_hashes);
|
||||
let kind = token::CStrRaw(n_hashes);
|
||||
self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
|
||||
self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
|
||||
// cr##" "##
|
||||
} else {
|
||||
self.report_raw_str_error(start, 2);
|
||||
}
|
||||
@@ -1091,7 +1093,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
|
||||
}
|
||||
|
||||
fn cook_common(
|
||||
fn cook_quoted(
|
||||
&self,
|
||||
mut kind: token::LitKind,
|
||||
mode: Mode,
|
||||
@@ -1099,32 +1101,28 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
end: BytePos,
|
||||
prefix_len: u32,
|
||||
postfix_len: u32,
|
||||
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
|
||||
) -> (token::LitKind, Symbol) {
|
||||
let content_start = start + BytePos(prefix_len);
|
||||
let content_end = end - BytePos(postfix_len);
|
||||
let lit_content = self.str_from_to(content_start, content_end);
|
||||
unescape(lit_content, mode, &mut |range, result| {
|
||||
// Here we only check for errors. The actual unescaping is done later.
|
||||
if let Err(err) = result {
|
||||
let span_with_quotes = self.mk_sp(start, end);
|
||||
let (start, end) = (range.start as u32, range.end as u32);
|
||||
let lo = content_start + BytePos(start);
|
||||
let hi = lo + BytePos(end - start);
|
||||
let span = self.mk_sp(lo, hi);
|
||||
let is_fatal = err.is_fatal();
|
||||
if let Some(guar) = emit_unescape_error(
|
||||
self.dcx(),
|
||||
lit_content,
|
||||
span_with_quotes,
|
||||
span,
|
||||
mode,
|
||||
range,
|
||||
err,
|
||||
) {
|
||||
assert!(is_fatal);
|
||||
kind = token::Err(guar);
|
||||
}
|
||||
check_for_errors(lit_content, mode, |range, err| {
|
||||
let span_with_quotes = self.mk_sp(start, end);
|
||||
let (start, end) = (range.start as u32, range.end as u32);
|
||||
let lo = content_start + BytePos(start);
|
||||
let hi = lo + BytePos(end - start);
|
||||
let span = self.mk_sp(lo, hi);
|
||||
let is_fatal = err.is_fatal();
|
||||
if let Some(guar) = emit_unescape_error(
|
||||
self.dcx(),
|
||||
lit_content,
|
||||
span_with_quotes,
|
||||
span,
|
||||
mode,
|
||||
range,
|
||||
err,
|
||||
) {
|
||||
assert!(is_fatal);
|
||||
kind = token::Err(guar);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1137,34 +1135,6 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
|
||||
};
|
||||
(kind, sym)
|
||||
}
|
||||
|
||||
fn cook_unicode(
|
||||
&self,
|
||||
kind: token::LitKind,
|
||||
mode: Mode,
|
||||
start: BytePos,
|
||||
end: BytePos,
|
||||
prefix_len: u32,
|
||||
postfix_len: u32,
|
||||
) -> (token::LitKind, Symbol) {
|
||||
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
|
||||
unescape_unicode(src, mode, &mut |span, result| callback(span, result.map(drop)))
|
||||
})
|
||||
}
|
||||
|
||||
fn cook_mixed(
|
||||
&self,
|
||||
kind: token::LitKind,
|
||||
mode: Mode,
|
||||
start: BytePos,
|
||||
end: BytePos,
|
||||
prefix_len: u32,
|
||||
postfix_len: u32,
|
||||
) -> (token::LitKind, Symbol) {
|
||||
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
|
||||
unescape_mixed(src, mode, &mut |span, result| callback(span, result.map(drop)))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn nfc_normalize(string: &str) -> Symbol {
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
# tidy-alphabetical-start
|
||||
rustc-literal-escaper = "0.0.2"
|
||||
rustc-literal-escaper = "0.0.4"
|
||||
rustc_lexer = { path = "../rustc_lexer" }
|
||||
# tidy-alphabetical-end
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ use std::ops::Range;
|
||||
pub use Alignment::*;
|
||||
pub use Count::*;
|
||||
pub use Position::*;
|
||||
use rustc_literal_escaper::{Mode, unescape_unicode};
|
||||
|
||||
/// The type of format string that we are parsing.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
@@ -320,7 +319,7 @@ impl<'input> Parser<'input> {
|
||||
let without_quotes = &snippet[1..snippet.len() - 1];
|
||||
let (mut ok, mut vec) = (true, vec![]);
|
||||
let mut chars = input.chars();
|
||||
unescape_unicode(without_quotes, Mode::Str, &mut |range, res| match res {
|
||||
rustc_literal_escaper::unescape_str(without_quotes, |range, res| match res {
|
||||
Ok(ch) if ok && chars.next().is_some_and(|c| ch == c) => {
|
||||
vec.push((range, ch));
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ test = false
|
||||
doctest = false
|
||||
|
||||
[dependencies]
|
||||
rustc-literal-escaper = "0.0.2"
|
||||
rustc-literal-escaper = "0.0.4"
|
||||
|
||||
[features]
|
||||
rustc-dep-of-std = []
|
||||
|
||||
@@ -271,10 +271,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustc-literal-escaper"
|
||||
version = "0.0.2"
|
||||
version = "0.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0041b6238913c41fe704213a4a9329e2f685a156d1781998128b4149c230ad04"
|
||||
checksum = "ab03008eb631b703dd16978282ae36c73282e7922fe101a4bd072a40ecea7b8b"
|
||||
dependencies = [
|
||||
"rustc-std-workspace-core",
|
||||
"rustc-std-workspace-std",
|
||||
]
|
||||
|
||||
|
||||
@@ -55,8 +55,7 @@ rustflags = ["-Cpanic=abort"]
|
||||
rustflags = ["-Cpanic=abort"]
|
||||
|
||||
[patch.crates-io]
|
||||
# See comments in `library/rustc-std-workspace-core/README.md` for what's going on
|
||||
# here
|
||||
# See comments in `library/rustc-std-workspace-core/README.md` for what's going on here
|
||||
rustc-std-workspace-core = { path = 'rustc-std-workspace-core' }
|
||||
rustc-std-workspace-alloc = { path = 'rustc-std-workspace-alloc' }
|
||||
rustc-std-workspace-std = { path = 'rustc-std-workspace-std' }
|
||||
|
||||
@@ -9,7 +9,7 @@ std = { path = "../std" }
|
||||
# `core` when resolving doc links. Without this line a different `core` will be
|
||||
# loaded from sysroot causing duplicate lang items and other similar errors.
|
||||
core = { path = "../core" }
|
||||
rustc-literal-escaper = { version = "0.0.2", features = ["rustc-dep-of-std"] }
|
||||
rustc-literal-escaper = { version = "0.0.4", features = ["rustc-dep-of-std"] }
|
||||
|
||||
[features]
|
||||
default = ["rustc-dep-of-std"]
|
||||
|
||||
@@ -56,7 +56,7 @@ use std::{error, fmt};
|
||||
pub use diagnostic::{Diagnostic, Level, MultiSpan};
|
||||
#[unstable(feature = "proc_macro_value", issue = "136652")]
|
||||
pub use rustc_literal_escaper::EscapeError;
|
||||
use rustc_literal_escaper::{MixedUnit, Mode, byte_from_char, unescape_mixed, unescape_unicode};
|
||||
use rustc_literal_escaper::{MixedUnit, unescape_byte_str, unescape_c_str, unescape_str};
|
||||
#[unstable(feature = "proc_macro_totokens", issue = "130977")]
|
||||
pub use to_tokens::ToTokens;
|
||||
|
||||
@@ -1440,10 +1440,9 @@ impl Literal {
|
||||
// Force-inlining here is aggressive but the closure is
|
||||
// called on every char in the string, so it can be hot in
|
||||
// programs with many long strings containing escapes.
|
||||
unescape_unicode(
|
||||
unescape_str(
|
||||
symbol,
|
||||
Mode::Str,
|
||||
&mut #[inline(always)]
|
||||
#[inline(always)]
|
||||
|_, c| match c {
|
||||
Ok(c) => buf.push(c),
|
||||
Err(err) => {
|
||||
@@ -1472,7 +1471,7 @@ impl Literal {
|
||||
let mut error = None;
|
||||
let mut buf = Vec::with_capacity(symbol.len());
|
||||
|
||||
unescape_mixed(symbol, Mode::CStr, &mut |_span, c| match c {
|
||||
unescape_c_str(symbol, |_span, c| match c {
|
||||
Ok(MixedUnit::Char(c)) => {
|
||||
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
|
||||
}
|
||||
@@ -1511,8 +1510,8 @@ impl Literal {
|
||||
let mut buf = Vec::with_capacity(symbol.len());
|
||||
let mut error = None;
|
||||
|
||||
unescape_unicode(symbol, Mode::ByteStr, &mut |_, c| match c {
|
||||
Ok(c) => buf.push(byte_from_char(c)),
|
||||
unescape_byte_str(symbol, |_, res| match res {
|
||||
Ok(b) => buf.push(b),
|
||||
Err(err) => {
|
||||
if err.is_fatal() {
|
||||
error = Some(ConversionErrorKind::FailedToUnescape(err));
|
||||
|
||||
@@ -2,6 +2,7 @@ use crate::utils::{
|
||||
ErrAction, File, FileUpdater, RustSearcher, Token, UpdateMode, UpdateStatus, expect_action, update_text_region_fn,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use rustc_lexer::{LiteralKind, TokenKind, tokenize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
@@ -342,7 +343,7 @@ fn parse_str_lit(s: &str) -> String {
|
||||
.and_then(|s| s.strip_suffix('"'))
|
||||
.unwrap_or_else(|| panic!("expected quoted string, found `{s}`"));
|
||||
let mut res = String::with_capacity(s.len());
|
||||
rustc_literal_escaper::unescape_unicode(s, mode, &mut |_, ch| {
|
||||
rustc_literal_escaper::unescape_str(s, |range, ch| {
|
||||
if let Ok(ch) = ch {
|
||||
res.push(ch);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ description = "A script to extract the lint documentation for the rustc book."
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
rustc-literal-escaper = "0.0.2"
|
||||
rustc-literal-escaper = "0.0.4"
|
||||
serde_json = "1.0.57"
|
||||
tempfile = "3.1.0"
|
||||
walkdir = "2.3.1"
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
|
||||
use rustc_literal_escaper::{Mode, unescape_unicode};
|
||||
use rustc_literal_escaper::unescape_str;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
mod groups;
|
||||
@@ -218,7 +218,7 @@ impl<'a> LintExtractor<'a> {
|
||||
} else if let Some(text) = line.strip_prefix("#[doc = \"") {
|
||||
let escaped = text.strip_suffix("\"]").unwrap();
|
||||
let mut buf = String::new();
|
||||
unescape_unicode(escaped, Mode::Str, &mut |_, c| match c {
|
||||
unescape_str(escaped, |_, res| match res {
|
||||
Ok(c) => buf.push(c),
|
||||
Err(err) => {
|
||||
assert!(!err.is_fatal(), "failed to unescape string literal")
|
||||
|
||||
Reference in New Issue
Block a user