Introduce ByteSymbol.

It's like `Symbol` but for byte strings. The interner is now shared by
both `Symbol` and `ByteSymbol`. E.g. if you intern `"dog"` and `b"dog"`
you'll get a `Symbol` and a `ByteSymbol` with the same index, and the
underlying bytes will only be stored once.

The motivation for this is to eliminate the `Arc`s in `ast::LitKind`, to
make `ast::LitKind` impl `Copy`, and to avoid the need to arena-allocate
`ast::LitKind` in HIR. The latter change reduces peak memory by a
non-trivial amount on literal-heavy benchmarks such as `deep-vector` and
`tuple-stress`.

`Encoder`, `Decoder`, `SpanEncoder`, and `SpanDecoder` all get some
changes so that they can handle normal strings and byte strings.

This change does slow down compilation of programs that use
`include_bytes!` on large files, because the contents of those files are
now interned (hashed). This makes `include_bytes!` more similar to
`include_str!`, though `include_bytes!` contents still aren't escaped,
and hashing is still much cheaper than escaping.
This commit is contained in:
Nicholas Nethercote
2025-06-02 08:59:29 +10:00
parent ed2d759783
commit 478f8287c0
46 changed files with 449 additions and 269 deletions

View File

@@ -19,7 +19,6 @@
//! - [`UnOp`], [`BinOp`], and [`BinOpKind`]: Unary and binary operators.
use std::borrow::Cow;
use std::sync::Arc;
use std::{cmp, fmt};
pub use GenericArgs::*;
@@ -32,7 +31,7 @@ use rustc_data_structures::tagged_ptr::Tag;
use rustc_macros::{Decodable, Encodable, HashStable_Generic};
pub use rustc_span::AttrId;
use rustc_span::source_map::{Spanned, respan};
use rustc_span::{DUMMY_SP, ErrorGuaranteed, Ident, Span, Symbol, kw, sym};
use rustc_span::{ByteSymbol, DUMMY_SP, ErrorGuaranteed, Ident, Span, Symbol, kw, sym};
use thin_vec::{ThinVec, thin_vec};
pub use crate::format::*;
@@ -1805,10 +1804,17 @@ pub enum ExprKind {
Become(P<Expr>),
/// Bytes included via `include_bytes!`
///
/// Added for optimization purposes to avoid the need to escape
/// large binary blobs - should always behave like [`ExprKind::Lit`]
/// with a `ByteStr` literal.
IncludedBytes(Arc<[u8]>),
///
/// The value is stored as a `ByteSymbol`. It's unfortunate that we need to
/// intern (hash) the bytes because they're likely to be large and unique.
/// But it's necessary because this will eventually be lowered to
/// `LitKind::ByteStr`, which needs a `ByteSymbol` to impl `Copy` and avoid
/// arena allocation.
IncludedBytes(ByteSymbol),
/// A `format_args!()` expression.
FormatArgs(P<FormatArgs>),
@@ -2066,7 +2072,7 @@ impl YieldKind {
}
/// A literal in a meta item.
#[derive(Clone, Encodable, Decodable, Debug, HashStable_Generic)]
#[derive(Clone, Copy, Encodable, Decodable, Debug, HashStable_Generic)]
pub struct MetaItemLit {
/// The original literal as written in the source code.
pub symbol: Symbol,
@@ -2129,16 +2135,18 @@ pub enum LitFloatType {
/// deciding the `LitKind`. This means that float literals like `1f32` are
/// classified by this type as `Float`. This is different to `token::LitKind`
/// which does *not* consider the suffix.
#[derive(Clone, Encodable, Decodable, Debug, Hash, Eq, PartialEq, HashStable_Generic)]
#[derive(Clone, Copy, Encodable, Decodable, Debug, Hash, Eq, PartialEq, HashStable_Generic)]
pub enum LitKind {
/// A string literal (`"foo"`). The symbol is unescaped, and so may differ
/// from the original token's symbol.
Str(Symbol, StrStyle),
/// A byte string (`b"foo"`). Not stored as a symbol because it might be
/// non-utf8, and symbols only allow utf8 strings.
ByteStr(Arc<[u8]>, StrStyle),
/// A C String (`c"foo"`). Guaranteed to only have `\0` at the end.
CStr(Arc<[u8]>, StrStyle),
/// A byte string (`b"foo"`). The symbol is unescaped, and so may differ
/// from the original token's symbol.
ByteStr(ByteSymbol, StrStyle),
/// A C String (`c"foo"`). Guaranteed to only have `\0` at the end. The
/// symbol is unescaped, and so may differ from the original token's
/// symbol.
CStr(ByteSymbol, StrStyle),
/// A byte char (`b'f'`).
Byte(u8),
/// A character literal (`'a'`).

View File

@@ -5,7 +5,7 @@ use std::{ascii, fmt, str};
use rustc_literal_escaper::{
MixedUnit, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str,
};
use rustc_span::{Span, Symbol, kw, sym};
use rustc_span::{ByteSymbol, Span, Symbol, kw, sym};
use tracing::debug;
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
@@ -116,13 +116,12 @@ impl LitKind {
assert!(!err.is_fatal(), "failed to unescape string literal")
}
});
LitKind::ByteStr(buf.into(), StrStyle::Cooked)
LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Cooked)
}
token::ByteStrRaw(n) => {
// Raw strings have no escapes so we can convert the symbol
// directly to a `Arc<u8>`.
// Raw byte strings have no escapes so no work is needed here.
let buf = symbol.as_str().to_owned().into_bytes();
LitKind::ByteStr(buf.into(), StrStyle::Raw(n))
LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Raw(n))
}
token::CStr => {
let s = symbol.as_str();
@@ -137,7 +136,7 @@ impl LitKind {
}
});
buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Cooked)
LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Cooked)
}
token::CStrRaw(n) => {
// Raw strings have no escapes so we can convert the symbol
@@ -145,7 +144,7 @@ impl LitKind {
// char.
let mut buf = symbol.as_str().to_owned().into_bytes();
buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Raw(n))
LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Raw(n))
}
token::Err(guar) => LitKind::Err(guar),
})
@@ -167,12 +166,12 @@ impl fmt::Display for LitKind {
delim = "#".repeat(n as usize),
string = sym
)?,
LitKind::ByteStr(ref bytes, StrStyle::Cooked) => {
write!(f, "b\"{}\"", escape_byte_str_symbol(bytes))?
LitKind::ByteStr(ref byte_sym, StrStyle::Cooked) => {
write!(f, "b\"{}\"", escape_byte_str_symbol(byte_sym.as_byte_str()))?
}
LitKind::ByteStr(ref bytes, StrStyle::Raw(n)) => {
LitKind::ByteStr(ref byte_sym, StrStyle::Raw(n)) => {
// Unwrap because raw byte string literals can only contain ASCII.
let symbol = str::from_utf8(bytes).unwrap();
let symbol = str::from_utf8(byte_sym.as_byte_str()).unwrap();
write!(
f,
"br{delim}\"{string}\"{delim}",
@@ -181,11 +180,11 @@ impl fmt::Display for LitKind {
)?;
}
LitKind::CStr(ref bytes, StrStyle::Cooked) => {
write!(f, "c\"{}\"", escape_byte_str_symbol(bytes))?
write!(f, "c\"{}\"", escape_byte_str_symbol(bytes.as_byte_str()))?
}
LitKind::CStr(ref bytes, StrStyle::Raw(n)) => {
// This can only be valid UTF-8.
let symbol = str::from_utf8(bytes).unwrap();
let symbol = str::from_utf8(bytes.as_byte_str()).unwrap();
write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize),)?;
}
LitKind::Int(n, ty) => {