Auto merge of #60793 - Xanewok:raw-string-cleanup, r=petrochenkov
lexer: Disallow bare CR in raw byte strings Handles bare CR ~but doesn't translate `\r\n` to `\n` yet in raw strings yet~ and translates CRLF to LF in raw strings. As a side-note I think it'd be good to change the `unescape_` to return plain iterators to reduce some boilerplate (e.g. `has_error` could benefit from collecting `Result<T>` and aborting early on errors) but will do that separately, unless I missed something here that prevents it. @matklad @petrochenkov thoughts?
This commit is contained in:
@@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
|
||||
self.ch.is_none()
|
||||
}
|
||||
|
||||
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
|
||||
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
|
||||
let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
|
||||
err.span_label(self.mk_sp(pos, pos), "unterminated raw string");
|
||||
|
||||
@@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
|
||||
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
|
||||
}
|
||||
|
||||
/// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
|
||||
/// escaped character to the error message
|
||||
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
|
||||
let mut m = m.to_string();
|
||||
m.push_str(": ");
|
||||
push_escaped_char(&mut m, c);
|
||||
self.err_span_(from_pos, to_pos, &m[..]);
|
||||
}
|
||||
|
||||
/// Advance peek_token to refer to the next token, and
|
||||
/// possibly update the interner.
|
||||
fn advance_token(&mut self) -> Result<(), ()> {
|
||||
@@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
|
||||
self.validate_byte_str_escape(start_with_quote);
|
||||
(token::ByteStr, symbol)
|
||||
},
|
||||
Some('r') => self.scan_raw_byte_string(),
|
||||
Some('r') => {
|
||||
let (start, end, hash_count) = self.scan_raw_string();
|
||||
let symbol = self.name_from_to(start, end);
|
||||
self.validate_raw_byte_str_escape(start, end);
|
||||
|
||||
(token::ByteStrRaw(hash_count), symbol)
|
||||
}
|
||||
_ => unreachable!(), // Should have been a token::Ident above.
|
||||
};
|
||||
let suffix = self.scan_optional_raw_name();
|
||||
@@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
|
||||
Ok(TokenKind::lit(token::Str, symbol, suffix))
|
||||
}
|
||||
'r' => {
|
||||
let start_bpos = self.pos;
|
||||
self.bump();
|
||||
let mut hash_count: u16 = 0;
|
||||
while self.ch_is('#') {
|
||||
if hash_count == 65535 {
|
||||
let bpos = self.next_pos;
|
||||
self.fatal_span_(start_bpos,
|
||||
bpos,
|
||||
"too many `#` symbols: raw strings may be \
|
||||
delimited by up to 65535 `#` symbols").raise();
|
||||
}
|
||||
self.bump();
|
||||
hash_count += 1;
|
||||
}
|
||||
|
||||
if self.is_eof() {
|
||||
self.fail_unterminated_raw_string(start_bpos, hash_count);
|
||||
} else if !self.ch_is('"') {
|
||||
let last_bpos = self.pos;
|
||||
let curr_char = self.ch.unwrap();
|
||||
self.fatal_span_char(start_bpos,
|
||||
last_bpos,
|
||||
"found invalid character; only `#` is allowed \
|
||||
in raw string delimitation",
|
||||
curr_char).raise();
|
||||
}
|
||||
self.bump();
|
||||
let content_start_bpos = self.pos;
|
||||
let mut content_end_bpos;
|
||||
let mut valid = true;
|
||||
'outer: loop {
|
||||
if self.is_eof() {
|
||||
self.fail_unterminated_raw_string(start_bpos, hash_count);
|
||||
}
|
||||
// if self.ch_is('"') {
|
||||
// content_end_bpos = self.pos;
|
||||
// for _ in 0..hash_count {
|
||||
// self.bump();
|
||||
// if !self.ch_is('#') {
|
||||
// continue 'outer;
|
||||
let c = self.ch.unwrap();
|
||||
match c {
|
||||
'"' => {
|
||||
content_end_bpos = self.pos;
|
||||
for _ in 0..hash_count {
|
||||
self.bump();
|
||||
if !self.ch_is('#') {
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
'\r' => {
|
||||
if !self.nextch_is('\n') {
|
||||
let last_bpos = self.pos;
|
||||
self.err_span_(start_bpos,
|
||||
last_bpos,
|
||||
"bare CR not allowed in raw string, use \\r \
|
||||
instead");
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
self.bump();
|
||||
}
|
||||
|
||||
self.bump();
|
||||
let symbol = if valid {
|
||||
self.name_from_to(content_start_bpos, content_end_bpos)
|
||||
} else {
|
||||
Symbol::intern("??")
|
||||
};
|
||||
let (start, end, hash_count) = self.scan_raw_string();
|
||||
let symbol = self.name_from_to(start, end);
|
||||
self.validate_raw_str_escape(start, end);
|
||||
let suffix = self.scan_optional_raw_name();
|
||||
|
||||
Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
|
||||
@@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
|
||||
id
|
||||
}
|
||||
|
||||
fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
|
||||
/// Scans a raw (byte) string, returning byte position range for `"<literal>"`
|
||||
/// (including quotes) along with `#` character count in `(b)r##..."<literal>"##...`;
|
||||
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
|
||||
let start_bpos = self.pos;
|
||||
self.bump();
|
||||
let mut hash_count = 0;
|
||||
let mut hash_count: u16 = 0;
|
||||
while self.ch_is('#') {
|
||||
if hash_count == 65535 {
|
||||
let bpos = self.next_pos;
|
||||
self.fatal_span_(start_bpos,
|
||||
bpos,
|
||||
"too many `#` symbols: raw byte strings may be \
|
||||
"too many `#` symbols: raw strings may be \
|
||||
delimited by up to 65535 `#` symbols").raise();
|
||||
}
|
||||
self.bump();
|
||||
@@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
|
||||
if self.is_eof() {
|
||||
self.fail_unterminated_raw_string(start_bpos, hash_count);
|
||||
} else if !self.ch_is('"') {
|
||||
let pos = self.pos;
|
||||
let ch = self.ch.unwrap();
|
||||
let last_bpos = self.pos;
|
||||
let curr_char = self.ch.unwrap();
|
||||
self.fatal_span_char(start_bpos,
|
||||
pos,
|
||||
"found invalid character; only `#` is allowed in raw \
|
||||
string delimitation",
|
||||
ch).raise();
|
||||
last_bpos,
|
||||
"found invalid character; only `#` is allowed \
|
||||
in raw string delimitation",
|
||||
curr_char).raise();
|
||||
}
|
||||
self.bump();
|
||||
let content_start_bpos = self.pos;
|
||||
@@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
break;
|
||||
}
|
||||
Some(c) => {
|
||||
if c > '\x7F' {
|
||||
let pos = self.pos;
|
||||
self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
self.bump();
|
||||
}
|
||||
|
||||
self.bump();
|
||||
|
||||
(token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
|
||||
(content_start_bpos, content_end_bpos, hash_count)
|
||||
}
|
||||
|
||||
fn validate_char_escape(&self, start_with_quote: BytePos) {
|
||||
@@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
|
||||
});
|
||||
}
|
||||
|
||||
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
|
||||
self.with_str_from_to(content_start, content_end, |lit: &str| {
|
||||
unescape::unescape_raw_str(lit, &mut |range, c| {
|
||||
if let Err(err) = c {
|
||||
emit_unescape_error(
|
||||
&self.sess.span_diagnostic,
|
||||
lit,
|
||||
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
|
||||
unescape::Mode::Str,
|
||||
range,
|
||||
err,
|
||||
)
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
|
||||
self.with_str_from_to(content_start, content_end, |lit: &str| {
|
||||
unescape::unescape_raw_byte_str(lit, &mut |range, c| {
|
||||
if let Err(err) = c {
|
||||
emit_unescape_error(
|
||||
&self.sess.span_diagnostic,
|
||||
lit,
|
||||
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
|
||||
unescape::Mode::ByteStr,
|
||||
range,
|
||||
err,
|
||||
)
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
|
||||
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
|
||||
unescape::unescape_byte_str(lit, &mut |range, c| {
|
||||
|
||||
Reference in New Issue
Block a user