Actually translate CRLF in raw byte strings and unify unescape impl

This commit is contained in:
Igor Matuszewski
2019-06-09 14:43:31 +02:00
parent 3c1d352dc4
commit 735ac057bb
3 changed files with 69 additions and 25 deletions

View File

@@ -1348,7 +1348,7 @@ impl<'a> StringReader<'a> {
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
self.with_str_from_to(content_start, content_end, |lit: &str| {
unescape::unescape_raw_str(lit, unescape::Mode::Str, &mut |range, c| {
unescape::unescape_raw_str(lit, &mut |range, c| {
if let Err(err) = c {
emit_unescape_error(
&self.sess.span_diagnostic,
@@ -1365,7 +1365,7 @@ impl<'a> StringReader<'a> {
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
self.with_str_from_to(content_start, content_end, |lit: &str| {
unescape::unescape_raw_str(lit, unescape::Mode::ByteStr, &mut |range, c| {
unescape::unescape_raw_byte_str(lit, &mut |range, c| {
if let Err(err) = c {
emit_unescape_error(
&self.sess.span_diagnostic,

View File

@@ -4,8 +4,9 @@ use crate::ast::{self, Lit, LitKind};
use crate::parse::parser::Parser;
use crate::parse::PResult;
use crate::parse::token::{self, Token, TokenKind};
use crate::parse::unescape::{self, unescape_str, unescape_byte_str, unescape_raw_str};
use crate::parse::unescape::{unescape_char, unescape_byte};
use crate::parse::unescape::{unescape_str, unescape_byte_str};
use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str};
use crate::print::pprust;
use crate::symbol::{kw, sym, Symbol};
use crate::tokenstream::{TokenStream, TokenTree};
@@ -144,7 +145,7 @@ impl LitKind {
let symbol = if s.contains('\r') {
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_str(&s, unescape::Mode::Str, &mut |_, unescaped_char| {
unescape_raw_str(&s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
@@ -172,7 +173,26 @@ impl LitKind {
buf.shrink_to_fit();
LitKind::ByteStr(Lrc::new(buf))
}
token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())),
token::ByteStrRaw(_) => {
let s = symbol.as_str();
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_byte_str(&s, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
buf
} else {
symbol.to_string().into_bytes()
};
LitKind::ByteStr(Lrc::new(bytes))
},
token::Err => LitKind::Err(symbol),
})
}

View File

@@ -71,29 +71,24 @@ where
/// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR.
pub(crate) fn unescape_raw_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
pub(crate) fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let mut byte_offset: usize = 0;
unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
}
let mut chars = literal_text.chars().peekable();
while let Some(curr) = chars.next() {
let (result, scanned) = match (curr, chars.peek()) {
('\r', Some('\n')) => {
chars.next();
(Ok('\n'), [Some('\r'), Some('\n')])
},
('\r', _) =>
(Err(EscapeError::BareCarriageReturn), [Some('\r'), None]),
(c, _) if mode.is_bytes() && c > '\x7F' =>
(Err(EscapeError::NonAsciiCharInByteString), [Some(c), None]),
(c, _) => (Ok(c), [Some(c), None]),
};
let len_utf8: usize = scanned.iter().filter_map(|&x| x).map(char::len_utf8).sum();
callback(byte_offset..(byte_offset + len_utf8), result);
byte_offset += len_utf8;
}
/// Takes the contents of a raw byte string literal (without quotes) and
/// produces a sequence of bytes or errors.
///
/// Each callback invocation receives the byte range of `literal_text` that
/// was consumed, together with either the resulting byte or the error found
/// at that position.
///
/// NOTE: Raw byte strings do not perform any explicit character escaping.
/// The work is delegated to `unescape_raw_str_or_byte_str` with
/// `Mode::ByteStr`; every successfully produced `char` is then narrowed to a
/// `u8` via `byte_from_char`, while errors are passed through unchanged.
pub(crate) fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    // Adapter that turns the char-based results of the shared helper into
    // byte-based results expected by the caller.
    let mut forward = |span: Range<usize>, unescaped: Result<char, EscapeError>| {
        callback(span, unescaped.map(byte_from_char))
    };
    unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut forward)
}
#[derive(Debug, Clone, Copy)]
@@ -284,9 +279,38 @@ where
}
}
/// Walks the contents of a raw (byte) string literal (without quotes) and
/// reports every resulting character, or error, to `callback`.
///
/// The first callback argument is the byte range of `literal_text` covered by
/// the reported item; ranges are emitted in order and are contiguous.
///
/// NOTE: Raw strings do not perform any explicit character escaping. The only
/// processing done here is:
/// * `\r\n` is reported as a single `'\n'` spanning both source characters,
/// * a `\r` not followed by `\n` is reported as `BareCarriageReturn`,
/// * when `mode.is_bytes()` holds, a non-ASCII character is reported as
///   `NonAsciiCharInByteString`.
fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    let mut start: usize = 0;
    let mut rest = literal_text.chars().peekable();
    while let Some(first) = rest.next() {
        // Number of source bytes covered by the item reported this iteration.
        let mut consumed = first.len_utf8();
        let outcome = if first == '\r' {
            if rest.peek() == Some(&'\n') {
                // CRLF collapses to LF; the range still covers both characters.
                rest.next();
                consumed += '\n'.len_utf8();
                Ok('\n')
            } else {
                Err(EscapeError::BareCarriageReturn)
            }
        } else if mode.is_bytes() && !first.is_ascii() {
            Err(EscapeError::NonAsciiCharInByteString)
        } else {
            Ok(first)
        };
        let end = start + consumed;
        callback(start..end, outcome);
        start = end;
    }
}
/// Converts a `char` whose code point fits in a single byte into that `u8`.
///
/// # Panics
///
/// Panics if the code point of `c` is greater than `u8::max_value()`. The
/// assertion message indicates callers are expected to pass only characters
/// that were already accepted in a byte / byte-string mode.
fn byte_from_char(c: char) -> u8 {
    let res = c as u32;
    // One check is enough: a second, identical assertion with the older
    // "Mode::Byte" wording would always fire first and hide this message.
    assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)");
    // Lossless: `res` was just verified to be within the `u8` range.
    res as u8
}