Auto merge of #60793 - Xanewok:raw-string-cleanup, r=petrochenkov

lexer: Disallow bare CR in raw byte strings. Handles bare CR ~but doesn't translate `\r\n` to `\n` in raw strings yet~ and translates CRLF to LF in raw strings. As a side note, I think it'd be good to change the `unescape_*` functions to return plain iterators to reduce some boilerplate (e.g. `has_error` could benefit from collecting `Result<T>` and aborting early on errors), but I will do that separately, unless I missed something here that prevents it. @matklad @petrochenkov thoughts?
2019-06-10 23:32:12 +00:00
parent 02564de47b 630d5f355f
commit 5e2c11034f
8 changed files with 180 additions and 135 deletions
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
        self.ch.is_none()
    }
-    fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
+    fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
        let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
        err.span_label(self.mk_sp(pos, pos), "unterminated raw string");
@@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
        self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
    }
    /// Emits a lexical error covering [`from_pos`, `to_pos`) whose text is the
    /// message `m`, followed by `": "` and the escaped form of the character `c`.
    fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
        // Start from "<m>: " and let `push_escaped_char` append the character.
        let mut message = format!("{}: ", m);
        push_escaped_char(&mut message, c);
        self.err_span_(from_pos, to_pos, message.as_str());
    }
    /// Advance peek_token to refer to the next token, and
    /// possibly update the interner.
    fn advance_token(&mut self) -> Result<(), ()> {
@@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
                        self.validate_byte_str_escape(start_with_quote);
                        (token::ByteStr, symbol)
                    },
-                    Some('r') => self.scan_raw_byte_string(),
+                    Some('r') => {
                        let (start, end, hash_count) = self.scan_raw_string();
                        let symbol = self.name_from_to(start, end);
                        self.validate_raw_byte_str_escape(start, end);
                        (token::ByteStrRaw(hash_count), symbol)
                    }
                    _ => unreachable!(),  // Should have been a token::Ident above.
                };
                let suffix = self.scan_optional_raw_name();
@@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
                Ok(TokenKind::lit(token::Str, symbol, suffix))
            }
            'r' => {
-                let start_bpos = self.pos;
+                let (start, end, hash_count) = self.scan_raw_string();
-                self.bump();
+                let symbol = self.name_from_to(start, end);
-                let mut hash_count: u16 = 0;
+                self.validate_raw_str_escape(start, end);
                while self.ch_is('#') {
                    if hash_count == 65535 {
                        let bpos = self.next_pos;
                        self.fatal_span_(start_bpos,
                                         bpos,
                                         "too many `#` symbols: raw strings may be \
                                         delimited by up to 65535 `#` symbols").raise();
                    }
                    self.bump();
                    hash_count += 1;
                }
                if self.is_eof() {
                    self.fail_unterminated_raw_string(start_bpos, hash_count);
                } else if !self.ch_is('"') {
                    let last_bpos = self.pos;
                    let curr_char = self.ch.unwrap();
                    self.fatal_span_char(start_bpos,
                                         last_bpos,
                                         "found invalid character; only `#` is allowed \
                                         in raw string delimitation",
                                         curr_char).raise();
                }
                self.bump();
                let content_start_bpos = self.pos;
                let mut content_end_bpos;
                let mut valid = true;
                'outer: loop {
                    if self.is_eof() {
                        self.fail_unterminated_raw_string(start_bpos, hash_count);
                    }
                    // if self.ch_is('"') {
                    // content_end_bpos = self.pos;
                    // for _ in 0..hash_count {
                    // self.bump();
                    // if !self.ch_is('#') {
                    // continue 'outer;
                    let c = self.ch.unwrap();
                    match c {
                        '"' => {
                            content_end_bpos = self.pos;
                            for _ in 0..hash_count {
                                self.bump();
                                if !self.ch_is('#') {
                                    continue 'outer;
                                }
                            }
                            break;
                        }
                        '\r' => {
                            if !self.nextch_is('\n') {
                                let last_bpos = self.pos;
                                self.err_span_(start_bpos,
                                               last_bpos,
                                               "bare CR not allowed in raw string, use \\r \
                                                instead");
                                valid = false;
                            }
                        }
                        _ => (),
                    }
                    self.bump();
                }
                self.bump();
                let symbol = if valid {
                    self.name_from_to(content_start_bpos, content_end_bpos)
                } else {
                    Symbol::intern("??")
                };
                let suffix = self.scan_optional_raw_name();
                Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
@@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
        id
    }
-    fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
+    /// Scans a raw (byte) string, returning byte position range for `<literal>`
    /// (excluding quotes) along with `#` character count in `(b)r##..."<literal>"##...`;
    fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
        let start_bpos = self.pos;
        self.bump();
-        let mut hash_count = 0;
+        let mut hash_count: u16 = 0;
        while self.ch_is('#') {
            if hash_count == 65535 {
                let bpos = self.next_pos;
                self.fatal_span_(start_bpos,
                                 bpos,
-                                 "too many `#` symbols: raw byte strings may be \
+                                 "too many `#` symbols: raw strings may be \
                                 delimited by up to 65535 `#` symbols").raise();
            }
            self.bump();
@@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
        if self.is_eof() {
            self.fail_unterminated_raw_string(start_bpos, hash_count);
        } else if !self.ch_is('"') {
-            let pos = self.pos;
+            let last_bpos = self.pos;
-            let ch = self.ch.unwrap();
+            let curr_char = self.ch.unwrap();
            self.fatal_span_char(start_bpos,
-                                        pos,
+                                 last_bpos,
-                                        "found invalid character; only `#` is allowed in raw \
+                                 "found invalid character; only `#` is allowed \
-                                         string delimitation",
+                                 in raw string delimitation",
-                                        ch).raise();
+                                 curr_char).raise();
        }
        self.bump();
        let content_start_bpos = self.pos;
@@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
                    }
                    break;
                }
-                Some(c) => {
+                _ => (),
                    if c > '\x7F' {
                        let pos = self.pos;
                        self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
                    }
                }
            }
            self.bump();
        }
        self.bump();
-        (token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
+        (content_start_bpos, content_end_bpos, hash_count)
    }
    fn validate_char_escape(&self, start_with_quote: BytePos) {
@@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
        });
    }
    fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
        self.with_str_from_to(content_start, content_end, |lit: &str| {
            unescape::unescape_raw_str(lit, &mut |range, c| {
                if let Err(err) = c {
                    emit_unescape_error(
                        &self.sess.span_diagnostic,
                        lit,
                        self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
                        unescape::Mode::Str,
                        range,
                        err,
                    )
                }
            })
        });
    }
    fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
        self.with_str_from_to(content_start, content_end, |lit: &str| {
            unescape::unescape_raw_byte_str(lit, &mut |range, c| {
                if let Err(err) = c {
                    emit_unescape_error(
                        &self.sess.span_diagnostic,
                        lit,
                        self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
                        unescape::Mode::ByteStr,
                        range,
                        err,
                    )
                }
            })
        });
    }
    fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
        self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
            unescape::unescape_byte_str(lit, &mut |range, c| {
--- a/src/libsyntax/parse/literal.rs
+++ b/src/libsyntax/parse/literal.rs
@@ -4,7 +4,9 @@ use crate::ast::{self, Lit, LitKind};
 use crate::parse::parser::Parser;
 use crate::parse::PResult;
 use crate::parse::token::{self, Token, TokenKind};
-use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
+use crate::parse::unescape::{unescape_char, unescape_byte};
 use crate::parse::unescape::{unescape_str, unescape_byte_str};
 use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str};
 use crate::print::pprust;
 use crate::symbol::{kw, sym, Symbol};
 use crate::tokenstream::{TokenStream, TokenTree};
@@ -141,7 +143,17 @@ impl LitKind {
                // Ditto.
                let s = symbol.as_str();
                let symbol = if s.contains('\r') {
-                    Symbol::intern(&raw_str_lit(&s))
+                    let mut buf = String::with_capacity(s.len());
                    let mut error = Ok(());
                    unescape_raw_str(&s, &mut |_, unescaped_char| {
                        match unescaped_char {
                            Ok(c) => buf.push(c),
                            Err(_) => error = Err(LitError::LexerError),
                        }
                    });
                    error?;
                    buf.shrink_to_fit();
                    Symbol::intern(&buf)
                } else {
                    symbol
                };
@@ -161,7 +173,26 @@ impl LitKind {
                buf.shrink_to_fit();
                LitKind::ByteStr(Lrc::new(buf))
            }
-            token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())),
+            token::ByteStrRaw(_) => {
                let s = symbol.as_str();
                let bytes = if s.contains('\r') {
                    let mut buf = Vec::with_capacity(s.len());
                    let mut error = Ok(());
                    unescape_raw_byte_str(&s, &mut |_, unescaped_byte| {
                        match unescaped_byte {
                            Ok(c) => buf.push(c),
                            Err(_) => error = Err(LitError::LexerError),
                        }
                    });
                    error?;
                    buf.shrink_to_fit();
                    buf
                } else {
                    symbol.to_string().into_bytes()
                };
                LitKind::ByteStr(Lrc::new(bytes))
            },
            token::Err => LitKind::Err(symbol),
        })
    }
@@ -353,29 +384,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
    }
 }
 /// Parses a string representing a raw string literal into its final form. The
 /// only operation this does is convert embedded CRLF into a single LF.
 ///
 /// # Panics
 ///
 /// Panics with "lexer accepted bare CR" if `lit` contains a `\r` that is not
 /// immediately followed by `\n`, including a `\r` at the very end of `lit`.
 fn raw_str_lit(lit: &str) -> String {
    debug!("raw_str_lit: {:?}", lit);
    let mut res = String::with_capacity(lit.len());
    let mut chars = lit.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // Compare against `Some(&'\n')` instead of unwrapping the peeked
            // value, so that a trailing CR reports the same invariant
            // violation as any other bare CR rather than an opaque
            // `unwrap` failure.
            if chars.peek() != Some(&'\n') {
                panic!("lexer accepted bare CR");
            }
            // Skip the LF of the CRLF pair; only a single LF is emitted.
            chars.next();
            res.push('\n');
        } else {
            res.push(c);
        }
    }
    res.shrink_to_fit();
    res
 }
 // Checks if `s` looks like i32 or u1234 etc.
 fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
    s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())
--- a/src/libsyntax/parse/unescape.rs
+++ b/src/libsyntax/parse/unescape.rs
@@ -1,4 +1,4 @@
-//! Utilities for validating  string and char literals and turning them into
+//! Utilities for validating string and char literals and turning them into
 //! values they represent.
 use std::str::Chars;
@@ -12,6 +12,7 @@ pub(crate) enum EscapeError {
    LoneSlash,
    InvalidEscape,
    BareCarriageReturn,
    BareCarriageReturnInRawString,
    EscapeOnlyChar,
    TooShortHexEscape,
@@ -29,6 +30,7 @@ pub(crate) enum EscapeError {
    UnicodeEscapeInByte,
    NonAsciiCharInByte,
    NonAsciiCharInByteString,
 }
 /// Takes a contents of a char literal (without quotes), and returns an
@@ -66,6 +68,30 @@ where
    })
 }
 /// Takes the contents of a raw string literal (without quotes) and produces a
 /// sequence of characters or errors, each passed to `callback` together with
 /// the range of `literal_text` it was produced from.
 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 /// only translate CRLF to LF and produce errors on bare CR.
 pub(crate) fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
 where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
    unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
 }
 /// Takes the contents of a raw byte string literal (without quotes) and
 /// produces a sequence of bytes or errors, each passed to `callback` together
 /// with the range of `literal_text` it was produced from.
 /// NOTE: Raw byte strings do not perform any explicit character escaping, here
 /// we only translate CRLF to LF and produce errors on bare CR.
 pub(crate) fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
 where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
 {
    unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
        // Narrow each successfully produced `char` to a `u8`; errors are
        // forwarded unchanged.
        callback(range, char.map(byte_from_char))
    })
 }
 #[derive(Debug, Clone, Copy)]
 pub(crate) enum Mode {
    Char,
@@ -254,9 +280,40 @@ where
    }
 }
 /// Walks the contents of a raw (byte) string literal (without quotes) and
 /// hands `callback` one result per item together with the byte range of
 /// `literal_text` it was read from.
 ///
 /// No escape sequences are interpreted. A CRLF pair is reported as a single
 /// `'\n'`, any other CR is reported as `BareCarriageReturnInRawString`, and
 /// when `mode.is_bytes()` holds a non-ASCII character is reported as
 /// `NonAsciiCharInByteString`. Every other character is passed through as is.
 fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
 where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
    assert!(mode.in_double_quotes());
    let total_len = literal_text.len();

    let mut rest = literal_text.chars();
    while let Some(first) = rest.next() {
        // `rest` has already moved past `first`, so step back by its width.
        let start = total_len - rest.as_str().len() - first.len_utf8();
        let result = if first == '\r' {
            if rest.as_str().starts_with('\n') {
                // Consume the LF so the CRLF pair yields exactly one item.
                rest.next();
                Ok('\n')
            } else {
                Err(EscapeError::BareCarriageReturnInRawString)
            }
        } else if mode.is_bytes() && !first.is_ascii() {
            Err(EscapeError::NonAsciiCharInByteString)
        } else {
            Ok(first)
        };
        let end = total_len - rest.as_str().len();
        callback(start..end, result);
    }
 }
 /// Converts `c` to a `u8`, asserting that its code point does not exceed
 /// `u8::max_value()`.
 fn byte_from_char(c: char) -> u8 {
    let res = c as u32;
-    assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte");
+    assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)");
    res as u8
 }
--- a/src/libsyntax/parse/unescape_error_reporting.rs
+++ b/src/libsyntax/parse/unescape_error_reporting.rs
@@ -80,6 +80,11 @@ pub(crate) fn emit_unescape_error(
            };
            handler.span_err(span, msg);
        }
        EscapeError::BareCarriageReturnInRawString => {
            assert!(mode.in_double_quotes());
            let msg = "bare CR not allowed in raw string";
            handler.span_err(span, msg);
        }
        EscapeError::InvalidEscape => {
            let (c, span) = last_char();
@@ -124,6 +129,11 @@ pub(crate) fn emit_unescape_error(
            handler.span_err(span, "byte constant must be ASCII. \
                                    Use a \\xHH escape for a non-ASCII byte")
        }
        EscapeError::NonAsciiCharInByteString => {
            assert!(mode.is_bytes());
            let (_c, span) = last_char();
            handler.span_err(span, "raw byte string must be ASCII")
        }
        EscapeError::OutOfRangeHexEscape => {
            handler.span_err(span, "this form of character escape may only be used \
                                    with characters in the range [\\x00-\\x7f]")
--- a/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs
+++ b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs
@@ -30,6 +30,9 @@ literal";
    let s = r"string
 literal";
    assert_eq!(s, "string\nliteral");
    let s = br"byte string
 literal";
    assert_eq!(s, "byte string\nliteral".as_bytes());
    // validate that our source file has CRLF endings
    let source = include_str!("lexer-crlf-line-endings-string-literal-doc-comment.rs");
--- a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr
+++ b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr
@@ -28,11 +28,11 @@ error: bare CR not allowed in string, use \r instead
 error: bare CR not allowed in string, use \r instead
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:21:18
-   |
+   |
-LL |     let _s = "foo
+LL |     let _s = "foo
 bar";
   |                  ^
-
+
 error: bare CR not allowed in raw string
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:19
   |
--- a/src/test/ui/parser/raw-byte-string-literals.rs
+++ b/src/test/ui/parser/raw-byte-string-literals.rs
@@ -1,4 +1,7 @@
 // ignore-tidy-cr
 // compile-flags: -Z continue-parse-after-error
 pub fn main() {
    br"a
 "; //~ ERROR bare CR not allowed in raw string
    br"é";  //~ ERROR raw byte string must be ASCII
    br##~"a"~##;  //~ ERROR only `#` is allowed in raw string delimitation
--- a/src/test/ui/parser/raw-byte-string-literals.stderr
+++ b/src/test/ui/parser/raw-byte-string-literals.stderr
@@ -1,14 +1,20 @@
-error: raw byte string must be ASCII: \u{e9}
+error: bare CR not allowed in raw string
-  --> $DIR/raw-byte-string-literals.rs:2:8
+  --> $DIR/raw-byte-string-literals.rs:4:9
   |
 LL |     br"a
 ";
   |         ^
 error: raw byte string must be ASCII
  --> $DIR/raw-byte-string-literals.rs:5:8
   |
 LL |     br"é";
   |        ^
-  --> $DIR/raw-byte-string-literals.rs:3:6
+error: found invalid character; only `#` is allowed in raw string delimitation: ~
  --> $DIR/raw-byte-string-literals.rs:6:6
   |
 LL |     br##~"a"~##;
   |      ^^^
-error: aborting due to 2 previous errors
+
 error: aborting due to 3 previous errors