Auto merge of #60793 - Xanewok:raw-string-cleanup, r=petrochenkov

lexer: Disallow bare CR in raw byte strings Handles bare CR ~but doesn't translate `\r\n` to `\n` yet in raw strings yet~ and translates CRLF to LF in raw strings. As a side-note I think it'd be good to change the `unescape_` to return plain iterators to reduce some boilerplate (e.g. `has_error` could benefit from collecting `Result<T>` and aborting early on errors) but will do that separately, unless I missed something here that prevents it. @matklad @petrochenkov thoughts?
2019-06-10 23:32:12 +00:00
parent 02564de47b 630d5f355f
commit 5e2c11034f
8 changed files with 180 additions and 135 deletions
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
        self.ch.is_none()
    }

-    fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
+    fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
        let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
        err.span_label(self.mk_sp(pos, pos), "unterminated raw string");

@@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
        self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
    }

-    /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
-    /// escaped character to the error message
-    fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
-        let mut m = m.to_string();
-        m.push_str(": ");
-        push_escaped_char(&mut m, c);
-        self.err_span_(from_pos, to_pos, &m[..]);
-    }
-
    /// Advance peek_token to refer to the next token, and
    /// possibly update the interner.
    fn advance_token(&mut self) -> Result<(), ()> {
@@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
                        self.validate_byte_str_escape(start_with_quote);
                        (token::ByteStr, symbol)
                    },
-                    Some('r') => self.scan_raw_byte_string(),
+                    Some('r') => {
+                        let (start, end, hash_count) = self.scan_raw_string();
+                        let symbol = self.name_from_to(start, end);
+                        self.validate_raw_byte_str_escape(start, end);
+
+                        (token::ByteStrRaw(hash_count), symbol)
+                    }
                    _ => unreachable!(),  // Should have been a token::Ident above.
                };
                let suffix = self.scan_optional_raw_name();
@@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
                Ok(TokenKind::lit(token::Str, symbol, suffix))
            }
            'r' => {
-                let start_bpos = self.pos;
-                self.bump();
-                let mut hash_count: u16 = 0;
-                while self.ch_is('#') {
-                    if hash_count == 65535 {
-                        let bpos = self.next_pos;
-                        self.fatal_span_(start_bpos,
-                                         bpos,
-                                         "too many `#` symbols: raw strings may be \
-                                         delimited by up to 65535 `#` symbols").raise();
-                    }
-                    self.bump();
-                    hash_count += 1;
-                }
-
-                if self.is_eof() {
-                    self.fail_unterminated_raw_string(start_bpos, hash_count);
-                } else if !self.ch_is('"') {
-                    let last_bpos = self.pos;
-                    let curr_char = self.ch.unwrap();
-                    self.fatal_span_char(start_bpos,
-                                         last_bpos,
-                                         "found invalid character; only `#` is allowed \
-                                         in raw string delimitation",
-                                         curr_char).raise();
-                }
-                self.bump();
-                let content_start_bpos = self.pos;
-                let mut content_end_bpos;
-                let mut valid = true;
-                'outer: loop {
-                    if self.is_eof() {
-                        self.fail_unterminated_raw_string(start_bpos, hash_count);
-                    }
-                    // if self.ch_is('"') {
-                    // content_end_bpos = self.pos;
-                    // for _ in 0..hash_count {
-                    // self.bump();
-                    // if !self.ch_is('#') {
-                    // continue 'outer;
-                    let c = self.ch.unwrap();
-                    match c {
-                        '"' => {
-                            content_end_bpos = self.pos;
-                            for _ in 0..hash_count {
-                                self.bump();
-                                if !self.ch_is('#') {
-                                    continue 'outer;
-                                }
-                            }
-                            break;
-                        }
-                        '\r' => {
-                            if !self.nextch_is('\n') {
-                                let last_bpos = self.pos;
-                                self.err_span_(start_bpos,
-                                               last_bpos,
-                                               "bare CR not allowed in raw string, use \\r \
-                                                instead");
-                                valid = false;
-                            }
-                        }
-                        _ => (),
-                    }
-                    self.bump();
-                }
-
-                self.bump();
-                let symbol = if valid {
-                    self.name_from_to(content_start_bpos, content_end_bpos)
-                } else {
-                    Symbol::intern("??")
-                };
+                let (start, end, hash_count) = self.scan_raw_string();
+                let symbol = self.name_from_to(start, end);
+                self.validate_raw_str_escape(start, end);
                let suffix = self.scan_optional_raw_name();

                Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
@@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
        id
    }

-    fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
+    /// Scans a raw (byte) string, returning byte position range for `"<literal>"`
+    /// (including quotes) along with `#` character count in `(b)r##..."<literal>"##...`;
+    fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
        let start_bpos = self.pos;
        self.bump();
-        let mut hash_count = 0;
+        let mut hash_count: u16 = 0;
        while self.ch_is('#') {
            if hash_count == 65535 {
                let bpos = self.next_pos;
                self.fatal_span_(start_bpos,
                                 bpos,
-                                 "too many `#` symbols: raw byte strings may be \
+                                 "too many `#` symbols: raw strings may be \
                                 delimited by up to 65535 `#` symbols").raise();
            }
            self.bump();
@@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
        if self.is_eof() {
            self.fail_unterminated_raw_string(start_bpos, hash_count);
        } else if !self.ch_is('"') {
-            let pos = self.pos;
-            let ch = self.ch.unwrap();
+            let last_bpos = self.pos;
+            let curr_char = self.ch.unwrap();
            self.fatal_span_char(start_bpos,
-                                        pos,
-                                        "found invalid character; only `#` is allowed in raw \
-                                         string delimitation",
-                                        ch).raise();
+                                 last_bpos,
+                                 "found invalid character; only `#` is allowed \
+                                 in raw string delimitation",
+                                 curr_char).raise();
        }
        self.bump();
        let content_start_bpos = self.pos;
@@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
                    }
                    break;
                }
-                Some(c) => {
-                    if c > '\x7F' {
-                        let pos = self.pos;
-                        self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
-                    }
-                }
+                _ => (),
            }
            self.bump();
        }

        self.bump();

-        (token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
+        (content_start_bpos, content_end_bpos, hash_count)
    }

    fn validate_char_escape(&self, start_with_quote: BytePos) {
@@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
        });
    }

+    fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
+        self.with_str_from_to(content_start, content_end, |lit: &str| {
+            unescape::unescape_raw_str(lit, &mut |range, c| {
+                if let Err(err) = c {
+                    emit_unescape_error(
+                        &self.sess.span_diagnostic,
+                        lit,
+                        self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
+                        unescape::Mode::Str,
+                        range,
+                        err,
+                    )
+                }
+            })
+        });
+    }
+
+    fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
+        self.with_str_from_to(content_start, content_end, |lit: &str| {
+            unescape::unescape_raw_byte_str(lit, &mut |range, c| {
+                if let Err(err) = c {
+                    emit_unescape_error(
+                        &self.sess.span_diagnostic,
+                        lit,
+                        self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
+                        unescape::Mode::ByteStr,
+                        range,
+                        err,
+                    )
+                }
+            })
+        });
+    }
+
    fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
        self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
            unescape::unescape_byte_str(lit, &mut |range, c| {