auto merge of #14880 : SimonSapin/rust/byte-literals, r=alexcrichton

See #14646 (tracking issue) and rust-lang/rfcs#69. This does not close the tracking issue, as the `bytes!()` macro still needs to be removed. That removal will happen later, after a snapshot is made with the changes in this PR, so that the new syntax can be used when bootstrapping the compiler.
2014-06-18 02:06:37 +00:00
parent 5c81a186e9 3744d82851
commit d6736a1440
24 changed files with 469 additions and 108 deletions
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -636,6 +636,67 @@ impl<'a> StringReader<'a> {
        }
    }

+    /// Scan for a single (possibly escaped) byte or char
+    /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
+    /// `start` is the position of `first_source_char`, which is already consumed.
+    fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char,
+                         ascii_only: bool, delim: char) -> Option<char> {
+        match first_source_char {
+            '\\' => {
+                // '\X' for some X must be a character constant:
+                let escaped = self.curr;
+                let escaped_pos = self.last_pos;
+                self.bump();
+                match escaped {
+                    None => {},  // EOF here is an error that will be checked later.
+                    Some(e) => {
+                        return Some(match e {
+                            'n' => '\n',
+                            'r' => '\r',
+                            't' => '\t',
+                            '\\' => '\\',
+                            '\'' => '\'',
+                            '"' => '"',
+                            '0' => '\x00',
+                            'x' => self.scan_numeric_escape(2u, delim),
+                            'u' if !ascii_only => self.scan_numeric_escape(4u, delim),
+                            'U' if !ascii_only => self.scan_numeric_escape(8u, delim),
+                            '\n' if delim == '"' => {
+                                self.consume_whitespace();
+                                return None
+                            },
+                            c => {
+                                let last_pos = self.last_pos;
+                                self.err_span_char(
+                                    escaped_pos, last_pos,
+                                    if ascii_only { "unknown byte escape" }
+                                    else { "unknown character escape" },
+                                    c);
+                                c
+                            }
+                        })
+                    }
+                }
+            }
+            '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
+                let last_pos = self.last_pos;
+                self.err_span_char(
+                    start, last_pos,
+                    if ascii_only { "byte constant must be escaped" }
+                    else { "character constant must be escaped" },
+                    first_source_char);
+            }
+            _ => if ascii_only && first_source_char > '\x7F' {
+                let last_pos = self.last_pos;
+                self.err_span_char(
+                    start, last_pos,
+                    "byte constant must be ASCII. \
+                     Use a \\xHH escape for a non-ASCII byte", first_source_char);
+            }
+        }
+        Some(first_source_char)
+    }
+
    fn binop(&mut self, op: token::BinOp) -> token::Token {
        self.bump();
        if self.curr_is('=') {
@@ -650,10 +711,15 @@ impl<'a> StringReader<'a> {
    /// token, and updates the interner
    fn next_token_inner(&mut self) -> token::Token {
        let c = self.curr;
-        if ident_start(c) && !self.nextch_is('"') && !self.nextch_is('#') {
+        if ident_start(c) && match (c.unwrap(), self.nextch(), self.nextnextch()) {
            // Note: r as in r" or r#" is part of a raw string literal,
-            // not an identifier, and is handled further down.
-
+            // b as in b' is part of a byte literal.
+            // They are not identifiers, and are handled further down.
+           ('r', Some('"'), _) | ('r', Some('#'), _) |
+           ('b', Some('"'), _) | ('b', Some('\''), _) |
+           ('b', Some('r'), Some('"')) | ('b', Some('r'), Some('#')) => false,
+           _ => true
+        } {
            let start = self.last_pos;
            while ident_continue(self.curr) {
                self.bump();
@@ -805,43 +871,7 @@ impl<'a> StringReader<'a> {
            }

            // Otherwise it is a character constant:
-            match c2 {
-                '\\' => {
-                    // '\X' for some X must be a character constant:
-                    let escaped = self.curr;
-                    let escaped_pos = self.last_pos;
-                    self.bump();
-                    match escaped {
-                        None => {}
-                        Some(e) => {
-                            c2 = match e {
-                                'n' => '\n',
-                                'r' => '\r',
-                                't' => '\t',
-                                '\\' => '\\',
-                                '\'' => '\'',
-                                '"' => '"',
-                                '0' => '\x00',
-                                'x' => self.scan_numeric_escape(2u, '\''),
-                                'u' => self.scan_numeric_escape(4u, '\''),
-                                'U' => self.scan_numeric_escape(8u, '\''),
-                                c2 => {
-                                    let last_bpos = self.last_pos;
-                                    self.err_span_char(escaped_pos, last_bpos,
-                                                         "unknown character escape", c2);
-                                    c2
-                                }
-                            }
-                        }
-                    }
-                }
-                '\t' | '\n' | '\r' | '\'' => {
-                    let last_bpos = self.last_pos;
-                    self.err_span_char( start, last_bpos,
-                        "character constant must be escaped", c2);
-                }
-                _ => {}
-            }
+            c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap();
            if !self.curr_is('\'') {
                let last_bpos = self.last_pos;
                self.fatal_span_verbose(
@@ -854,6 +884,112 @@ impl<'a> StringReader<'a> {
            self.bump(); // advance curr past token
            return token::LIT_CHAR(c2);
          }
+          'b' => {
+            self.bump();
+            return match self.curr {
+                Some('\'') => parse_byte(self),
+                Some('"') => parse_byte_string(self),
+                Some('r') => parse_raw_byte_string(self),
+                _ => unreachable!()  // Should have been a token::IDENT above.
+            };
+
+            fn parse_byte(self_: &mut StringReader) -> token::Token {
+                self_.bump();
+                let start = self_.last_pos;
+
+                // the eof will be picked up by the final `'` check below
+                let mut c2 = self_.curr.unwrap_or('\x00');
+                self_.bump();
+
+                c2 = self_.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap();
+                if !self_.curr_is('\'') {
+                    // Byte offsetting here is okay because the two
+                    // characters before position `start` are an
+                    // ASCII 'b' and an ASCII single quote.
+                    let last_pos = self_.last_pos;
+                    self_.fatal_span_verbose(
+                        start - BytePos(2), last_pos,
+                        "unterminated byte constant".to_string());
+                }
+                self_.bump(); // advance curr past token
+                return token::LIT_BYTE(c2 as u8);
+            }
+
+            fn parse_byte_string(self_: &mut StringReader) -> token::Token {
+                self_.bump();
+                let start = self_.last_pos;
+                let mut value = Vec::new();
+                while !self_.curr_is('"') {
+                    if self_.is_eof() {
+                        let last_pos = self_.last_pos;
+                        self_.fatal_span(start, last_pos,
+                                         "unterminated double quote byte string");
+                    }
+
+                    let ch_start = self_.last_pos;
+                    let ch = self_.curr.unwrap();
+                    self_.bump();
+                    self_.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"')
+                        .map(|ch| value.push(ch as u8));
+                }
+                self_.bump();
+                return token::LIT_BINARY(Rc::new(value));
+            }
+
+            fn parse_raw_byte_string(self_: &mut StringReader) -> token::Token {
+                let start_bpos = self_.last_pos;
+                self_.bump();
+                let mut hash_count = 0u;
+                while self_.curr_is('#') {
+                    self_.bump();
+                    hash_count += 1;
+                }
+
+                if self_.is_eof() {
+                    let last_pos = self_.last_pos;
+                    self_.fatal_span(start_bpos, last_pos, "unterminated raw string");
+                } else if !self_.curr_is('"') {
+                    let last_pos = self_.last_pos;
+                    let ch = self_.curr.unwrap();
+                    self_.fatal_span_char(start_bpos, last_pos,
+                                    "only `#` is allowed in raw string delimitation; \
+                                     found illegal character",
+                                    ch);
+                }
+                self_.bump();
+                let content_start_bpos = self_.last_pos;
+                let mut content_end_bpos;
+                'outer: loop {
+                    match self_.curr {
+                        None => {
+                            let last_pos = self_.last_pos;
+                            self_.fatal_span(start_bpos, last_pos, "unterminated raw string")
+                        },
+                        Some('"') => {
+                            content_end_bpos = self_.last_pos;
+                            for _ in range(0, hash_count) {
+                                self_.bump();
+                                if !self_.curr_is('#') {
+                                    continue 'outer;
+                                }
+                            }
+                            break;
+                        },
+                        Some(c) => if c > '\x7F' {
+                            let last_pos = self_.last_pos;
+                            self_.err_span_char(
+                                last_pos, last_pos, "raw byte string must be ASCII", c);
+                        }
+                    }
+                    self_.bump();
+                }
+                self_.bump();
+                let bytes = self_.with_str_from_to(content_start_bpos,
+                                                   content_end_bpos,
+                                                   |s| s.as_bytes().to_owned());
+                return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count);
+            }
+          }
          '"' => {
            let mut accum_str = String::new();
            let start_bpos = self.last_pos;
@@ -864,46 +1000,11 @@ impl<'a> StringReader<'a> {
                    self.fatal_span(start_bpos, last_bpos, "unterminated double quote string");
                }

+                let ch_start = self.last_pos;
                let ch = self.curr.unwrap();
                self.bump();
-                match ch {
-                  '\\' => {
-                    if self.is_eof() {
-                        let last_bpos = self.last_pos;
-                        self.fatal_span(start_bpos, last_bpos,
-                               "unterminated double quote string");
-                    }
-
-                    let escaped = self.curr.unwrap();
-                    let escaped_pos = self.last_pos;
-                    self.bump();
-                    match escaped {
-                      'n' => accum_str.push_char('\n'),
-                      'r' => accum_str.push_char('\r'),
-                      't' => accum_str.push_char('\t'),
-                      '\\' => accum_str.push_char('\\'),
-                      '\'' => accum_str.push_char('\''),
-                      '"' => accum_str.push_char('"'),
-                      '\n' => self.consume_whitespace(),
-                      '0' => accum_str.push_char('\x00'),
-                      'x' => {
-                        accum_str.push_char(self.scan_numeric_escape(2u, '"'));
-                      }
-                      'u' => {
-                        accum_str.push_char(self.scan_numeric_escape(4u, '"'));
-                      }
-                      'U' => {
-                        accum_str.push_char(self.scan_numeric_escape(8u, '"'));
-                      }
-                      c2 => {
-                        let last_bpos = self.last_pos;
-                        self.err_span_char(escaped_pos, last_bpos,
-                                        "unknown string escape", c2);
-                      }
-                    }
-                  }
-                  _ => accum_str.push_char(ch)
-                }
+                self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"')
+                    .map(|ch| accum_str.push_char(ch));
            }
            self.bump();
            return token::LIT_STR(str_to_ident(accum_str.as_slice()));
--- a/src/libsyntax/parse/parser.rs
+++ b/src/libsyntax/parse/parser.rs
@@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod};
 use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic};
 use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl};
 use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_};
-use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar};
+use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte, LitBinary};
 use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet};
 use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal};
 use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability};
@@ -1512,6 +1512,7 @@ impl<'a> Parser<'a> {
    // matches token_lit = LIT_INT | ...
    pub fn lit_from_token(&mut self, tok: &token::Token) -> Lit_ {
        match *tok {
+            token::LIT_BYTE(i) => LitByte(i),
            token::LIT_CHAR(i) => LitChar(i),
            token::LIT_INT(i, it) => LitInt(i, it),
            token::LIT_UINT(u, ut) => LitUint(u, ut),
@@ -1528,6 +1529,8 @@ impl<'a> Parser<'a> {
            token::LIT_STR_RAW(s, n) => {
                LitStr(self.id_to_interned_str(s), ast::RawStr(n))
            }
+            token::LIT_BINARY_RAW(ref v, _) |
+            token::LIT_BINARY(ref v) => LitBinary(v.clone()),
            token::LPAREN => { self.expect(&token::RPAREN); LitNil },
            _ => { self.unexpected_last(tok); }
        }
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@@ -78,6 +78,7 @@ pub enum Token {
    DOLLAR,

    /* Literals */
+    LIT_BYTE(u8),
    LIT_CHAR(char),
    LIT_INT(i64, ast::IntTy),
    LIT_UINT(u64, ast::UintTy),
@@ -86,6 +87,8 @@ pub enum Token {
    LIT_FLOAT_UNSUFFIXED(ast::Ident),
    LIT_STR(ast::Ident),
    LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */
+    LIT_BINARY(Rc<Vec<u8>>),
+    LIT_BINARY_RAW(Rc<Vec<u8>>, uint), /* raw binary str delimited by n hash symbols */

    /* Name components */
    // an identifier contains an "is_mod_name" boolean,
@@ -193,6 +196,14 @@ pub fn to_str(t: &Token) -> String {
      DOLLAR => "$".to_string(),

      /* Literals */
+      LIT_BYTE(b) => {
+          let mut res = String::from_str("b'");
+          (b as char).escape_default(|c| {
+              res.push_char(c);
+          });
+          res.push_char('\'');
+          res
+      }
      LIT_CHAR(c) => {
          let mut res = String::from_str("'");
          c.escape_default(|c| {
@@ -222,17 +233,26 @@ pub fn to_str(t: &Token) -> String {
        body
      }
      LIT_STR(s) => {
-          (format!("\"{}\"", get_ident(s).get().escape_default())).to_string()
+          format!("\"{}\"", get_ident(s).get().escape_default())
      }
      LIT_STR_RAW(s, n) => {
-          (format!("r{delim}\"{string}\"{delim}",
-                  delim="#".repeat(n), string=get_ident(s))).to_string()
+        format!("r{delim}\"{string}\"{delim}",
+                 delim="#".repeat(n), string=get_ident(s))
+      }
+      LIT_BINARY(ref v) => {
+          format!(
+            "b\"{}\"",
+            v.iter().map(|&b| b as char).collect::<String>().escape_default())
+      }
+      LIT_BINARY_RAW(ref s, n) => {
+        format!("br{delim}\"{string}\"{delim}",
+                 delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii())
      }

      /* Name components */
      IDENT(s, _) => get_ident(s).get().to_string(),
      LIFETIME(s) => {
-          (format!("{}", get_ident(s))).to_string()
+          format!("{}", get_ident(s))
      }
      UNDERSCORE => "_".to_string(),

@@ -273,6 +293,7 @@ pub fn can_begin_expr(t: &Token) -> bool {
      IDENT(_, _) => true,
      UNDERSCORE => true,
      TILDE => true,
+      LIT_BYTE(_) => true,
      LIT_CHAR(_) => true,
      LIT_INT(_, _) => true,
      LIT_UINT(_, _) => true,
@@ -281,6 +302,8 @@ pub fn can_begin_expr(t: &Token) -> bool {
      LIT_FLOAT_UNSUFFIXED(_) => true,
      LIT_STR(_) => true,
      LIT_STR_RAW(_, _) => true,
+      LIT_BINARY(_) => true,
+      LIT_BINARY_RAW(_, _) => true,
      POUND => true,
      AT => true,
      NOT => true,
@@ -311,6 +334,7 @@ pub fn close_delimiter_for(t: &Token) -> Option<Token> {

 pub fn is_lit(t: &Token) -> bool {
    match *t {
+      LIT_BYTE(_) => true,
      LIT_CHAR(_) => true,
      LIT_INT(_, _) => true,
      LIT_UINT(_, _) => true,
@@ -319,6 +343,8 @@ pub fn is_lit(t: &Token) -> bool {
      LIT_FLOAT_UNSUFFIXED(_) => true,
      LIT_STR(_) => true,
      LIT_STR_RAW(_, _) => true,
+      LIT_BINARY(_) => true,
+      LIT_BINARY_RAW(_, _) => true,
      _ => false
    }
 }