auto merge of #14880 : SimonSapin/rust/byte-literals, r=alexcrichton

See #14646 (tracking issue) and rust-lang/rfcs#69.

This does not close the tracking issue, as the `bytes!()` macro still needs to be removed. The macro will be removed later, after a snapshot has been made with the changes in this PR, so that the new syntax can be used when bootstrapping the compiler.
This commit is contained in:
bors
2014-06-18 02:06:37 +00:00
24 changed files with 469 additions and 108 deletions

View File

@@ -636,6 +636,67 @@ impl<'a> StringReader<'a> {
}
}
/// Scans one (possibly escaped) character or byte inside a byte literal,
/// a non-raw byte string, a char literal, or a non-raw string literal.
///
/// `first_source_char` has already been consumed by the caller and `start`
/// is its position. `ascii_only` selects byte-literal rules: the `\u` and
/// `\U` escapes are not recognised and an unescaped character above
/// `'\x7F'` is reported as an error. `delim` is the quote character of the
/// enclosing literal (`'\''` or `'"'`).
///
/// Returns `None` only for a backslash followed by a newline when `delim`
/// is `'"'`; in that case the following whitespace is consumed and nothing
/// is contributed to the literal. Otherwise returns the decoded character.
/// When an error is reported through `err_span_char`, the offending
/// character is still returned.
fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char,
                     ascii_only: bool, delim: char) -> Option<char> {
    if first_source_char == '\\' {
        // Remember the escape character and its position before moving on.
        let escaped = self.curr;
        let escaped_pos = self.last_pos;
        self.bump();
        let e = match escaped {
            Some(e) => e,
            // EOF right after the backslash: the caller's check for the
            // closing delimiter reports it.
            None => return Some(first_source_char)
        };
        let decoded = match e {
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            '\\' => '\\',
            '\'' => '\'',
            '"' => '"',
            '0' => '\x00',
            'x' => self.scan_numeric_escape(2u, delim),
            'u' if !ascii_only => self.scan_numeric_escape(4u, delim),
            'U' if !ascii_only => self.scan_numeric_escape(8u, delim),
            '\n' if delim == '"' => {
                // Backslash-newline inside a string: skip the whitespace
                // that follows and contribute no character.
                self.consume_whitespace();
                return None
            },
            unknown => {
                let last_pos = self.last_pos;
                let msg = if ascii_only { "unknown byte escape" }
                          else { "unknown character escape" };
                self.err_span_char(escaped_pos, last_pos, msg, unknown);
                unknown
            }
        };
        return Some(decoded);
    }

    // These characters may not appear unescaped between single quotes.
    let must_be_escaped = delim == '\'' && match first_source_char {
        '\t' | '\n' | '\r' | '\'' => true,
        _ => false
    };
    if must_be_escaped {
        let last_pos = self.last_pos;
        let msg = if ascii_only { "byte constant must be escaped" }
                  else { "character constant must be escaped" };
        self.err_span_char(start, last_pos, msg, first_source_char);
    } else if ascii_only && first_source_char > '\x7F' {
        let last_pos = self.last_pos;
        self.err_span_char(
            start, last_pos,
            "byte constant must be ASCII. \
             Use a \\xHH escape for a non-ASCII byte", first_source_char);
    }
    Some(first_source_char)
}
fn binop(&mut self, op: token::BinOp) -> token::Token {
self.bump();
if self.curr_is('=') {
@@ -650,10 +711,15 @@ impl<'a> StringReader<'a> {
/// token, and updates the interner
fn next_token_inner(&mut self) -> token::Token {
let c = self.curr;
if ident_start(c) && !self.nextch_is('"') && !self.nextch_is('#') {
if ident_start(c) && match (c.unwrap(), self.nextch(), self.nextnextch()) {
// Note: r as in r" or r#" is part of a raw string literal,
// not an identifier, and is handled further down.
// b as in b' is part of a byte literal.
// They are not identifiers, and are handled further down.
('r', Some('"'), _) | ('r', Some('#'), _) |
('b', Some('"'), _) | ('b', Some('\''), _) |
('b', Some('r'), Some('"')) | ('b', Some('r'), Some('#')) => false,
_ => true
} {
let start = self.last_pos;
while ident_continue(self.curr) {
self.bump();
@@ -805,43 +871,7 @@ impl<'a> StringReader<'a> {
}
// Otherwise it is a character constant:
match c2 {
'\\' => {
// '\X' for some X must be a character constant:
let escaped = self.curr;
let escaped_pos = self.last_pos;
self.bump();
match escaped {
None => {}
Some(e) => {
c2 = match e {
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'"' => '"',
'0' => '\x00',
'x' => self.scan_numeric_escape(2u, '\''),
'u' => self.scan_numeric_escape(4u, '\''),
'U' => self.scan_numeric_escape(8u, '\''),
c2 => {
let last_bpos = self.last_pos;
self.err_span_char(escaped_pos, last_bpos,
"unknown character escape", c2);
c2
}
}
}
}
}
'\t' | '\n' | '\r' | '\'' => {
let last_bpos = self.last_pos;
self.err_span_char( start, last_bpos,
"character constant must be escaped", c2);
}
_ => {}
}
c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap();
if !self.curr_is('\'') {
let last_bpos = self.last_pos;
self.fatal_span_verbose(
@@ -854,6 +884,112 @@ impl<'a> StringReader<'a> {
self.bump(); // advance curr past token
return token::LIT_CHAR(c2);
}
'b' => {
self.bump();
return match self.curr {
Some('\'') => parse_byte(self),
Some('"') => parse_byte_string(self),
Some('r') => parse_raw_byte_string(self),
_ => unreachable!() // Should have been a token::IDENT above.
};
/// Lexes the remainder of a `b'x'` byte literal and returns its
/// `token::LIT_BYTE` token.
///
/// On entry the current character is the opening single quote; the
/// leading `b` has already been consumed by the caller.
fn parse_byte(self_: &mut StringReader) -> token::Token {
    // Step past the opening quote.
    self_.bump();
    let start = self_.last_pos;

    // At EOF substitute NUL; the closing-quote check below reports the error.
    let first = match self_.curr {
        Some(ch) => ch,
        None => '\x00'
    };
    self_.bump();
    let value = self_.scan_char_or_byte(start, first, /* ascii_only = */ true, '\'').unwrap();

    if !self_.curr_is('\'') {
        // Subtracting two bytes from `start` is sound because the two
        // characters before it are an ASCII `b` and an ASCII single quote.
        let last_pos = self_.last_pos;
        self_.fatal_span_verbose(
            start - BytePos(2), last_pos,
            "unterminated byte constant".to_string());
    }

    // Step past the closing quote.
    self_.bump();
    token::LIT_BYTE(value as u8)
}
/// Lexes the remainder of a `b"..."` byte string literal and returns its
/// `token::LIT_BINARY` token.
///
/// On entry the current character is the opening double quote; the
/// leading `b` has already been consumed by the caller.
fn parse_byte_string(self_: &mut StringReader) -> token::Token {
    // Step past the opening quote.
    self_.bump();
    let start = self_.last_pos;
    let mut bytes = Vec::new();

    loop {
        if self_.curr_is('"') {
            break;
        }
        if self_.is_eof() {
            let last_pos = self_.last_pos;
            self_.fatal_span(start, last_pos,
                             "unterminated double quote byte string");
        }

        let ch_start = self_.last_pos;
        let ch = self_.curr.unwrap();
        self_.bump();
        // `None` means an escaped newline, which contributes no byte.
        match self_.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"') {
            Some(decoded) => bytes.push(decoded as u8),
            None => {}
        }
    }

    // Step past the closing quote.
    self_.bump();
    token::LIT_BINARY(Rc::new(bytes))
}
/// Lexes the remainder of a raw byte string literal (`br"..."`,
/// `br#"..."#`, ...) and returns a `token::LIT_BINARY_RAW` token carrying
/// the content bytes and the number of `#` delimiters.
///
/// On entry the current character is the `r`; the leading `b` has already
/// been consumed by the caller.
fn parse_raw_byte_string(self_: &mut StringReader) -> token::Token {
// Position of the `r`; used as the start of the fatal error spans below.
let start_bpos = self_.last_pos;
// Step past the `r`.
self_.bump();
// Count the `#` characters that precede the opening quote.
let mut hash_count = 0u;
while self_.curr_is('#') {
self_.bump();
hash_count += 1;
}
// After the hashes the only acceptable character is the opening quote.
if self_.is_eof() {
let last_pos = self_.last_pos;
self_.fatal_span(start_bpos, last_pos, "unterminated raw string");
} else if !self_.curr_is('"') {
let last_pos = self_.last_pos;
let ch = self_.curr.unwrap();
self_.fatal_span_char(start_bpos, last_pos,
"only `#` is allowed in raw string delimitation; \
found illegal character",
ch);
}
// Step past the opening quote; the content starts here.
self_.bump();
let content_start_bpos = self_.last_pos;
// Reassigned every time a candidate closing quote is seen.
let mut content_end_bpos;
'outer: loop {
match self_.curr {
None => {
let last_pos = self_.last_pos;
self_.fatal_span(start_bpos, last_pos, "unterminated raw string")
},
Some('"') => {
// Candidate terminator: it only closes the literal if it is
// followed by exactly `hash_count` `#` characters.
content_end_bpos = self_.last_pos;
for _ in range(0, hash_count) {
self_.bump();
if !self_.curr_is('#') {
// Not the terminator after all. Resume scanning at the
// current character, skipping the `bump` at the bottom of
// the loop so that this character is examined as content.
continue 'outer;
}
}
break;
},
// Non-ASCII characters are reported, but scanning continues.
Some(c) => if c > '\x7F' {
let last_pos = self_.last_pos;
self_.err_span_char(
last_pos, last_pos, "raw byte string must be ASCII", c);
}
}
self_.bump();
}
// Step past the last character of the terminator: the final `#`, or the
// closing quote when `hash_count` is zero.
self_.bump();
// Copy the bytes of the string that `with_str_from_to` passes to the
// closure (presumably the source text between the two positions).
let bytes = self_.with_str_from_to(content_start_bpos,
content_end_bpos,
|s| s.as_bytes().to_owned());
return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count);
}
}
'"' => {
let mut accum_str = String::new();
let start_bpos = self.last_pos;
@@ -864,46 +1000,11 @@ impl<'a> StringReader<'a> {
self.fatal_span(start_bpos, last_bpos, "unterminated double quote string");
}
let ch_start = self.last_pos;
let ch = self.curr.unwrap();
self.bump();
match ch {
'\\' => {
if self.is_eof() {
let last_bpos = self.last_pos;
self.fatal_span(start_bpos, last_bpos,
"unterminated double quote string");
}
let escaped = self.curr.unwrap();
let escaped_pos = self.last_pos;
self.bump();
match escaped {
'n' => accum_str.push_char('\n'),
'r' => accum_str.push_char('\r'),
't' => accum_str.push_char('\t'),
'\\' => accum_str.push_char('\\'),
'\'' => accum_str.push_char('\''),
'"' => accum_str.push_char('"'),
'\n' => self.consume_whitespace(),
'0' => accum_str.push_char('\x00'),
'x' => {
accum_str.push_char(self.scan_numeric_escape(2u, '"'));
}
'u' => {
accum_str.push_char(self.scan_numeric_escape(4u, '"'));
}
'U' => {
accum_str.push_char(self.scan_numeric_escape(8u, '"'));
}
c2 => {
let last_bpos = self.last_pos;
self.err_span_char(escaped_pos, last_bpos,
"unknown string escape", c2);
}
}
}
_ => accum_str.push_char(ch)
}
self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"')
.map(|ch| accum_str.push_char(ch));
}
self.bump();
return token::LIT_STR(str_to_ident(accum_str.as_slice()));

View File

@@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod};
use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic};
use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl};
use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_};
use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar};
use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte, LitBinary};
use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet};
use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal};
use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability};
@@ -1512,6 +1512,7 @@ impl<'a> Parser<'a> {
// matches token_lit = LIT_INT | ...
pub fn lit_from_token(&mut self, tok: &token::Token) -> Lit_ {
match *tok {
token::LIT_BYTE(i) => LitByte(i),
token::LIT_CHAR(i) => LitChar(i),
token::LIT_INT(i, it) => LitInt(i, it),
token::LIT_UINT(u, ut) => LitUint(u, ut),
@@ -1528,6 +1529,8 @@ impl<'a> Parser<'a> {
token::LIT_STR_RAW(s, n) => {
LitStr(self.id_to_interned_str(s), ast::RawStr(n))
}
token::LIT_BINARY_RAW(ref v, _) |
token::LIT_BINARY(ref v) => LitBinary(v.clone()),
token::LPAREN => { self.expect(&token::RPAREN); LitNil },
_ => { self.unexpected_last(tok); }
}

View File

@@ -78,6 +78,7 @@ pub enum Token {
DOLLAR,
/* Literals */
LIT_BYTE(u8),
LIT_CHAR(char),
LIT_INT(i64, ast::IntTy),
LIT_UINT(u64, ast::UintTy),
@@ -86,6 +87,8 @@ pub enum Token {
LIT_FLOAT_UNSUFFIXED(ast::Ident),
LIT_STR(ast::Ident),
LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */
LIT_BINARY(Rc<Vec<u8>>),
LIT_BINARY_RAW(Rc<Vec<u8>>, uint), /* raw binary str delimited by n hash symbols */
/* Name components */
// an identifier contains an "is_mod_name" boolean,
@@ -193,6 +196,14 @@ pub fn to_str(t: &Token) -> String {
DOLLAR => "$".to_string(),
/* Literals */
LIT_BYTE(b) => {
let mut res = String::from_str("b'");
(b as char).escape_default(|c| {
res.push_char(c);
});
res.push_char('\'');
res
}
LIT_CHAR(c) => {
let mut res = String::from_str("'");
c.escape_default(|c| {
@@ -222,17 +233,26 @@ pub fn to_str(t: &Token) -> String {
body
}
LIT_STR(s) => {
(format!("\"{}\"", get_ident(s).get().escape_default())).to_string()
format!("\"{}\"", get_ident(s).get().escape_default())
}
LIT_STR_RAW(s, n) => {
(format!("r{delim}\"{string}\"{delim}",
delim="#".repeat(n), string=get_ident(s))).to_string()
format!("r{delim}\"{string}\"{delim}",
delim="#".repeat(n), string=get_ident(s))
}
LIT_BINARY(ref v) => {
format!(
"b\"{}\"",
v.iter().map(|&b| b as char).collect::<String>().escape_default())
}
LIT_BINARY_RAW(ref s, n) => {
format!("br{delim}\"{string}\"{delim}",
delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii())
}
/* Name components */
IDENT(s, _) => get_ident(s).get().to_string(),
LIFETIME(s) => {
(format!("{}", get_ident(s))).to_string()
format!("{}", get_ident(s))
}
UNDERSCORE => "_".to_string(),
@@ -273,6 +293,7 @@ pub fn can_begin_expr(t: &Token) -> bool {
IDENT(_, _) => true,
UNDERSCORE => true,
TILDE => true,
LIT_BYTE(_) => true,
LIT_CHAR(_) => true,
LIT_INT(_, _) => true,
LIT_UINT(_, _) => true,
@@ -281,6 +302,8 @@ pub fn can_begin_expr(t: &Token) -> bool {
LIT_FLOAT_UNSUFFIXED(_) => true,
LIT_STR(_) => true,
LIT_STR_RAW(_, _) => true,
LIT_BINARY(_) => true,
LIT_BINARY_RAW(_, _) => true,
POUND => true,
AT => true,
NOT => true,
@@ -311,6 +334,7 @@ pub fn close_delimiter_for(t: &Token) -> Option<Token> {
pub fn is_lit(t: &Token) -> bool {
match *t {
LIT_BYTE(_) => true,
LIT_CHAR(_) => true,
LIT_INT(_, _) => true,
LIT_UINT(_, _) => true,
@@ -319,6 +343,8 @@ pub fn is_lit(t: &Token) -> bool {
LIT_FLOAT_UNSUFFIXED(_) => true,
LIT_STR(_) => true,
LIT_STR_RAW(_, _) => true,
LIT_BINARY(_) => true,
LIT_BINARY_RAW(_, _) => true,
_ => false
}
}