Files
rust/src/libregex/parse.rs

1063 lines
37 KiB
Rust
Raw Normal View History

// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::char;
use std::cmp;
use std::fmt;
use std::iter;
use std::num;
use std::str;
/// Static data containing Unicode ranges for general categories and scripts.
Add libunicode; move unicode functions from core - created new crate, libunicode, below libstd - split Char trait into Char (libcore) and UnicodeChar (libunicode) - Unicode-aware functions now live in libunicode - is_alphabetic, is_XID_start, is_XID_continue, is_lowercase, is_uppercase, is_whitespace, is_alphanumeric, is_control, is_digit, to_uppercase, to_lowercase - added width method in UnicodeChar trait - determines printed width of character in columns, or None if it is a non-NULL control character - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise) - split StrSlice into StrSlice (libcore) and UnicodeStrSlice (libunicode) - functionality formerly in StrSlice that relied upon Unicode functionality from Char is now in UnicodeStrSlice - words, is_whitespace, is_alphanumeric, trim, trim_left, trim_right - also moved Words type alias into libunicode because words method is in UnicodeStrSlice - unified Unicode tables from libcollections, libcore, and libregex into libunicode - updated unicode.py in src/etc to generate aforementioned tables - generated new tables based on latest Unicode data - added UnicodeChar and UnicodeStrSlice traits to prelude - libunicode is now the collection point for the std::char module, combining the libunicode functionality with the Char functionality from libcore - thus, moved doc comment for char from core::char to unicode::char - libcollections remains the collection point for std::str The Unicode-aware functions that previously lived in the Char and StrSlice traits are no longer available to programs that only use libcore. 
To regain use of these methods, include the libunicode crate and use the UnicodeChar and/or UnicodeStrSlice traits: extern crate unicode; use unicode::UnicodeChar; use unicode::UnicodeStrSlice; use unicode::Words; // if you want to use the words() method NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude. closes #15224 [breaking-change]
2014-06-30 17:04:10 -04:00
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
static MAX_REPEAT: uint = 1000;
/// Error corresponds to something that can go wrong while parsing
/// a regular expression.
///
/// (Once an expression is compiled, it is not possible to produce an error
/// via searching, splitting or replacing.)
pub struct Error {
/// The *approximate* index of where the error occurred. This is the
/// parser's position in its vector of `char`s when the error was
/// created, so it counts characters rather than bytes.
pub pos: uint,
/// A human-readable message describing the error.
pub msg: String,
}
impl fmt::Show for Error {
    /// Writes the error as a single line containing the approximate
    /// position followed by the descriptive message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let (pos, msg) = (self.pos, &self.msg);
        write!(f, "Regex syntax error near position {}: {}", pos, msg)
    }
}
/// Represents the abstract syntax of a regular expression.
/// It is showable so that error messages resulting from a bug can provide
/// useful information.
/// It is cloneable so that expressions can be repeated for the counted
/// repetition feature. (No other copying is done.)
///
/// Note that this representation prevents one from reproducing the regex as
/// it was typed. (But it could be used to reproduce an equivalent regex.)
#[deriving(Show, Clone)]
pub enum Ast {
// The empty expression. Produced for an empty pattern and used as a
// no-op placeholder by counted repetition.
Nothing,
// A single character together with the flags in effect when it was
// parsed.
Literal(char, Flags),
// The '.' wildcard.
Dot(Flags),
// A character class given as inclusive (start, end) ranges. The flags
// may carry FLAG_NEGATED and/or FLAG_NOCASE.
Class(Vec<(char, char)>, Flags),
// Start assertion: '^' or '\A'.
Begin(Flags),
// End assertion: '$' or '\z'.
End(Flags),
// Word boundary: '\b', or '\B' when FLAG_NEGATED is set.
WordBoundary(Flags),
// A capture group: its index, its optional name and its sub-expression.
Capture(uint, Option<String>, Box<Ast>),
// Represent concatenation as a flat vector to avoid blowing the
// stack in the compiler.
Cat(Vec<Ast>),
// Alternation between two sub-expressions.
Alt(Box<Ast>, Box<Ast>),
// A repeated sub-expression with its operator and greediness.
Rep(Box<Ast>, Repeater, Greed),
}
// The kind of repetition applied to an expression.
#[deriving(Show, PartialEq, Clone)]
pub enum Repeater {
// '?'
ZeroOne,
// '*'
ZeroMore,
// '+'
OneMore,
}
// Whether a repetition is greedy or not. The parser produces `Ungreedy`
// when a repeat operator is followed by '?', and inverts the result when
// the swap-greed flag (U) is set.
#[deriving(Show, Clone)]
pub enum Greed {
Greedy,
Ungreedy,
}
impl Greed {
    /// Returns true exactly when this value is `Greedy`.
    pub fn is_greedy(&self) -> bool {
        match *self {
            Ungreedy => false,
            Greedy => true,
        }
    }

    // Returns the opposite greediness when `swapped` is true and `self`
    // unchanged otherwise.
    fn swap(self, swapped: bool) -> Greed {
        match (self, swapped) {
            (greed, false) => greed,
            (Greedy, true) => Ungreedy,
            (Ungreedy, true) => Greedy,
        }
    }
}
/// BuildAst is a regrettable type that represents intermediate state for
/// constructing an abstract syntax tree. Its central purpose is to facilitate
/// parsing groups and alternations while also maintaining a stack of flag
/// state.
#[deriving(Show)]
enum BuildAst {
// A finished piece of syntax tree.
Ast(Ast),
// An opening parenthesis: the flags that were active before the group
// was opened, the capture index (0 for a non-capturing group) and the
// capture name (empty when the group is unnamed).
Paren(Flags, uint, String), // '('
// An alternation bar.
Bar, // '|'
}
impl BuildAst {
fn paren(&self) -> bool {
match *self {
Paren(_, _, _) => true,
_ => false,
}
}
fn flags(&self) -> Flags {
match *self {
Paren(flags, _, _) => flags,
_ => fail!("Cannot get flags from {}", self),
}
}
fn capture(&self) -> Option<uint> {
match *self {
Paren(_, 0, _) => None,
Paren(_, c, _) => Some(c),
_ => fail!("Cannot get capture group from {}", self),
}
}
fn capture_name(&self) -> Option<String> {
match *self {
Paren(_, 0, _) => None,
Paren(_, _, ref name) => {
if name.len() == 0 {
None
} else {
Some(name.clone())
}
}
_ => fail!("Cannot get capture name from {}", self),
}
}
fn bar(&self) -> bool {
match *self {
Bar => true,
_ => false,
}
}
fn unwrap(self) -> Result<Ast, Error> {
match self {
Ast(x) => Ok(x),
_ => fail!("Tried to unwrap non-AST item: {}", self),
}
}
}
/// Flags represents all options that can be twiddled by a user in an
/// expression.
// Flags are stored as a bit set; each constant below is one bit.
pub type Flags = u8;
// No flags set.
pub static FLAG_EMPTY: u8 = 0;
// Set by the 'i' flag.
pub static FLAG_NOCASE: u8 = 1 << 0; // i
// Set by the 'm' flag.
pub static FLAG_MULTI: u8 = 1 << 1; // m
// Set by the 's' flag.
pub static FLAG_DOTNL: u8 = 1 << 2; // s
// Set by the 'U' flag; inverts the greediness of repeat operators.
pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U
// Not settable through flag syntax; the parser attaches it to negated
// character classes and to '\B'.
pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary
// Holds all state for a single parse of one pattern.
struct Parser<'a> {
// The input as a vector of `char`s (Unicode code points), so that it can
// be indexed by character position.
chars: Vec<char>,
// The index of the current character in the input.
chari: uint,
// The intermediate state representing the AST: finished sub-expressions
// interleaved with '(' and '|' markers.
stack: Vec<BuildAst>,
// The current set of flags.
flags: Flags,
// The total number of capture groups.
// Incremented each time an opening left paren is seen (assuming it is
// opening a capture group).
caps: uint,
// A set of all capture group names used only to detect duplicates.
names: Vec<String>,
}
/// Parses the pattern `s` into an abstract syntax tree.
///
/// Returns an `Error` describing the first syntax problem encountered.
pub fn parse(s: &str) -> Result<Ast, Error> {
    let mut parser = Parser {
        chars: s.chars().collect(),
        chari: 0,
        stack: vec!(),
        flags: FLAG_EMPTY,
        caps: 0,
        names: vec!(),
    };
    parser.parse()
}
impl<'a> Parser<'a> {
// Drives the whole parse. Walks the input one character at a time,
// dispatching on the current character and building up `self.stack`, and
// finally folds the stack into a single AST.
// Returns `Nothing` for an empty pattern, or an error for the first syntax
// problem encountered.
fn parse(&mut self) -> Result<Ast, Error> {
if self.chars.len() == 0 {
return Ok(Nothing);
}
loop {
let c = self.cur();
match c {
'?' | '*' | '+' => try!(self.push_repeater(c)),
'\\' => {
let ast = try!(self.parse_escape());
self.push(ast)
}
'{' => try!(self.parse_counted()),
// A '[' either starts an ASCII class such as [:alpha:] or a regular
// bracketed character class.
'[' => match self.try_parse_ascii() {
None => try!(self.parse_class()),
Some(class) => self.push(class),
},
'(' => {
if self.peek_is(1, '?') {
// '(?' introduces flags, a non-capturing group or a named capture.
try!(self.expect('?'))
try!(self.parse_group_opts())
} else {
// A plain capturing group: remember the flags that were active
// before the group so they can be restored at the closing ')'.
self.caps += 1;
self.stack.push(Paren(self.flags,
self.caps,
"".to_string()))
}
}
')' => {
// Concatenate everything since the nearest '(' or '|'.
let catfrom = try!(
self.pos_last(false, |x| x.paren() || x.bar()));
try!(self.concat(catfrom));
let altfrom = try!(self.pos_last(false, |x| x.paren()));
// Before we smush the alternates together and pop off the
// left paren, let's grab the old flags and see if we
// need a capture.
let (cap, cap_name, oldflags) = {
// `altfrom` is the index just after the '(' marker.
let paren = self.stack.get(altfrom-1);
(paren.capture(), paren.capture_name(), paren.flags())
};
try!(self.alternate(altfrom));
self.flags = oldflags;
// If this was a capture, pop what we just pushed in
// alternate and make it a capture.
if cap.is_some() {
let ast = try!(self.pop_ast());
self.push(Capture(cap.unwrap(), cap_name, box ast));
}
}
'|' => {
// Concatenate everything since the nearest '(' or '|' (or the start
// of the expression), then record the bar.
let catfrom = try!(
self.pos_last(true, |x| x.paren() || x.bar()));
try!(self.concat(catfrom));
self.stack.push(Bar);
}
_ => try!(self.push_literal(c)),
}
if !self.next_char() {
break
}
}
// Try to improve error handling. At this point, there should be
// no remaining open parens.
if self.stack.iter().any(|x| x.paren()) {
return self.err("Unclosed parenthesis.")
}
// Fold whatever remains: concatenate the final branch, then combine all
// top-level branches into alternations.
let catfrom = try!(self.pos_last(true, |x| x.bar()));
try!(self.concat(catfrom));
try!(self.alternate(0));
assert!(self.stack.len() == 1);
self.pop_ast()
}
// Advances to the next character, returning an error that mentions
// `expected` if doing so runs off the end of the input.
fn noteof(&mut self, expected: &str) -> Result<(), Error> {
    if self.next_char() {
        return Ok(())
    }
    let msg = format!("Expected {} but got EOF.", expected);
    self.err(msg.as_slice())
}
// Advances to the next character and checks that it equals `expected`.
// Returns an error if the input ends or a different character is found.
fn expect(&mut self, expected: char) -> Result<(), Error> {
    if !self.next_char() {
        let msg = format!("Expected '{}' but got EOF.", expected);
        return self.err(msg.as_slice())
    }
    let found = self.cur();
    if found == expected {
        Ok(())
    } else {
        let msg = format!("Expected '{}' but got '{}'.", expected, found);
        self.err(msg.as_slice())
    }
}
// Moves the cursor forward by one character. Returns true if the cursor
// still points inside the input afterwards. The cursor is advanced even
// when this returns false.
fn next_char(&mut self) -> bool {
    let advanced = self.chari + 1;
    self.chari = advanced;
    advanced < self.chars.len()
}
fn pop_ast(&mut self) -> Result<Ast, Error> {
match self.stack.pop().unwrap().unwrap() {
Err(e) => Err(e),
Ok(ast) => Ok(ast),
}
}
// Wraps `ast` as a stack item and pushes it on top of the stack.
fn push(&mut self, ast: Ast) {
    let item = Ast(ast);
    self.stack.push(item)
}
// Applies the repeat operator `c` ('?', '*' or '+') to the expression on
// top of the stack, replacing it with a `Rep` node.
// Assumes that the operator is the current character. A directly
// following '?' is consumed as the ungreedy marker.
fn push_repeater(&mut self, c: char) -> Result<(), Error> {
    if self.stack.len() == 0 {
        return self.err(
            "A repeat operator must be preceded by a valid expression.")
    }
    let rep: Repeater = match c {
        '+' => OneMore,
        '*' => ZeroMore,
        '?' => ZeroOne,
        _ => fail!("Not a valid repeater operator."),
    };
    // A following '?' is fine (it means ungreedy), but '*' or '+' is not.
    if self.peek_is(1, '*') || self.peek_is(1, '+') {
        return self.err("Double repeat operators are not supported.")
    }
    let operand = try!(self.pop_ast());
    let zero_width = match operand {
        Begin(_) | End(_) | WordBoundary(_) => true,
        _ => false,
    };
    if zero_width {
        return self.err(
            "Repeat arguments cannot be empty width assertions.")
    }
    let greed = try!(self.get_next_greedy());
    self.push(Rep(box operand, rep, greed));
    Ok(())
}
// Pushes the AST for a character with no dedicated handling in the main
// loop: '.', '^' and '$' get their own nodes, and everything else becomes
// a literal. The current flags are attached to the node.
fn push_literal(&mut self, c: char) -> Result<(), Error> {
    let node = match c {
        '.' => Dot(self.flags),
        '^' => Begin(self.flags),
        '$' => End(self.flags),
        other => Literal(other, self.flags),
    };
    self.push(node);
    Ok(())
}
// Parses all forms of character classes.
// Assumes that '[' is the current character.
fn parse_class(&mut self) -> Result<(), Error> {
let negated =
if self.peek_is(1, '^') {
try!(self.expect('^'))
FLAG_NEGATED
} else {
FLAG_EMPTY
};
let mut ranges: Vec<(char, char)> = vec!();
let mut alts: Vec<Ast> = vec!();
if self.peek_is(1, ']') {
try!(self.expect(']'))
ranges.push((']', ']'))
}
while self.peek_is(1, '-') {
try!(self.expect('-'))
ranges.push(('-', '-'))
}
loop {
try!(self.noteof("a closing ']' or a non-empty character class)"))
let mut c = self.cur();
match c {
'[' =>
match self.try_parse_ascii() {
Some(Class(asciis, flags)) => {
alts.push(Class(asciis, flags ^ negated));
continue
}
Some(ast) =>
fail!("Expected Class AST but got '{}'", ast),
// Just drop down and try to add as a regular character.
None => {},
},
'\\' => {
match try!(self.parse_escape()) {
Class(asciis, flags) => {
alts.push(Class(asciis, flags ^ negated));
continue
}
Literal(c2, _) => c = c2, // process below
Begin(_) | End(_) | WordBoundary(_) =>
return self.err(
"\\A, \\z, \\b and \\B are not valid escape \
sequences inside a character class."),
ast => fail!("Unexpected AST item '{}'", ast),
}
}
_ => {},
}
match c {
']' => {
if ranges.len() > 0 {
let flags = negated | (self.flags & FLAG_NOCASE);
let mut ast = Class(combine_ranges(ranges), flags);
for alt in alts.move_iter() {
ast = Alt(box alt, box ast)
}
self.push(ast);
} else if alts.len() > 0 {
let mut ast = alts.pop().unwrap();
for alt in alts.move_iter() {
ast = Alt(box alt, box ast)
}
self.push(ast);
}
return Ok(())
}
c => {
if self.peek_is(1, '-') && !self.peek_is(2, ']') {
try!(self.expect('-'))
try!(self.noteof("not a ']'"))
let c2 = self.cur();
if c2 < c {
return self.err(format!("Invalid character class \
range '{}-{}'",
c,
c2).as_slice())
}
ranges.push((c, self.cur()))
} else {
ranges.push((c, c))
}
}
}
}
}
// Attempts to read an ASCII character class written as [:name:] or
// [:^name:] (negated).
// On success, returns the corresponding class AST and leaves the parser on
// the final ']' character.
// On failure, returns None and leaves the parser state untouched.
// Assumes that '[' is the current character.
fn try_parse_ascii(&mut self) -> Option<Ast> {
    if !self.peek_is(1, ':') {
        return None
    }
    let closer = match self.pos(']') {
        None => return None,
        Some(i) => i,
    };
    // The class must end in ":]" and have room for at least one character
    // between the colons.
    if *self.chars.get(closer-1) != ':' || closer - self.chari <= 3 {
        return None
    }
    let (negated, name_start) =
        if self.peek_is(2, '^') {
            (FLAG_NEGATED, self.chari + 3)
        } else {
            (FLAG_EMPTY, self.chari + 2)
        };
    let name = self.slice(name_start, closer - 1);
    let ranges = match find_class(ASCII_CLASSES, name.as_slice()) {
        Some(ranges) => ranges,
        None => return None,
    };
    self.chari = closer;
    let flags = negated | (self.flags & FLAG_NOCASE);
    Some(Class(combine_ranges(ranges), flags))
}
// Parses counted repetition. Supports:
// {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
// Assumes that '{' is the current character.
// Returns either an error or moves the parser to the final '}' character.
// (Or the '?' character if not greedy.)
fn parse_counted(&mut self) -> Result<(), Error> {
// Scan until the closing '}' and grab the stuff in {}.
let start = self.chari;
let closer =
match self.pos('}') {
Some(i) => i,
None => {
return self.err(format!("No closing brace for counted \
repetition starting at position \
{}.",
start).as_slice())
}
};
self.chari = closer;
let greed = try!(self.get_next_greedy());
let inner = str::from_chars(
self.chars.as_slice().slice(start + 1, closer));
// Parse the min and max values from the regex.
let (mut min, mut max): (uint, Option<uint>);
if !inner.as_slice().contains(",") {
min = try!(self.parse_uint(inner.as_slice()));
max = Some(min);
} else {
let pieces: Vec<&str> = inner.as_slice().splitn(',', 1).collect();
let (smin, smax) = (*pieces.get(0), *pieces.get(1));
if smin.len() == 0 {
return self.err("Max repetitions cannot be specified \
without min repetitions.")
}
min = try!(self.parse_uint(smin));
max =
if smax.len() == 0 {
None
} else {
Some(try!(self.parse_uint(smax)))
};
}
// Do some bounds checking and make sure max >= min.
if min > MAX_REPEAT {
return self.err(format!(
"{} exceeds maximum allowed repetitions ({})",
min, MAX_REPEAT).as_slice());
}
if max.is_some() {
let m = max.unwrap();
if m > MAX_REPEAT {
return self.err(format!(
"{} exceeds maximum allowed repetitions ({})",
m, MAX_REPEAT).as_slice());
}
if m < min {
return self.err(format!(
"Max repetitions ({}) cannot be smaller than min \
repetitions ({}).", m, min).as_slice());
}
}
// Now manipulate the AST be repeating elements.
if max.is_none() {
// Require N copies of what's on the stack and then repeat it.
let ast = try!(self.pop_ast());
for _ in iter::range(0, min) {
self.push(ast.clone())
}
self.push(Rep(box ast, ZeroMore, greed));
} else {
// Require N copies of what's on the stack and then repeat it
// up to M times optionally.
let ast = try!(self.pop_ast());
for _ in iter::range(0, min) {
self.push(ast.clone())
}
if max.is_some() {
for _ in iter::range(min, max.unwrap()) {
self.push(Rep(box ast.clone(), ZeroOne, greed))
}
}
// It's possible that we popped something off the stack but
// never put anything back on it. To keep things simple, add
// a no-op expression.
if min == 0 && (max.is_none() || max == Some(0)) {
self.push(Nothing)
}
}
Ok(())
}
// Parses all escape sequences.
// Assumes that '\' is the current character.
fn parse_escape(&mut self) -> Result<Ast, Error> {
try!(self.noteof("an escape sequence following a '\\'"))
let c = self.cur();
if is_punct(c) {
return Ok(Literal(c, FLAG_EMPTY))
}
match c {
'a' => Ok(Literal('\x07', FLAG_EMPTY)),
'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
't' => Ok(Literal('\t', FLAG_EMPTY)),
'n' => Ok(Literal('\n', FLAG_EMPTY)),
'r' => Ok(Literal('\r', FLAG_EMPTY)),
'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
'A' => Ok(Begin(FLAG_EMPTY)),
'z' => Ok(End(FLAG_EMPTY)),
'b' => Ok(WordBoundary(FLAG_EMPTY)),
'B' => Ok(WordBoundary(FLAG_NEGATED)),
'0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
'x' => Ok(try!(self.parse_hex())),
'p' | 'P' => Ok(try!(self.parse_unicode_name())),
'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
let ranges = perl_unicode_class(c);
let mut flags = self.flags & FLAG_NOCASE;
if c.is_uppercase() { flags |= FLAG_NEGATED }
Ok(Class(ranges, flags))
}
_ => {
self.err(format!("Invalid escape sequence '\\\\{}'",
c).as_slice())
}
}
}
// Parses a unicode character class name, either of the form \pF where
// F is a one letter unicode class name or of the form \p{name} where
// name is the unicode class name.
// Assumes that \p or \P has been read (and 'p' or 'P' is the current
// character).
// On success the parser is left on the last character of the escape (the
// closing '}' or the single class letter). '\P' yields a negated class.
// Returns an error for a missing '}', an empty name ("\p{}"), a missing
// single letter, or an unknown class name.
fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
    let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
    let name: String;
    if self.peek_is(1, '{') {
        try!(self.expect('{'));
        let closer =
            match self.pos('}') {
                Some(i) => i,
                None => return self.err(format!(
                    "Missing '}}' for unclosed '{{' at position {}",
                    self.chari).as_slice()),
            };
        // The parser is on '{', so the name occupies the characters
        // strictly between `self.chari` and `closer`. It is empty exactly
        // when the '}' immediately follows the '{'.
        if closer == self.chari + 1 {
            return self.err("No Unicode class name found.")
        }
        name = self.slice(self.chari + 1, closer);
        self.chari = closer;
    } else {
        if self.chari + 1 >= self.chars.len() {
            return self.err("No single letter Unicode class name found.")
        }
        name = self.slice(self.chari + 1, self.chari + 2);
        self.chari += 1;
    }
    match find_class(UNICODE_CLASSES, name.as_slice()) {
        None => {
            self.err(format!("Could not find Unicode class '{}'",
                             name).as_slice())
        }
        Some(ranges) => {
            Ok(Class(ranges, negated | (self.flags & FLAG_NOCASE)))
        }
    }
}
// Parses an octal escape of one to three digits into a literal.
// Assumes that \n has been read, where n is the first digit (and is the
// current character). On return the parser is on the last digit consumed.
fn parse_octal(&mut self) -> Result<Ast, Error> {
    let start = self.chari;
    let second = self.peek(1);
    let third = self.peek(2);
    // Number of digits in the escape; the first is already known.
    let mut len = 1;
    if second >= Some('0') && second <= Some('7') {
        try!(self.noteof("expected octal character in [0-7]"));
        len = 2;
        if third >= Some('0') && third <= Some('7') {
            try!(self.noteof("expected octal character in [0-7]"));
            len = 3;
        }
    }
    let digits = self.slice(start, start + len);
    match num::from_str_radix::<u32>(digits.as_slice(), 8) {
        None => {
            let msg = format!("Could not parse '{}' as octal number.",
                              digits);
            self.err(msg.as_slice())
        }
        Some(n) => {
            let c = try!(self.char_from_u32(n));
            Ok(Literal(c, FLAG_EMPTY))
        }
    }
}
// Parse a hex number. Either exactly two digits or anything in {}.
// Assumes that \x has been read (and 'x' is the current character).
// On success, returns the literal for the decoded character and leaves
// the parser on the last character of the escape: the second digit for
// the two-digit form, or the closing '}' for the braced form.
fn parse_hex(&mut self) -> Result<Ast, Error> {
    if !self.peek_is(1, '{') {
        // Two-digit form: step onto the first digit, which is where
        // `parse_hex_two` expects the parser to be. (Requiring a '{' here
        // could never succeed and made this form unusable.)
        try!(self.noteof("a hexadecimal digit following '\\x'"));
        return self.parse_hex_two()
    }
    // Braced form: the digits start after the 'x' and the '{'.
    let start = self.chari + 2;
    let closer =
        match self.pos('}') {
            None => {
                return self.err(format!("Missing '}}' for unclosed \
                                         '{{' at position {}",
                                        start).as_slice())
            }
            Some(i) => i,
        };
    self.chari = closer;
    self.parse_hex_digits(self.slice(start, closer).as_slice())
}
// Parses a hex escape made of exactly two digits.
// Assumes that \xn has been read, where n is the first digit and is the
// current character.
// After return, parser will point at the second digit.
fn parse_hex_two(&mut self) -> Result<Ast, Error> {
    let first = self.chari;
    // Text from the backslash to the end of the input, used only for the
    // error message when the second digit is missing.
    let bad = self.slice(first - 2, self.chars.len());
    let expected = format!("Invalid hex escape sequence '{}'", bad);
    try!(self.noteof(expected.as_slice()));
    let digits = self.slice(first, first + 2);
    self.parse_hex_digits(digits.as_slice())
}
// Interprets `s` as a hexadecimal number and returns the literal for the
// character with that code point. Returns an error if `s` is not a valid
// hex number or the number is not a valid character.
fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
    let n = match num::from_str_radix::<u32>(s, 16) {
        Some(n) => n,
        None => {
            let msg = format!("Could not parse '{}' as hex number.", s);
            return self.err(msg.as_slice())
        }
    };
    let c = try!(self.char_from_u32(n));
    Ok(Literal(c, FLAG_EMPTY))
}
// Parses a named capture.
// Assumes that '(?P<' has been consumed and that the current character
// is '<'.
// When done, parser will be at the closing '>' character.
fn parse_named_capture(&mut self) -> Result<(), Error> {
try!(self.noteof("a capture name"))
let closer =
match self.pos('>') {
Some(i) => i,
None => return self.err("Capture name must end with '>'."),
};
if closer - self.chari == 0 {
return self.err("Capture names must have at least 1 character.")
}
let name = self.slice(self.chari, closer);
if !name.as_slice().chars().all(is_valid_cap) {
return self.err(
"Capture names can only have underscores, letters and digits.")
}
if self.names.contains(&name) {
return self.err(format!("Duplicate capture group name '{}'.",
name).as_slice())
}
self.names.push(name.clone());
self.chari = closer;
self.caps += 1;
self.stack.push(Paren(self.flags, self.caps, name));
Ok(())
}
// Parses non-capture groups and options.
// Assumes that '(?' has already been consumed and '?' is the current
// character.
//
// Handles three forms:
//   (?P<name>  a named capture (delegated to `parse_named_capture`),
//   (?flags)   changes the flags for the rest of the current group,
//   (?flags:   opens a non-capturing group with the given flags.
// `flags` is a sequence of the letters i, m, s and U. Flags listed before
// an optional '-' are turned on and flags listed after it are turned off;
// all other flags keep their current value. On success the parser is left
// on the terminating ')' or ':'.
fn parse_group_opts(&mut self) -> Result<(), Error> {
    if self.peek_is(1, 'P') && self.peek_is(2, '<') {
        try!(self.expect('P'));
        try!(self.expect('<'));
        return self.parse_named_capture()
    }
    let start = self.chari;
    let mut flags = self.flags;
    // True once a '-' has been seen: subsequent flags are cleared rather
    // than set.
    let mut negating = false;
    // Whether a flag has been seen since the start or since the '-'.
    let mut saw_flag = false;
    loop {
        try!(self.noteof("expected non-empty set of flags or closing ')'"));
        let c = self.cur();
        match c {
            'i' | 'm' | 's' | 'U' => {
                let bit = match c {
                    'i' => FLAG_NOCASE,
                    'm' => FLAG_MULTI,
                    's' => FLAG_DOTNL,
                    _ => FLAG_SWAP_GREED,
                };
                flags = if negating { flags & !bit } else { flags | bit };
                saw_flag = true;
            }
            '-' => {
                if negating {
                    return self.err(format!(
                        "Cannot negate flags twice in '{}'.",
                        self.slice(start, self.chari + 1)).as_slice())
                }
                negating = true;
                saw_flag = false;
            }
            ':' | ')' => {
                if negating && !saw_flag {
                    return self.err(format!(
                        "A valid flag does not follow negation in '{}'",
                        self.slice(start, self.chari + 1)).as_slice())
                }
                if c == ':' {
                    // Save the old flags with the opening paren.
                    self.stack.push(Paren(self.flags, 0, "".to_string()));
                }
                self.flags = flags;
                return Ok(())
            }
            _ => return self.err(format!(
                "Unrecognized flag '{}'.", c).as_slice()),
        }
    }
}
// Determines the greediness of the repetition just parsed by peeking at
// the next character. A following '?' means ungreedy and is consumed.
// The result is inverted when the swap-greed flag (U) is active.
fn get_next_greedy(&mut self) -> Result<Greed, Error> {
    let greed =
        if self.peek_is(1, '?') {
            try!(self.expect('?'));
            Ungreedy
        } else {
            Greedy
        };
    let swapped = (self.flags & FLAG_SWAP_GREED) > 0;
    Ok(greed.swap(swapped))
}
// Searches the stack from the top down for the first item for which
// `pred` returns true and returns the index just *after* that item, i.e.
// the index of the first element above the '(' or '|' that was found.
// If nothing matches, the result depends on `allow_start`: when it's
// true, `0` is returned; otherwise an error is returned.
// Generally, `allow_start` is only true when you're *not* expecting an
// opening parenthesis.
fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool)
           -> Result<uint, Error> {
    match self.stack.iter().rev().position(pred) {
        // `from_top` counts from the top of the stack, so convert it to
        // an index from the bottom.
        Some(from_top) => Ok(self.stack.len() - from_top),
        None if allow_start => Ok(0),
        None => self.err("No matching opening parenthesis."),
    }
}
// concat starts at `from` in the parser's stack and concatenates all
// expressions up to the top of the stack. The resulting concatenation is
// then pushed on to the stack.
// Usually `from` corresponds to the position of an opening parenthesis,
// a '|' (alternation) or the start of the entire expression.
fn concat(&mut self, from: uint) -> Result<(), Error> {
let ast = try!(self.build_from(from, concat_flatten));
self.push(ast);
Ok(())
}
// concat starts at `from` in the parser's stack and alternates all
// expressions up to the top of the stack. The resulting alternation is
// then pushed on to the stack.
// Usually `from` corresponds to the position of an opening parenthesis
// or the start of the entire expression.
// This will also drop any opening parens or alternation bars found in
// the intermediate AST.
fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
// Unlike in the concatenation case, we want 'build_from' to continue
// all the way to the opening left paren (so it will be popped off and
// thrown away). But be careful with overflow---we can't count on the
// open paren to be there.
if from > 0 { from = from - 1}
let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
self.push(ast);
Ok(())
}
// build_from pops every stack element from the top down to index 'from'
// and folds the AST elements together with 'mk', which receives the lower
// element first and the combination so far second. Elements that are not
// ASTs ('(' and '|' markers) are popped and ignored.
// Returns an error if there is nothing at or above 'from'.
fn build_from(&mut self, from: uint, mk: |Ast, Ast| -> Ast)
             -> Result<Ast, Error> {
    if self.stack.len() <= from {
        return self.err("Empty group or alternate not allowed.")
    }
    let mut combined = try!(self.pop_ast());
    while self.stack.len() > from {
        match self.stack.pop().unwrap() {
            Ast(lower) => combined = mk(lower, combined),
            _ => {},
        }
    }
    Ok(combined)
}
// Parses `s` as an unsigned integer, returning a parse error located at
// the current position if it is not one.
fn parse_uint(&self, s: &str) -> Result<uint, Error> {
    match from_str::<uint>(s) {
        None => {
            let msg = format!("Expected an unsigned integer but got '{}'.",
                              s);
            self.err(msg.as_slice())
        }
        Some(n) => Ok(n),
    }
}
// Converts the number `n` to the character with that code point,
// returning a parse error if `n` is not a valid character.
fn char_from_u32(&self, n: u32) -> Result<char, Error> {
    match char::from_u32(n) {
        None => {
            let msg = format!("Could not decode '{}' to unicode character.",
                              n);
            self.err(msg.as_slice())
        }
        Some(c) => Ok(c),
    }
}
fn pos(&self, c: char) -> Option<uint> {
self.chars.iter()
.skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
}
fn err<T>(&self, msg: &str) -> Result<T, Error> {
Err(Error {
pos: self.chari,
msg: msg.to_string(),
})
}
// Returns the character `offset` positions after the current one without
// moving the parser, or None if that is past the end of the input.
fn peek(&self, offset: uint) -> Option<char> {
    let idx = self.chari + offset;
    if idx < self.chars.len() {
        Some(*self.chars.get(idx))
    } else {
        None
    }
}
// True if the character `offset` positions ahead exists and equals `is`.
fn peek_is(&self, offset: uint, is: char) -> bool {
    match self.peek(offset) {
        Some(c) => c == is,
        None => false,
    }
}
// Returns the character at the current position.
fn cur(&self) -> char {
    let idx = self.chari;
    *self.chars.get(idx)
}
// Returns the characters in the index range [start, end) of the input as
// an owned string.
fn slice(&self, start: uint, end: uint) -> String {
    let window = self.chars.as_slice().slice(start, end);
    str::from_chars(window).to_string()
}
}
// Given an unordered collection of character ranges, combine_ranges returns
// an ordered sequence of character ranges where no two ranges overlap or
// touch. They are ordered from least to greatest (using start position).
//
// The ranges are sorted by start position and then merged in one linear
// pass. Because of the sort, a range can only ever need merging with the
// range accumulated immediately before it, so chains of overlapping
// ranges always collapse into a single range.
//
// Fails if any range has its start greater than its end.
fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
    // Returns true iff the two character classes overlap or share a boundary.
    // e.g., ('a', 'g') and ('h', 'm') would return true.
    fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
        cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
    }

    let mut sorted = unordered;
    sorted.sort();

    let mut ordered: Vec<(char, char)> = Vec::with_capacity(sorted.len());
    // The range currently being grown; it is flushed to `ordered` as soon
    // as a range that cannot be merged with it shows up.
    let mut current: Option<(char, char)> = None;
    for &(us, ue) in sorted.iter() {
        assert!(us <= ue);
        current = match current {
            None => Some((us, ue)),
            Some((cs, ce)) => {
                if should_merge((cs, ce), (us, ue)) {
                    // `cs <= us` thanks to the sort, so only the end of
                    // the range can grow.
                    Some((cs, cmp::max(ce, ue)))
                } else {
                    ordered.push((cs, ce));
                    Some((us, ue))
                }
            }
        };
    }
    match current {
        Some(last) => ordered.push(last),
        None => {}
    }
    ordered
}
// Constructs a Unicode friendly Perl character class from \d, \s or \w
// (or any of their negated forms). Note that this does not handle negation.
fn perl_unicode_class(which: char) -> Vec<(char, char)> {
match which.to_lowercase() {
'd' => Vec::from_slice(PERLD),
's' => Vec::from_slice(PERLS),
'w' => Vec::from_slice(PERLW),
_ => unreachable!(),
}
}
// Concatenates `x` followed by `y` into a single `Cat` expression. If
// either side is already a `Cat`, its elements are spliced in, so a `Cat`
// expression is never a direct child of another `Cat` expression.
fn concat_flatten(x: Ast, y: Ast) -> Ast {
    match x {
        Cat(mut left) => {
            match y {
                Cat(right) => left.push_all_move(right),
                single => left.push(single),
            }
            Cat(left)
        }
        head => match y {
            Cat(mut rest) => {
                rest.unshift(head);
                Cat(rest)
            }
            tail => Cat(vec!(head, tail)),
        },
    }
}
/// Returns true if `c` is one of the regex punctuation characters
/// `\ . + * ? ( ) | [ ] { } ^ $`. The parser turns such a character into a
/// literal when it follows a backslash.
pub fn is_punct(c: char) -> bool {
    let operator = c == '\\' || c == '.' || c == '+' || c == '*'
                || c == '?' || c == '|' || c == '^' || c == '$';
    let bracket = c == '(' || c == ')' || c == '[' || c == ']'
               || c == '{' || c == '}';
    operator || bracket
}
// Returns true if `c` may appear in a capture group name: an ASCII
// letter, an ASCII digit or an underscore.
fn is_valid_cap(c: char) -> bool {
    let digit = '0' <= c && c <= '9';
    let lower = 'a' <= c && c <= 'z';
    let upper = 'A' <= c && c <= 'Z';
    digit || lower || upper || c == '_'
}
// Looks up `name` in `classes` with a binary search and returns a copy of
// the ranges of the matching class, or None if there is no class with
// that name. `classes` must be sorted by name.
fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
    classes.bsearch(|&(s, _)| s.cmp(&name))
           .map(|i| Vec::from_slice(classes[i].val1()))
}
// A character class as a static table of inclusive (start, end) ranges.
type Class = &'static [(char, char)];
// A static table of (name, class) pairs. `find_class` binary searches
// these tables by name, so they must be sorted by name.
type NamedClasses = &'static [(&'static str, Class)];
// The ASCII character classes that can be written as [:name:].
static ASCII_CLASSES: NamedClasses = &[
// Classes must be in alphabetical order so that bsearch works.
// [:alnum:] alphanumeric (== [0-9A-Za-z])
// [:alpha:] alphabetic (== [A-Za-z])
// [:ascii:] ASCII (== [\x00-\x7F])
// [:blank:] blank (== [\t ])
// [:cntrl:] control (== [\x00-\x1F\x7F])
// [:digit:] digits (== [0-9])
// [:graph:] graphical (== [!-~])
// [:lower:] lower case (== [a-z])
// [:print:] printable (== [ -~] == [ [:graph:]])
// [:punct:] punctuation (== [!-/:-@[-`{-~])
// [:space:] whitespace (== [\t\n\v\f\r ])
// [:upper:] upper case (== [A-Z])
// [:word:] word characters (== [0-9A-Za-z_])
// [:xdigit:] hex digit (== [0-9A-Fa-f])
// Taken from: http://golang.org/pkg/regexp/syntax/
("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]),
("alpha", &[('A', 'Z'), ('a', 'z')]),
("ascii", &[('\x00', '\x7F')]),
("blank", &[(' ', ' '), ('\t', '\t')]),
("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]),
("digit", &[('0', '9')]),
("graph", &[('!', '~')]),
("lower", &[('a', 'z')]),
("print", &[(' ', '~')]),
("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]),
("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
('\r', '\r'), (' ', ' ')]),
("upper", &[('A', 'Z')]),
("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]),
("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]),
];