Optimized IR generation for UTF-8 and UTF-16 encoding
- Both can now be inlined and constant folded away
- Both can no longer cause failure
- Both now return an `Option` instead

Removed debug `assert!()`s over the valid ranges of a `char`
- It affected optimizations due to unwinding
- Char handling is now sound enough that they became unnecessary
This commit is contained in:
@@ -1680,7 +1680,7 @@ mod tests {
|
|||||||
fn test_chars_decoding() {
|
fn test_chars_decoding() {
|
||||||
let mut bytes = [0u8, ..4];
|
let mut bytes = [0u8, ..4];
|
||||||
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
|
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
|
||||||
let len = c.encode_utf8(bytes);
|
let len = c.encode_utf8(bytes).unwrap_or(0);
|
||||||
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
|
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
|
||||||
if Some(c) != s.chars().next() {
|
if Some(c) != s.chars().next() {
|
||||||
fail!("character {:x}={} does not decode correctly", c as u32, c);
|
fail!("character {:x}={} does not decode correctly", c as u32, c);
|
||||||
@@ -1692,7 +1692,7 @@ mod tests {
|
|||||||
fn test_chars_rev_decoding() {
|
fn test_chars_rev_decoding() {
|
||||||
let mut bytes = [0u8, ..4];
|
let mut bytes = [0u8, ..4];
|
||||||
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
|
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
|
||||||
let len = c.encode_utf8(bytes);
|
let len = c.encode_utf8(bytes).unwrap_or(0);
|
||||||
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
|
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
|
||||||
if Some(c) != s.chars().rev().next() {
|
if Some(c) != s.chars().rev().next() {
|
||||||
fail!("character {:x}={} does not decode correctly", c as u32, c);
|
fail!("character {:x}={} does not decode correctly", c as u32, c);
|
||||||
|
|||||||
@@ -503,7 +503,7 @@ impl String {
|
|||||||
data: self.vec.as_ptr().offset(cur_len as int),
|
data: self.vec.as_ptr().offset(cur_len as int),
|
||||||
len: 4,
|
len: 4,
|
||||||
};
|
};
|
||||||
let used = ch.encode_utf8(mem::transmute(slice));
|
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
|
||||||
self.vec.set_len(cur_len + used);
|
self.vec.set_len(cur_len + used);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
use mem::transmute;
|
use mem::transmute;
|
||||||
use option::{None, Option, Some};
|
use option::{None, Option, Some};
|
||||||
use iter::range_step;
|
use iter::range_step;
|
||||||
|
use collections::Collection;
|
||||||
|
|
||||||
// UTF-8 ranges and tags for encoding characters
|
// UTF-8 ranges and tags for encoding characters
|
||||||
static TAG_CONT: u8 = 0b1000_0000u8;
|
static TAG_CONT: u8 = 0b1000_0000u8;
|
||||||
@@ -27,7 +28,6 @@ static TAG_FOUR_B: u8 = 0b1111_0000u8;
|
|||||||
static MAX_ONE_B: u32 = 0x80u32;
|
static MAX_ONE_B: u32 = 0x80u32;
|
||||||
static MAX_TWO_B: u32 = 0x800u32;
|
static MAX_TWO_B: u32 = 0x800u32;
|
||||||
static MAX_THREE_B: u32 = 0x10000u32;
|
static MAX_THREE_B: u32 = 0x10000u32;
|
||||||
static MAX_FOUR_B: u32 = 0x200000u32;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Lu Uppercase_Letter an uppercase letter
|
Lu Uppercase_Letter an uppercase letter
|
||||||
@@ -217,14 +217,14 @@ pub fn escape_default(c: char, f: |char|) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the amount of bytes this `char` would need if encoded in UTF-8
|
/// Returns the amount of bytes this `char` would need if encoded in UTF-8
|
||||||
|
#[inline]
|
||||||
pub fn len_utf8_bytes(c: char) -> uint {
|
pub fn len_utf8_bytes(c: char) -> uint {
|
||||||
let code = c as u32;
|
let code = c as u32;
|
||||||
match () {
|
match () {
|
||||||
_ if code < MAX_ONE_B => 1u,
|
_ if code < MAX_ONE_B => 1u,
|
||||||
_ if code < MAX_TWO_B => 2u,
|
_ if code < MAX_TWO_B => 2u,
|
||||||
_ if code < MAX_THREE_B => 3u,
|
_ if code < MAX_THREE_B => 3u,
|
||||||
_ if code < MAX_FOUR_B => 4u,
|
_ => 4u,
|
||||||
_ => fail!("invalid character!"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -297,21 +297,19 @@ pub trait Char {
|
|||||||
/// UTF-8.
|
/// UTF-8.
|
||||||
fn len_utf8_bytes(&self) -> uint;
|
fn len_utf8_bytes(&self) -> uint;
|
||||||
|
|
||||||
/// Encodes this character as UTF-8 into the provided byte buffer.
|
/// Encodes this character as UTF-8 into the provided byte buffer,
|
||||||
|
/// and then returns the number of bytes written.
|
||||||
///
|
///
|
||||||
/// The buffer must be at least 4 bytes long or a runtime failure may
|
/// If the buffer is not large enough, nothing will be written into it
|
||||||
/// occur.
|
/// and a `None` will be returned.
|
||||||
///
|
fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;
|
||||||
/// This will then return the number of bytes written to the slice.
|
|
||||||
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
|
|
||||||
|
|
||||||
/// Encodes this character as UTF-16 into the provided `u16` buffer.
|
/// Encodes this character as UTF-16 into the provided `u16` buffer,
|
||||||
|
/// and then returns the number of `u16`s written.
|
||||||
///
|
///
|
||||||
/// The buffer must be at least 2 elements long or a runtime failure may
|
/// If the buffer is not large enough, nothing will be written into it
|
||||||
/// occur.
|
/// and a `None` will be returned.
|
||||||
///
|
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
|
||||||
/// This will then return the number of `u16`s written to the slice.
|
|
||||||
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Char for char {
|
impl Char for char {
|
||||||
@@ -325,45 +323,52 @@ impl Char for char {
|
|||||||
|
|
||||||
fn escape_default(&self, f: |char|) { escape_default(*self, f) }
|
fn escape_default(&self, f: |char|) { escape_default(*self, f) }
|
||||||
|
|
||||||
|
#[inline]
|
||||||
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
|
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
|
||||||
|
|
||||||
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
|
#[inline]
|
||||||
|
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> Option<uint> {
|
||||||
|
// Marked #[inline] to allow llvm optimizing it away
|
||||||
let code = *self as u32;
|
let code = *self as u32;
|
||||||
if code < MAX_ONE_B {
|
if code < MAX_ONE_B && dst.len() >= 1 {
|
||||||
dst[0] = code as u8;
|
dst[0] = code as u8;
|
||||||
1
|
Some(1)
|
||||||
} else if code < MAX_TWO_B {
|
} else if code < MAX_TWO_B && dst.len() >= 2 {
|
||||||
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
|
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
|
||||||
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
2
|
Some(2)
|
||||||
} else if code < MAX_THREE_B {
|
} else if code < MAX_THREE_B && dst.len() >= 3 {
|
||||||
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
|
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
|
||||||
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
|
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
3
|
Some(3)
|
||||||
} else {
|
} else if dst.len() >= 4 {
|
||||||
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
|
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
|
||||||
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
|
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
|
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
|
||||||
4
|
Some(4)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
|
#[inline]
|
||||||
|
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint> {
|
||||||
|
// Marked #[inline] to allow llvm optimizing it away
|
||||||
let mut ch = *self as u32;
|
let mut ch = *self as u32;
|
||||||
if (ch & 0xFFFF_u32) == ch {
|
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
|
||||||
// The BMP falls through (assuming non-surrogate, as it should)
|
// The BMP falls through (assuming non-surrogate, as it should)
|
||||||
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
|
|
||||||
dst[0] = ch as u16;
|
dst[0] = ch as u16;
|
||||||
1
|
Some(1)
|
||||||
} else {
|
} else if dst.len() >= 2 {
|
||||||
// Supplementary planes break into surrogates.
|
// Supplementary planes break into surrogates.
|
||||||
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
|
|
||||||
ch -= 0x1_0000_u32;
|
ch -= 0x1_0000_u32;
|
||||||
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
|
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
|
||||||
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
|
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
|
||||||
2
|
Some(2)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -364,7 +364,7 @@ impl<'a> Formatter<'a> {
|
|||||||
let write_prefix = |f: &mut Formatter| {
|
let write_prefix = |f: &mut Formatter| {
|
||||||
for c in sign.move_iter() {
|
for c in sign.move_iter() {
|
||||||
let mut b = [0, ..4];
|
let mut b = [0, ..4];
|
||||||
let n = c.encode_utf8(b);
|
let n = c.encode_utf8(b).unwrap_or(0);
|
||||||
try!(f.buf.write(b.slice_to(n)));
|
try!(f.buf.write(b.slice_to(n)));
|
||||||
}
|
}
|
||||||
if prefixed { f.buf.write(prefix.as_bytes()) }
|
if prefixed { f.buf.write(prefix.as_bytes()) }
|
||||||
@@ -464,7 +464,7 @@ impl<'a> Formatter<'a> {
|
|||||||
try!(f(self));
|
try!(f(self));
|
||||||
}
|
}
|
||||||
let mut fill = [0u8, ..4];
|
let mut fill = [0u8, ..4];
|
||||||
let len = self.fill.encode_utf8(fill);
|
let len = self.fill.encode_utf8(fill).unwrap_or(0);
|
||||||
for _ in range(0, padding) {
|
for _ in range(0, padding) {
|
||||||
try!(self.buf.write(fill.slice_to(len)));
|
try!(self.buf.write(fill.slice_to(len)));
|
||||||
}
|
}
|
||||||
@@ -540,7 +540,7 @@ impl<'a, T: str::Str> String for T {
|
|||||||
impl Char for char {
|
impl Char for char {
|
||||||
fn fmt(&self, f: &mut Formatter) -> Result {
|
fn fmt(&self, f: &mut Formatter) -> Result {
|
||||||
let mut utf8 = [0u8, ..4];
|
let mut utf8 = [0u8, ..4];
|
||||||
let amt = self.encode_utf8(utf8);
|
let amt = self.encode_utf8(utf8).unwrap_or(0);
|
||||||
let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };
|
let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };
|
||||||
secret_string(&s, f)
|
secret_string(&s, f)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ use iter::range;
|
|||||||
use num::{CheckedMul, Saturating};
|
use num::{CheckedMul, Saturating};
|
||||||
use option::{Option, None, Some};
|
use option::{Option, None, Some};
|
||||||
use raw::Repr;
|
use raw::Repr;
|
||||||
use slice::ImmutableSlice;
|
use slice::{ImmutableSlice, MutableSlice};
|
||||||
use slice;
|
use slice;
|
||||||
use uint;
|
use uint;
|
||||||
|
|
||||||
@@ -646,7 +646,7 @@ impl<'a> Iterator<u16> for Utf16CodeUnits<'a> {
|
|||||||
|
|
||||||
let mut buf = [0u16, ..2];
|
let mut buf = [0u16, ..2];
|
||||||
self.chars.next().map(|ch| {
|
self.chars.next().map(|ch| {
|
||||||
let n = ch.encode_utf16(buf /* as mut slice! */);
|
let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
|
||||||
if n == 2 { self.extra = buf[1]; }
|
if n == 2 { self.extra = buf[1]; }
|
||||||
buf[0]
|
buf[0]
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -173,7 +173,7 @@ fn test_escape_unicode() {
|
|||||||
fn test_encode_utf8() {
|
fn test_encode_utf8() {
|
||||||
fn check(input: char, expect: &[u8]) {
|
fn check(input: char, expect: &[u8]) {
|
||||||
let mut buf = [0u8, ..4];
|
let mut buf = [0u8, ..4];
|
||||||
let n = input.encode_utf8(buf /* as mut slice! */);
|
let n = input.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
|
||||||
assert_eq!(buf.slice_to(n), expect);
|
assert_eq!(buf.slice_to(n), expect);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,7 +187,7 @@ fn test_encode_utf8() {
|
|||||||
fn test_encode_utf16() {
|
fn test_encode_utf16() {
|
||||||
fn check(input: char, expect: &[u16]) {
|
fn check(input: char, expect: &[u16]) {
|
||||||
let mut buf = [0u16, ..2];
|
let mut buf = [0u16, ..2];
|
||||||
let n = input.encode_utf16(buf /* as mut slice! */);
|
let n = input.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
|
||||||
assert_eq!(buf.slice_to(n), expect);
|
assert_eq!(buf.slice_to(n), expect);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1110,7 +1110,7 @@ pub trait Writer {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn write_char(&mut self, c: char) -> IoResult<()> {
|
fn write_char(&mut self, c: char) -> IoResult<()> {
|
||||||
let mut buf = [0u8, ..4];
|
let mut buf = [0u8, ..4];
|
||||||
let n = c.encode_utf8(buf.as_mut_slice());
|
let n = c.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
|
||||||
self.write(buf.slice_to(n))
|
self.write(buf.slice_to(n))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user